[lua-torch-torch7] 01/01: Imported Upstream version 0~20160604-g69d7a01

Zhou Mo cdluminate-guest at moszumanska.debian.org
Wed Jun 8 09:28:43 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-torch7.

commit d2ac251234a167f063f3f1a64ffac1faa6f859ef
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Mon Jun 6 03:47:54 2016 +0000

    Imported Upstream version 0~20160604-g69d7a01
---
 .gitignore                            |    1 +
 .travis.yml                           |   67 +
 CMakeLists.txt                        |   89 +
 CONTRIBUTING.md                       |  130 ++
 COPYRIGHT.txt                         |   36 +
 CmdLine.lua                           |  269 +++
 DiskFile.c                            |  103 +
 FFI.lua                               |  205 ++
 File.c                                |  207 ++
 File.lua                              |  454 +++++
 Generator.c                           |   50 +
 MemoryFile.c                          |   70 +
 PipeFile.c                            |   43 +
 README.md                             |   45 +
 ROADMAP.md                            |  144 ++
 Storage.c                             |    9 +
 Tensor.c                              |    9 +
 Tensor.lua                            |  561 ++++++
 TensorMath.lua                        | 1443 ++++++++++++++
 TensorOperator.c                      |    8 +
 TestSuite.lua                         |   30 +
 Tester.lua                            |  878 +++++++++
 Timer.c                               |  170 ++
 cmake/TorchConfig.cmake.in            |   35 +
 cmake/TorchExports.cmake              |   14 +
 cmake/TorchPackage.cmake              |   53 +
 cmake/TorchPaths.cmake                |   32 +
 cmake/TorchPathsInit.cmake            |   42 +
 cmake/TorchWrap.cmake                 |   18 +
 cmake/TorchWrap.cmake.in              |   19 +
 doc/cmdline.md                        |  148 ++
 doc/diskfile.md                       |   74 +
 doc/file.md                           |  364 ++++
 doc/gather.png                        |  Bin 0 -> 56988 bytes
 doc/index.md                          |   32 +
 doc/maths.md                          | 2851 +++++++++++++++++++++++++++
 doc/memoryfile.md                     |   42 +
 doc/pipefile.md                       |   22 +
 doc/random.md                         |  173 ++
 doc/serialization.md                  |  112 ++
 doc/storage.md                        |  300 +++
 doc/tensor.md                         | 2415 +++++++++++++++++++++++
 doc/tester.md                         |  363 ++++
 doc/timer.md                          |   47 +
 doc/utility.md                        |  327 ++++
 general.h                             |   28 +
 generic/Storage.c                     |  286 +++
 generic/Tensor.c                      | 1323 +++++++++++++
 generic/TensorOperator.c              |  191 ++
 generic/luaG.h                        |   37 +
 init.c                                |   88 +
 init.lua                              |  189 ++
 lib/CMakeLists.txt                    |    7 +
 lib/TH/CMakeLists.txt                 |  370 ++++
 lib/TH/TH.h                           |   24 +
 lib/TH/THAllocator.c                  |  311 +++
 lib/TH/THAllocator.h                  |   31 +
 lib/TH/THAtomic.c                     |  177 ++
 lib/TH/THAtomic.h                     |   89 +
 lib/TH/THBlas.c                       |    4 +
 lib/TH/THBlas.h                       |   11 +
 lib/TH/THConfig.cmake.in              |    9 +
 lib/TH/THDiskFile.c                   |  771 ++++++++
 lib/TH/THDiskFile.h                   |   19 +
 lib/TH/THFile.c                       |  154 ++
 lib/TH/THFile.h                       |   84 +
 lib/TH/THFilePrivate.h                |   43 +
 lib/TH/THGeneral.c                    |  274 +++
 lib/TH/THGeneral.h.in                 |  116 ++
 lib/TH/THGenerateAllTypes.h           |   97 +
 lib/TH/THGenerateFloatTypes.h         |   31 +
 lib/TH/THGenerateIntTypes.h           |   70 +
 lib/TH/THLapack.c                     |    4 +
 lib/TH/THLapack.h                     |   27 +
 lib/TH/THLogAdd.c                     |   88 +
 lib/TH/THLogAdd.h                     |   14 +
 lib/TH/THMath.h                       |   21 +
 lib/TH/THMemoryFile.c                 |  678 +++++++
 lib/TH/THMemoryFile.h                 |   13 +
 lib/TH/THRandom.c                     |  274 +++
 lib/TH/THRandom.h                     |   81 +
 lib/TH/THStorage.c                    |    8 +
 lib/TH/THStorage.h                    |   20 +
 lib/TH/THTensor.c                     |   26 +
 lib/TH/THTensor.h                     |   41 +
 lib/TH/THTensorApply.h                |  428 ++++
 lib/TH/THTensorDimApply.h             |  232 +++
 lib/TH/THTensorMacros.h               |   30 +
 lib/TH/THVector.h                     |  574 ++++++
 lib/TH/cmake/FindARM.cmake            |   67 +
 lib/TH/cmake/FindBLAS.cmake           |  299 +++
 lib/TH/cmake/FindLAPACK.cmake         |  190 ++
 lib/TH/cmake/FindMKL.cmake            |  265 +++
 lib/TH/cmake/FindSSE.cmake            |  111 ++
 lib/TH/generic/THBlas.c               |  391 ++++
 lib/TH/generic/THBlas.h               |   19 +
 lib/TH/generic/THLapack.c             |  254 +++
 lib/TH/generic/THLapack.h             |   39 +
 lib/TH/generic/THStorage.c            |  206 ++
 lib/TH/generic/THStorage.h            |   70 +
 lib/TH/generic/THStorageCopy.c        |   36 +
 lib/TH/generic/THStorageCopy.h        |   17 +
 lib/TH/generic/THTensor.c             |  819 ++++++++
 lib/TH/generic/THTensor.h             |  130 ++
 lib/TH/generic/THTensorConv.c         | 1959 +++++++++++++++++++
 lib/TH/generic/THTensorConv.h         |   80 +
 lib/TH/generic/THTensorCopy.c         |   24 +
 lib/TH/generic/THTensorCopy.h         |   16 +
 lib/TH/generic/THTensorLapack.c       |  884 +++++++++
 lib/TH/generic/THTensorLapack.h       |   22 +
 lib/TH/generic/THTensorMath.c         | 2509 ++++++++++++++++++++++++
 lib/TH/generic/THTensorMath.h         |  186 ++
 lib/TH/generic/THTensorRandom.c       |  250 +++
 lib/TH/generic/THTensorRandom.h       |   25 +
 lib/TH/generic/THVector.c             |   84 +
 lib/TH/generic/simd/common_simd.h     |  395 ++++
 lib/TH/generic/simd/convolve.c        |  127 ++
 lib/TH/generic/simd/convolve.h        |    1 +
 lib/TH/generic/simd/convolve5x5_avx.c |  212 ++
 lib/TH/generic/simd/convolve5x5_sse.c |  320 +++
 lib/luaT/CMakeLists.txt               |   41 +
 lib/luaT/README.md                    |  266 +++
 lib/luaT/luaT.c                       | 1338 +++++++++++++
 lib/luaT/luaT.h                       |  132 ++
 lib/luaT/luaTConfig.cmake.in          |    9 +
 mkdocs.yml                            |   21 +
 random.lua                            |   53 +
 rocks/torch-scm-1.rockspec            |   36 +
 test/longSize.lua                     |   42 +
 test/test.lua                         | 3425 +++++++++++++++++++++++++++++++++
 test/test_Multinomial.lua             |   25 +
 test/test_Tester.lua                  |  626 ++++++
 test/test_qr.lua                      |  274 +++
 test/test_sharedmem.lua               |   76 +
 test/test_writeObject.lua             |  229 +++
 test/timeSort.lua                     |  148 ++
 torchcwrap.lua                        |  462 +++++
 utils.c                               |  260 +++
 utils.h                               |   36 +
 139 files changed, 37403 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..c28b4d1
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,67 @@
+language: c
+compiler:
+  - gcc
+  - clang
+cache:
+  directories:
+  - $HOME/OpenBlasInstall
+sudo: false
+env:
+  - TORCH_LUA_VERSION=LUAJIT21
+  - TORCH_LUA_VERSION=LUA51
+  - TORCH_LUA_VERSION=LUA52
+os:
+  - linux
+matrix:
+  include:
+  - os: osx
+    env: TORCH_LUA_VERSION=LUAJIT21
+    compiler: clang
+addons:
+  apt:
+    packages:
+    - cmake
+    - gfortran
+    - gcc-multilib
+    - gfortran-multilib
+    - liblapack-dev
+    - build-essential
+    - gcc
+    - g++
+    - curl
+    - cmake
+    - libreadline-dev
+    - git-core
+    - libqt4-core
+    - libqt4-gui
+    - libqt4-dev
+    - libjpeg-dev
+    - libpng-dev
+    - ncurses-dev
+    - imagemagick
+    - libzmq3-dev
+    - gfortran
+    - unzip
+    - gnuplot
+    - gnuplot-x11
+before_script:
+- export ROOT_TRAVIS_DIR=$(pwd)
+- export INSTALL_PREFIX=~/torch/install
+-  ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
+- git clone https://github.com/torch/distro.git ~/torch --recursive
+- cd ~/torch && git submodule update --init --recursive
+- mkdir build && cd build
+- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
+- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
+- make && make install
+- cd $ROOT_TRAVIS_DIR
+- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
+script:
+- ${INSTALL_PREFIX}/bin/luarocks make rocks/torch-scm-1.rockspec
+- ${INSTALL_PREFIX}/bin/luarocks install luaffi
+- export PATH=${INSTALL_PREFIX}/bin:$PATH
+- export TESTLUA=$(which luajit lua | head -n 1)
+- ${TESTLUA} -ltorch -e "t=torch.test(); if t.errors[1] then os.exit(1) end"
+- cd test
+- ${TESTLUA} test_writeObject.lua
+- ${TESTLUA} test_Tester.lua
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..611258b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,89 @@
+IF(APPLE)
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR)
+  CMAKE_POLICY(VERSION 2.8.12)
+ELSE()
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
+  CMAKE_POLICY(VERSION 2.8)
+ENDIF()
+
+SET(CMAKE_MODULE_PATH
+  "${CMAKE_CURRENT_SOURCE_DIR}/cmake"
+  "${CMAKE_MODULE_PATH}")
+
+IF (NOT MSVC)
+  IF (MINGW)
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=format")
+  ELSE()
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=implicit-function-declaration -Werror=format")
+  ENDIF(MINGW)
+ENDIF(NOT MSVC)
+
+# Flags
+# When using MSVC
+IF(MSVC)
+  # we want to respect the standard, and we are bored of those **** .
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ENDIF(MSVC)
+
+# OpenMP support?
+SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+  IF (DARWIN_VERSION GREATER 9)
+    SET(APPLE_OPENMP_SUCKS 1)
+  ENDIF (DARWIN_VERSION GREATER 9)
+  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+    OUTPUT_VARIABLE GCC_VERSION)
+  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
+    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+  ENDIF ()
+ENDIF ()
+
+IF (WITH_OPENMP)
+  FIND_PACKAGE(OpenMP)
+  IF(OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+# Includes
+INCLUDE(TorchPaths)
+INCLUDE(TorchPathsInit)
+INCLUDE(TorchPackage)
+INCLUDE(TorchWrap)
+INCLUDE(TorchExports)
+
+# Torch libraries
+ADD_SUBDIRECTORY(lib)
+
+INCLUDE_DIRECTORIES(BEFORE "${LUA_INCDIR}")
+INCLUDE_DIRECTORIES(BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/lib/TH")
+INCLUDE_DIRECTORIES(BEFORE "${CMAKE_CURRENT_BINARY_DIR}/lib/TH")
+INCLUDE_DIRECTORIES(BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/lib/luaT")
+LINK_DIRECTORIES("${LUA_LIBDIR}")
+
+SET(src DiskFile.c File.c MemoryFile.c PipeFile.c Storage.c Tensor.c Timer.c utils.c init.c TensorOperator.c TensorMath.c random.c Generator.c)
+SET(luasrc init.lua File.lua Tensor.lua CmdLine.lua FFI.lua Tester.lua TestSuite.lua test/test.lua)
+
+# Necessary to generate wrapper
+ADD_TORCH_WRAP(tensormathwrap TensorMath.lua)
+ADD_TORCH_WRAP(randomwrap random.lua)
+
+ADD_TORCH_PACKAGE(torch "${src}" "${luasrc}")
+
+TARGET_LINK_LIBRARIES(torch luaT TH)
+
+IF(LUALIB)
+  TARGET_LINK_LIBRARIES(torch ${LUALIB})
+ENDIF()
+
+INSTALL(FILES "README.md" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/torch")
+INSTALL(DIRECTORY "doc" DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/torch")
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..f4f5597
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,130 @@
+# Contributing to Torch7 Core (torch7, nn, cutorch, cunn)
+
+Thanks a lot! There are plenty of ways you can help!
+
+Please take a moment to review this document in order to make the contribution
+process easy and effective for everyone involved.
+
+Following these guidelines helps to communicate that you respect the time of
+the developers managing and developing this open source project. In return,
+they should reciprocate that respect in addressing your issue or assessing
+patches and features.
+
+
+## Using the issue tracker
+
+The [issue tracker](https://github.com/torch/torch7/issues) is
+the preferred channel for [bug reports](#bugs), [features requests](#features)
+and [submitting pull requests](#pull-requests), but please respect the following
+restrictions:
+
+* Please **do not** use the issue tracker for personal support requests (use
+  [mailing-list](https://groups.google.com/forum/#!forum/torch7)).
+
+* Please **do not** open issues regarding the code in a torch package 
+  outside the core. For example don't open issues about the 
+  REPL in the torch7 issue tracker, use the trepl issue tracker for that.
+
+<a name="bugs"></a>
+## Bug reports
+
+A bug is a _demonstrable problem_ that is caused by the code in the repository.
+Good bug reports are extremely helpful - thank you!
+
+Guidelines for bug reports:
+
+1. **Use the GitHub issue search** — check if the issue has already been
+   reported.
+
+2. **Check if the issue has been fixed** — try to reproduce it using the
+   latest `master` or development branch in the repository.
+
+3. **Isolate the problem** — ideally create a test case that is within reason,
+   preferably within 100 lines of code.
+
+A good bug report shouldn't leave others needing to chase you up for more
+information. Please try to be as detailed as possible in your report. What is
+your environment? What steps will reproduce the issue? What OS do you
+experience the problem? What would you expect to be the outcome? All these
+details will help people to fix any potential bugs.
+
+<a name="features"></a>
+## Feature requests
+
+Feature requests are welcome to be filed. Torch is community-developed, 
+the maintainers are not exclusive torch developers, so keep that in mind.
+The purpose of feature requests is to make others who are looking to implement
+a feature aware of the interest in the feature.
+
+
+<a name="pull-requests"></a>
+## Pull requests
+
+Good pull requests - patches, improvements, new features - are a fantastic
+help. They should remain focused in scope **and avoid containing unrelated
+commits.**
+
+**Please ask first** before embarking on any significant pull request (e.g.
+implementing features, refactoring code, porting to a different language),
+otherwise you risk spending a lot of time working on something that the
+project's developers might not want to merge into the project.
+
+Please adhere to the coding conventions used throughout a project (indentation,
+accurate comments, etc.) and any other requirements (such as test coverage).
+
+Adhering to the following process is the best way to get your work
+included in the project:
+
+1. [Fork](https://help.github.com/articles/fork-a-repo) the project, clone your
+   fork, and configure the remotes:
+
+   ```bash
+   # Clone your fork of the repo into the current directory
+   git clone https://github.com/<your-username>/torch7.git
+   # Navigate to the newly cloned directory
+   cd torch7
+   # Assign the original repo to a remote called "upstream"
+   git remote add upstream https://github.com/torch/torch7.git
+   ```
+
+2. If you cloned a while ago, get the latest changes from upstream:
+
+   ```bash
+   git checkout master
+   git pull upstream master
+   ```
+
+3. Create a new topic branch (off the main project development branch) to
+   contain your feature, change, or fix:
+
+   ```bash
+   git checkout -b <topic-branch-name>
+   ```
+
+4. Commit your changes in logical chunks. Please try to adhere to these [git commit
+   message guidelines](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html)
+   . Use Git's [interactive rebase](https://help.github.com/articles/about-git-rebase)
+   feature to tidy up your commits before making them public. This helps us keep the 
+   commit history in logical blocks and clean, as torch grows. 
+   For example: 
+     - If you are adding a new function or a module, keep the module + tests + doc 
+       to a single commit unless logically warranted. 
+     - If you are fixing a bug, keep the bugfix to a single commit unless logically warranted.
+
+5. Locally merge (or rebase) the upstream development branch into your topic branch:
+
+   ```bash
+   git pull [--rebase] upstream master
+   ```
+
+6. Push your topic branch up to your fork:
+
+   ```bash
+   git push origin <topic-branch-name>
+   ```
+
+7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/)
+    with a clear title and description.
+
+**IMPORTANT**: By submitting a patch, you agree to allow the project owners to
+license your work under the terms of the BSD License.
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
new file mode 100644
index 0000000..c9cc784
--- /dev/null
+++ b/COPYRIGHT.txt
@@ -0,0 +1,36 @@
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Deepmind Technologies, NYU, NEC Laboratories America 
+   and IDIAP Research Institute nor the names of its contributors may be 
+   used to endorse or promote products derived from this software without 
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/CmdLine.lua b/CmdLine.lua
new file mode 100644
index 0000000..23e9969
--- /dev/null
+++ b/CmdLine.lua
@@ -0,0 +1,269 @@
+local CmdLine = torch.class('torch.CmdLine')
+
+local function strip(str)
+   return string.match(str, '%-*(.*)')
+end
+
+local function pad(str, sz)
+   return str .. string.rep(' ', sz-#str)
+end
+
+function CmdLine:error(msg)
+   print('')
+   io.stderr:write(msg)
+   print('')
+   self:help()
+   os.exit(1)
+end
+
+function CmdLine:__readArgument__(params, arg, i, nArgument)
+   local argument = self.arguments[nArgument]
+   local value = arg[i]
+
+   if nArgument > #self.arguments then
+      self:error('invalid argument: ' .. value)
+   end
+   if argument.type and type(value) ~= argument.type then
+      self:error('invalid argument type for argument ' .. argument.key .. ' (should be ' .. argument.type .. ')')
+   end
+   params[strip(argument.key)] = value
+   return 1
+end
+
+function CmdLine:__readOption__(params, arg, i)
+   local key = arg[i]
+   local option = self.options[key]
+   if not option then
+      self:error('unknown option ' .. key)
+   end
+
+   if option.type and option.type == 'boolean' then
+      params[strip(key)] = not option.default
+      return 1
+   else
+      local value = arg[i+1]
+      if not value then
+         self:error('missing argument for option ' .. key)
+      end
+      if not option.type or option.type == 'string' then
+      elseif option.type == 'number' then
+         value = tonumber(value)
+      else
+         self:error('unknown required option type ' .. option.type)
+      end
+      if not value then
+         self:error('invalid type for option ' .. key .. ' (should be ' .. option.type .. ')')
+      end
+      params[strip(key)] = value
+      return 2
+   end
+end
+
+function CmdLine:__init(argseparator_,keyseparator_)
+   self.argseparator = argseparator_ or ','
+   self.keyseparator = keyseparator_ or '='
+   self.options = {}
+   self.arguments = {}
+   self.helplines = {}
+   self.dateformat = nil
+   self.silentio = false
+end
+
+function CmdLine:silent()
+   self.silentio = true
+end
+
+function CmdLine:addTime(name, format)
+   format = format or '%Y-%m-%d %H:%M:%S'
+   if type(format) ~= 'string' then
+      error('Argument has to be string')
+   end
+   if name ~= nil then
+      name = '[' .. name .. ']: '
+   else
+      name = ''
+   end
+   self.dateformat = format .. name
+end
+
+
+function CmdLine:argument(key, help, _type_)
+   table.insert(self.arguments, {key=key, help=help, type=_type_})
+   table.insert(self.helplines, self.arguments[#self.arguments])
+end
+
+function CmdLine:option(key, default, help, _type_)
+   if default == nil then
+      error('option ' .. key .. ' has no default value')
+   end
+   _type_ = _type_ or type(default)
+   if type(default) ~= _type_ then
+      error('option ' .. key .. ' has wrong default type value')
+   end
+   self.options[key] = {key=key, default=default, help=help, type=_type_}
+   table.insert(self.helplines, self.options[key])
+end
+
+function CmdLine:default()
+   local params = {}
+   for option,v in pairs(self.options) do
+      params[strip(option)] = v.default
+   end
+   return params
+end
+
+function CmdLine:parse(arg)
+   local i = 1
+   local params = self:default()
+
+   local nArgument = 0
+
+   while i <= #arg do
+      if arg[i] == '-help' or arg[i] == '-h' or arg[i] == '--help' then
+         self:help(arg)
+         os.exit(0)
+      end
+
+      if self.options[arg[i]] then
+         i = i + self:__readOption__(params, arg, i)
+      else
+         nArgument = nArgument + 1
+         i = i + self:__readArgument__(params, arg, i, nArgument)
+      end
+   end
+
+   if nArgument ~= #self.arguments then
+      self:error('not enough arguments')
+   end
+
+   return params
+end
+
+function CmdLine:string(prefix, params, ignore)
+   local arguments = {}
+   local options = {}
+   prefix = prefix or ''
+
+   for k,v in pairs(params) do
+      if ignore[k] then
+         print('-- ignore option ' .. k)
+      elseif self.options['-' .. k] then
+         if v ~= self.options['-' .. k].default or ignore[k] == false then
+            if type(v) == 'boolean' then
+               if v then
+                  v = 't'
+               else
+                  v = 'f'
+               end
+            end
+            table.insert(options, k .. self.keyseparator .. v)
+            print(k,v,self.options['-' .. k].default)
+        end
+       else
+         local narg
+         for i=1,#self.arguments do
+            if strip(self.arguments[i].key) == k then
+               narg = i
+            end
+         end
+         if narg then
+            arguments[narg] = k .. self.keyseparator .. v
+         else
+            print('WARNING: unknown option/argument: ' .. k .. ' IGNORING for DIRECTORY NAME')
+         end
+      end
+   end
+   table.sort(options)
+   local str = table.concat(arguments, self.argseparator)
+   if str == '' then
+      str = table.concat(options, self.argseparator)
+   else
+      str = str .. self.argseparator .. table.concat(options, self.argseparator)
+   end
+   if str == '' then
+      return prefix
+   else
+      return prefix .. self.argseparator .. str
+   end
+end
+
+local oprint = nil
+function CmdLine:log(file, params)
+   local f = (io.type(file) == 'file' and file) or io.open(file, 'w')
+   oprint = oprint or print -- get the current print function lazily
+   function print(...)
+      local n = select("#", ...)
+      local arg = {...}
+      if not self.silentio then
+	 oprint(...)
+      end
+      local str = {}
+      if self.dateformat then
+	 table.insert(str, os.date(self.dateformat))
+      end
+      for i=1,n do
+	 table.insert(str,tostring(arg[i]))
+      end
+      table.insert(str,'\n')
+      f:write(table.concat(str,' '))
+      f:flush()
+   end
+   print('[program started on ' .. os.date() .. ']')
+   print('[command line arguments]')
+   if params then
+      for k,v in pairs(params) do
+         print(k,v)
+      end
+   end
+   print('[----------------------]')
+end
+
+function CmdLine:text(txt)
+   txt = txt or ''
+   assert(type(txt) == 'string')
+   table.insert(self.helplines, txt)
+end
+
+function CmdLine:help(arg)
+   io.write('Usage: ')
+   if arg then io.write(arg[0] .. ' ') end
+   io.write('[options] ')
+   for i=1,#self.arguments do
+      io.write('<' .. strip(self.arguments[i].key) .. '>')
+   end
+   io.write('\n')
+
+   -- first pass to compute max length
+   local optsz = 0
+   for _,option in ipairs(self.helplines) do
+      if type(option) == 'table' then
+         if option.default ~= nil then -- it is an option
+            if #option.key > optsz then
+               optsz = #option.key
+            end
+         else -- it is an argument
+            if #strip(option.key)+2 > optsz then
+               optsz = #strip(option.key)+2
+            end
+         end
+      end
+   end
+
+   -- second pass to print
+   for _,option in ipairs(self.helplines) do
+      if type(option) == 'table' then
+         io.write('  ')
+         if option.default ~= nil then -- it is an option
+            io.write(pad(option.key, optsz))
+            if option.help then io.write(' ' .. option.help) end
+            io.write(' [' .. tostring(option.default) .. ']')
+         else -- it is an argument
+            io.write(pad('<' .. strip(option.key) .. '>', optsz))
+            if option.help then io.write(' ' .. option.help) end
+         end
+      else
+         io.write(option) -- just some additional help
+      end
+      io.write('\n')
+   end
+end
diff --git a/DiskFile.c b/DiskFile.c
new file mode 100644
index 0000000..c50b74f
--- /dev/null
+++ b/DiskFile.c
@@ -0,0 +1,103 @@
+#include "general.h"
+
+static int torch_DiskFile_new(lua_State *L)
+{
+  const char *name = luaL_checkstring(L, 1);
+  const char *mode = luaL_optstring(L, 2, "r");
+  int isQuiet = luaT_optboolean(L, 3, 0);
+  THFile *self = THDiskFile_new(name, mode, isQuiet);
+
+  luaT_pushudata(L, self, "torch.DiskFile");
+  return 1;
+}
+
+static int torch_DiskFile_free(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THFile_free(self);
+  return 0;
+}
+
+static int torch_DiskFile_isLittleEndianCPU(lua_State *L)
+{
+  lua_pushboolean(L, THDiskFile_isLittleEndianCPU());
+  return 1;
+}
+
+static int torch_DiskFile_isBigEndianCPU(lua_State *L)
+{
+  lua_pushboolean(L, !THDiskFile_isLittleEndianCPU());
+  return 1;
+}
+
+static int torch_DiskFile_nativeEndianEncoding(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THDiskFile_nativeEndianEncoding(self);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_DiskFile_littleEndianEncoding(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THDiskFile_littleEndianEncoding(self);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_DiskFile_bigEndianEncoding(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THDiskFile_bigEndianEncoding(self);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_DiskFile_longSize(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THDiskFile_longSize(self, lua_tointeger(L, 2));
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_DiskFile_noBuffer(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  THDiskFile_noBuffer(self);
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_DiskFile___tostring__(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.DiskFile");
+  lua_pushfstring(L, "torch.DiskFile on <%s> [status: %s -- mode %c%c]",
+                  THDiskFile_name(self),
+                  (THFile_isOpened(self) ? "open" : "closed"),
+                  (THFile_isReadable(self) ? 'r' : ' '),
+                  (THFile_isWritable(self) ? 'w' : ' '));
+
+  return 1;
+}
+static const struct luaL_Reg torch_DiskFile__ [] = {
+  {"isLittleEndianCPU", torch_DiskFile_isLittleEndianCPU},
+  {"isBigEndianCPU", torch_DiskFile_isBigEndianCPU},
+  {"nativeEndianEncoding", torch_DiskFile_nativeEndianEncoding},
+  {"littleEndianEncoding", torch_DiskFile_littleEndianEncoding},
+  {"bigEndianEncoding", torch_DiskFile_bigEndianEncoding},
+  {"longSize", torch_DiskFile_longSize},
+  {"noBuffer", torch_DiskFile_noBuffer},
+  {"__tostring__", torch_DiskFile___tostring__},
+  {NULL, NULL}
+};
+
+void torch_DiskFile_init(lua_State *L)
+{
+  luaT_newmetatable(L, "torch.DiskFile", "torch.File",
+                    torch_DiskFile_new, torch_DiskFile_free, NULL);
+
+  luaT_setfuncs(L, torch_DiskFile__, 0);
+  lua_pop(L, 1);
+}
diff --git a/FFI.lua b/FFI.lua
new file mode 100644
index 0000000..904302a
--- /dev/null
+++ b/FFI.lua
@@ -0,0 +1,205 @@
+-- FFI acceleration layer: when LuaJIT's ffi library is available, mirror
+-- the TH C struct layouts so Storage/Tensor objects expose raw pointers
+-- via :data()/:cdata(), and install fast contiguous-case implementations
+-- of Tensor.apply/map/map2. If ffi is unavailable, this file is a no-op.
+local ok, ffi = pcall(require, 'ffi')
+
+-- Raise a luaL-style "bad argument #N to 'fn' (msg)" error when the
+-- condition is false; `level` controls where the error is reported.
+local function checkArgument(condition, fn, ud, msg, level)
+   local level = level or 3
+   if not condition then
+      error("bad argument #" .. ud .. " to '" .. fn .. "' (" .. msg .. ")", level)
+   end
+end
+
+-- Argument check specialized for tensor-type equality.
+local function checkArgumentType(expected, actual, fn, ud, level)
+   local level = level or 3
+   if expected ~= actual then
+      checkArgument(false, fn, ud, expected .. " expected, got " .. actual, level + 1)
+   end
+end
+
+if ok then
+   -- Map torch type-name suffixes to the underlying C element types.
+   local Real2real = {
+      Byte='unsigned char',
+      Char='char',
+      Short='short',
+      Int='int',
+      Long='long',
+      Float='float',
+      Double='double'
+   }
+
+   -- Allocator
+   -- NOTE(review): these cdefs must stay byte-compatible with the structs
+   -- declared in the TH C library — verify against TH headers on upgrade.
+   ffi.cdef[[
+typedef struct THAllocator {
+  void* (*malloc)(void*, long);
+  void* (*realloc)(void*, void*, long);
+  void (*free)(void*, void*);
+} THAllocator;
+]]
+
+   -- Storage
+   for Real, real in pairs(Real2real) do
+
+      local cdefs = [[
+typedef struct THRealStorage
+{
+    real *data;
+    long size;
+    int refcount;
+    char flag;
+    THAllocator *allocator;
+    void *allocatorContext;
+} THRealStorage;
+]]
+      cdefs = cdefs:gsub('Real', Real):gsub('real', real)
+      ffi.cdef(cdefs)
+
+      local Storage = torch.getmetatable(string.format('torch.%sStorage', Real))
+      local Storage_tt = ffi.typeof('TH' .. Real .. 'Storage**')
+
+      -- storage:cdata() -> the raw THxStorage* as an FFI cdata object.
+      rawset(Storage,
+             "cdata",
+             function(self)
+                return Storage_tt(self)[0]
+             end)
+
+      -- storage:data() -> raw element pointer into the storage.
+      rawset(Storage,
+             "data",
+             function(self)
+                return Storage_tt(self)[0].data
+             end)
+   end
+
+   -- Tensor
+   for Real, real in pairs(Real2real) do
+
+      local cdefs = [[
+typedef struct THRealTensor
+{
+    long *size;
+    long *stride;
+    int nDimension;
+    
+    THRealStorage *storage;
+    long storageOffset;
+    int refcount;
+
+    char flag;
+
+} THRealTensor;
+]]
+      cdefs = cdefs:gsub('Real', Real):gsub('real', real)
+      ffi.cdef(cdefs)
+
+      local Tensor = torch.getmetatable(string.format('torch.%sTensor', Real))
+      local Tensor_tt = ffi.typeof('TH' .. Real .. 'Tensor**')
+
+      -- tensor:cdata() -> the raw THxTensor* as an FFI cdata object.
+      rawset(Tensor,
+             "cdata",
+             function(self)
+                if not self then return nil; end
+                return Tensor_tt(self)[0]
+             end)
+
+      -- tensor:data() -> pointer to the first element (storage base plus
+      -- storageOffset), or nil when the tensor has no storage.
+      rawset(Tensor,
+             "data",
+             function(self)
+                if not self then return nil; end
+                self = Tensor_tt(self)[0]
+                return self.storage ~= nil and self.storage.data + self.storageOffset or nil
+             end)
+
+      -- faster apply (contiguous case)
+      -- Iterates the raw buffer directly; falls back to the original
+      -- (C-level) apply for non-contiguous tensors.
+      local apply = Tensor.apply
+      rawset(Tensor,
+             "apply",
+             function(self, func)
+                if self:isContiguous() and self.data then
+                   local self_d = self:data()
+                   for i=0,self:nElement()-1 do
+                      local res = func(tonumber(self_d[i])) -- tonumber() required for long...
+                      if res then
+                         self_d[i] = res
+                      end
+                   end
+                   return self
+                else
+                   return apply(self, func)
+                end
+             end)
+
+      -- faster map (contiguous case)
+      local map = Tensor.map
+      rawset(Tensor,
+             "map",
+             function(self, src, func)
+                checkArgument(torch.isTensor(src), "map", 1, "tensor expected")
+                checkArgumentType(self:type(), src:type(), "map", 1)
+
+                if self:isContiguous() and src:isContiguous() and self.data and src.data then
+                   local self_d = self:data()
+                   local src_d = src:data()
+                   assert(src:nElement() == self:nElement(), 'size mismatch')
+                   for i=0,self:nElement()-1 do
+                      local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long...
+                      if res then
+                         self_d[i] = res
+                      end
+                   end
+                   return self
+                else
+                   return map(self, src, func)
+                end
+             end)
+
+      -- faster map2 (contiguous case)
+      local map2 = Tensor.map2
+      rawset(Tensor,
+             "map2",
+             function(self, src1, src2, func)
+                checkArgument(torch.isTensor(src1), "map", 1, "tensor expected")
+                checkArgument(torch.isTensor(src2), "map", 2, "tensor expected")
+                checkArgumentType(self:type(), src1:type(), "map", 1)
+                checkArgumentType(self:type(), src2:type(), "map", 2)
+
+                if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then
+                   local self_d = self:data()
+                   local src1_d = src1:data()
+                   local src2_d = src2:data()
+                   assert(src1:nElement() == self:nElement(), 'size mismatch')
+                   assert(src2:nElement() == self:nElement(), 'size mismatch')
+                   for i=0,self:nElement()-1 do
+                      local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long...
+                      if res then
+                         self_d[i] = res
+                      end
+                   end
+                   return self
+                else
+                   return map2(self, src1, src2, func)
+                end
+             end)
+   end
+
+   -- torch.data
+   -- will fail if :data() is not defined
+   -- With asnumber=true the pointer is returned as an intptr_t cdata.
+   function torch.data(self, asnumber)
+      if not self then return nil; end
+      local data = self:data()
+      if asnumber then
+         return ffi.cast('intptr_t', data)
+      else
+         return data
+      end
+   end
+
+   -- torch.cdata
+   -- will fail if :cdata() is not defined
+   function torch.cdata(self, asnumber)
+      if not self then return nil; end
+      local cdata = self:cdata()
+      if asnumber then
+         return ffi.cast('intptr_t', cdata)
+      else
+         return cdata
+      end
+   end
+
+end
diff --git a/File.c b/File.c
new file mode 100644
index 0000000..586efed
--- /dev/null
+++ b/File.c
@@ -0,0 +1,207 @@
+#include "general.h"
+#include "THFile.h"
+#include "luaT.h"
+
+/* Generate a Lua binding that pushes the boolean result of the THFile
+   predicate THFile_<NAME> for the torch.File at stack index 1. */
+#define IMPLEMENT_TORCH_FILE_FLAG(NAME)                   \
+  static int torch_File_##NAME(lua_State *L)              \
+  {                                                       \
+    THFile *self = luaT_checkudata(L, 1, "torch.File");  \
+    lua_pushboolean(L, THFile_##NAME(self));              \
+    return 1;                                             \
+  }
+
+IMPLEMENT_TORCH_FILE_FLAG(isQuiet)
+IMPLEMENT_TORCH_FILE_FLAG(isReadable)
+IMPLEMENT_TORCH_FILE_FLAG(isWritable)
+IMPLEMENT_TORCH_FILE_FLAG(isBinary)
+IMPLEMENT_TORCH_FILE_FLAG(isAutoSpacing)
+IMPLEMENT_TORCH_FILE_FLAG(hasError)
+
+/* Generate a Lua binding that calls the side-effecting THFile_<NAME>
+   on the file and returns the file itself (for call chaining). */
+#define IMPLEMENT_TORCH_FILE_FUNC(NAME)                   \
+  static int torch_File_##NAME(lua_State *L)              \
+  {                                                       \
+    THFile *self = luaT_checkudata(L, 1, "torch.File");  \
+    THFile_##NAME(self);                                  \
+    lua_settop(L, 1);                                     \
+    return 1;                                             \
+  }
+
+IMPLEMENT_TORCH_FILE_FUNC(binary)
+IMPLEMENT_TORCH_FILE_FUNC(ascii)
+IMPLEMENT_TORCH_FILE_FUNC(autoSpacing)
+IMPLEMENT_TORCH_FILE_FUNC(noAutoSpacing)
+IMPLEMENT_TORCH_FILE_FUNC(quiet)
+IMPLEMENT_TORCH_FILE_FUNC(pedantic)
+IMPLEMENT_TORCH_FILE_FUNC(clearError)
+
+IMPLEMENT_TORCH_FILE_FUNC(synchronize)
+
+/* Lua method file:seek(pos): absolute seek. Lua positions are 1-based,
+   so 1 is subtracted before handing the 0-based offset to THFile_seek.
+   Returns the file for chaining. */
+static int torch_File_seek(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.File");
+  long position = luaL_checklong(L, 2)-1;
+  // >= 0 because it has 1 already subtracted
+  THArgCheck(position >= 0, 2, "position has to be greater than 0!");
+  THFile_seek(self, (size_t)position);
+  lua_settop(L, 1);
+  return 1;
+}
+
+IMPLEMENT_TORCH_FILE_FUNC(seekEnd)
+
+/* Lua method file:position(): current offset, converted back to Lua's
+   1-based convention (mirrors the -1 adjustment done in seek). */
+static int torch_File_position(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.File");
+  lua_pushnumber(L, THFile_position(self)+1);
+  return 1;
+}
+
+IMPLEMENT_TORCH_FILE_FUNC(close)
+
+/* Generate the read<TYPEC>/write<TYPEC> Lua bindings for one element type.
+   read forms:  f:readX()        -> scalar
+                f:readX(n)       -> new torch.XStorage (shrunk on short read)
+                f:readX(storage) -> number of elements read into storage
+   write forms: f:writeX(number) -> nothing
+                f:writeX(storage)-> number of elements written.
+   Any other argument pattern raises a Lua error. */
+#define IMPLEMENT_TORCH_FILE_RW(TYPEC, TYPE)                            \
+  static int torch_File_read##TYPEC(lua_State *L)                       \
+  {                                                                     \
+    THFile *self = luaT_checkudata(L, 1, "torch.File");                \
+    int narg = lua_gettop(L);                                           \
+                                                                        \
+    if(narg == 1)                                                       \
+    {                                                                   \
+      lua_pushnumber(L, THFile_read##TYPEC##Scalar(self));              \
+      return 1;                                                         \
+    }                                                                   \
+    else if(narg == 2)                                                  \
+    {                                                                   \
+      if(lua_isnumber(L, 2))                                            \
+      {                                                                 \
+        long size = lua_tonumber(L, 2);                                 \
+        long nread;                                                     \
+                                                                        \
+        TH##TYPEC##Storage *storage = TH##TYPEC##Storage_newWithSize(size); \
+        luaT_pushudata(L, storage, "torch." #TYPEC "Storage");          \
+        nread = THFile_read##TYPEC(self, storage);                      \
+        if(nread != size)                                               \
+          TH##TYPEC##Storage_resize(storage, nread);                    \
+        return 1;                                                       \
+      }                                                                 \
+      else if(luaT_toudata(L, 2, "torch." #TYPEC "Storage"))            \
+      {                                                                 \
+        TH##TYPEC##Storage *storage = luaT_toudata(L, 2, "torch." #TYPEC "Storage"); \
+        lua_pushnumber(L, THFile_read##TYPEC(self, storage));           \
+        return 1;                                                       \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    luaL_error(L, "nothing, number, or " #TYPEC "Storage expected");    \
+    return 0;                                                           \
+  }                                                                     \
+                                                                        \
+  static int torch_File_write##TYPEC(lua_State *L)                      \
+  {                                                                     \
+    THFile *self = luaT_checkudata(L, 1, "torch.File");                \
+    int narg = lua_gettop(L);                                           \
+                                                                        \
+    if(narg == 2)                                                       \
+    {                                                                   \
+      if(lua_isnumber(L, 2))                                            \
+      {                                                                 \
+        TYPE value = lua_tonumber(L, 2);                                \
+        THFile_write##TYPEC##Scalar(self, (TYPE)value);                 \
+        return 0;                                                       \
+      }                                                                 \
+      else if(luaT_toudata(L, 2, "torch." #TYPEC "Storage"))            \
+      {                                                                 \
+        TH##TYPEC##Storage *storage = luaT_toudata(L, 2, "torch." #TYPEC "Storage"); \
+        lua_pushnumber(L, THFile_write##TYPEC(self, storage));          \
+        return 1;                                                       \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    luaL_error(L, "number, or " #TYPEC "Storage expected");             \
+    return 0;                                                           \
+  }
+
+
+/* Instantiate read/write bindings for every TH element type. */
+IMPLEMENT_TORCH_FILE_RW(Byte, unsigned char)
+IMPLEMENT_TORCH_FILE_RW(Char, char)
+IMPLEMENT_TORCH_FILE_RW(Short, short)
+IMPLEMENT_TORCH_FILE_RW(Int, int)
+IMPLEMENT_TORCH_FILE_RW(Long, long)
+IMPLEMENT_TORCH_FILE_RW(Float, float)
+IMPLEMENT_TORCH_FILE_RW(Double, double)
+
+/* Lua method file:readString(format): read a string according to the
+   THFile raw-string format argument and return it as a Lua string.
+   The C buffer allocated by THFile_readStringRaw is copied into the Lua
+   string and then released with THFree, so no ownership escapes. */
+static int torch_File_readString(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.File");
+  const char *format = luaL_checkstring(L, 2);
+  char *str;
+  long size;
+
+  size = THFile_readStringRaw(self, format, &str);
+  lua_pushlstring(L, str, size);
+  THFree(str);
+
+  return 1;
+}
+
+/* Lua method file:writeString(str): write the raw bytes of a Lua string
+   (embedded NULs included, since lua_tolstring supplies the length) and
+   return the number of bytes written. */
+static int torch_File_writeString(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.File");
+  const char *str = NULL;
+  size_t size;
+
+  luaL_checktype(L, 2, LUA_TSTRING);
+  str = lua_tolstring(L, 2, &size);
+  lua_pushnumber(L, THFile_writeStringRaw(self, str, (long)size));
+  return 1;
+}
+
+/* Method table for the abstract torch.File class; concrete subclasses
+   (DiskFile, MemoryFile, PipeFile) inherit all of these. */
+static const struct luaL_Reg torch_File__ [] = {
+  {"isQuiet", torch_File_isQuiet},
+  {"isReadable", torch_File_isReadable},
+  {"isWritable", torch_File_isWritable},
+  {"isBinary", torch_File_isBinary},
+  {"isAutoSpacing", torch_File_isAutoSpacing},
+  {"hasError", torch_File_hasError},
+  {"binary", torch_File_binary},
+  {"ascii", torch_File_ascii},
+  {"autoSpacing", torch_File_autoSpacing},
+  {"noAutoSpacing", torch_File_noAutoSpacing},
+  {"quiet", torch_File_quiet},
+  {"pedantic", torch_File_pedantic},
+  {"clearError", torch_File_clearError},
+
+  /* DEBUG: CHECK DISK FREE & READ/WRITE STRING*/
+
+  {"readByte", torch_File_readByte},
+  {"readChar", torch_File_readChar},
+  {"readShort", torch_File_readShort},
+  {"readInt", torch_File_readInt},
+  {"readLong", torch_File_readLong},
+  {"readFloat", torch_File_readFloat},
+  {"readDouble", torch_File_readDouble},
+  {"readString", torch_File_readString},
+
+  {"writeByte", torch_File_writeByte},
+  {"writeChar", torch_File_writeChar},
+  {"writeShort", torch_File_writeShort},
+  {"writeInt", torch_File_writeInt},
+  {"writeLong", torch_File_writeLong},
+  {"writeFloat", torch_File_writeFloat},
+  {"writeDouble", torch_File_writeDouble},
+  {"writeString", torch_File_writeString},
+
+  {"synchronize", torch_File_synchronize},
+  {"seek", torch_File_seek},
+  {"seekEnd", torch_File_seekEnd},
+  {"position", torch_File_position},
+  {"close", torch_File_close},
+
+  {NULL, NULL}
+};
+
+/* Register the abstract torch.File metatable (no parent, no constructor:
+   it is never instantiated directly) and attach the method table. */
+void torch_File_init(lua_State *L)
+{
+  luaT_newmetatable(L, "torch.File", NULL, NULL, NULL, NULL);
+  luaT_setfuncs(L, torch_File__, 0);
+  lua_pop(L, 1);
+}
diff --git a/File.lua b/File.lua
new file mode 100644
index 0000000..1cc4dfe
--- /dev/null
+++ b/File.lua
@@ -0,0 +1,454 @@
+-- Lua-side extensions to torch.File: boolean helpers plus the generic
+-- object (de)serialization protocol backing torch.save/torch.load.
+local File = torch.getmetatable('torch.File')
+
+-- Booleans are serialized as a full int: 1 for true, 0 for false.
+function File:writeBool(value)
+   if value then
+      self:writeInt(1)
+   else
+      self:writeInt(0)
+   end
+end
+
+function File:readBool()
+   return (self:readInt() == 1)
+end
+
+-- Type tags written before every serialized value. TYPE_FUNCTION and
+-- LEGACY_TYPE_RECUR_FUNCTION are legacy formats kept for reading old
+-- files; new files write TYPE_RECUR_FUNCTION.
+local TYPE_NIL      = 0
+local TYPE_NUMBER   = 1
+local TYPE_STRING   = 2
+local TYPE_TABLE    = 3
+local TYPE_TORCH    = 4
+local TYPE_BOOLEAN  = 5
+local TYPE_FUNCTION = 6
+local TYPE_RECUR_FUNCTION = 8
+local LEGACY_TYPE_RECUR_FUNCTION = 7
+
+-- Lua 5.2 compatibility
+local loadstring = loadstring or load
+
+-- Classify an object for serialization, returning its TYPE_* tag, or nil
+-- when the object cannot be written (e.g. userdata without a torch
+-- factory, threads, or functions whose bytecode string.dump rejects).
+function File:isWritableObject(object)
+   local typename = type(object)
+   local typeidx
+   if type(object) ~= 'boolean' and not object then
+      typeidx = TYPE_NIL
+   elseif torch.typename(object) and torch.factory(torch.typename(object)) then
+      typeidx = TYPE_TORCH
+   elseif typename == 'table' then
+      typeidx = TYPE_TABLE
+   elseif typename == 'number' then
+      typeidx = TYPE_NUMBER
+   elseif typename == 'string' then
+      typeidx = TYPE_STRING
+   elseif typename == 'boolean' then
+      typeidx = TYPE_BOOLEAN
+   elseif typename == 'function' and pcall(string.dump, object) then
+      typeidx = TYPE_RECUR_FUNCTION
+   end
+   return typeidx
+end
+
+-- Enable/disable reference tracking. With ref=true (default) repeated
+-- objects are written once and later occurrences are stored as back
+-- references; with ref=false every occurrence is written in full
+-- (env.force = true). Returns self for chaining.
+function File:referenced(ref)
+   -- we use an environment to keep a record of written objects
+   if not torch.getenv(self).writeObjects then
+      torch.setenv(self, {
+            writeObjects={}, writeObjectsRef={},
+            readObjects={},
+            objectNameStack={},
+            upvalueRefToId={}, upvalueIdToClosure={},
+         })
+   end
+   local env = torch.getenv(self)
+   env.force = not ref
+   torch.setenv(self,env)
+   return self
+end
+
+-- True when reference tracking is active (the default for a fresh file).
+function File:isReferenced()
+   -- if no environment, then no forcing setup yet
+   if not torch.getenv(self).writeObjects then
+      return true
+   end
+   local env = torch.getenv(self)
+   return not env.force
+end
+
+-- Look up a serialization hook ('write'/'read') on the object's torch
+-- metatable, checking '__name' before 'name'. Returns the function or
+-- nil; wrapped in pcall because torch.typename may fail on plain values.
+local function getmetamethod(obj, name)
+   local func
+   local status
+
+   -- check getmetatable(obj).__name or
+   -- check getmetatable(obj).name
+   status, func = pcall(
+      function()
+         -- note that sometimes the metatable is hidden
+         -- we get it for sure through the torch type system
+         local mt = torch.getmetatable(torch.typename(obj))
+         if mt then
+            return mt['__' .. name] or mt[name]
+         end
+      end
+   )
+   if status and type(func) == 'function' then
+      return func
+   end
+end
+
+local UPVALUES_TOKEN = {} -- unique object
+
+-- Render the debug-name stack as a dotted path for error messages,
+-- hiding the internal upvalue bookkeeping entries.
+local function formatStack(objectNameStack)
+   -- Format object name stack skipping UPVALUES_TOKEN and upvalue index
+   local parts = {}
+   for i, v in ipairs(objectNameStack) do
+      if v ~= UPVALUES_TOKEN and objectNameStack[i-1] ~= UPVALUES_TOKEN then
+         table.insert(parts, v)
+      end
+   end
+   return table.concat(parts, '.')
+end
+
+-- Serialize `object` to this file. Layout: a TYPE_* tag, then for
+-- reference types an integer index (back-reference when already seen),
+-- then the payload. `debugname` labels the object in error messages;
+-- `hook(object)` may substitute the object actually persisted (e.g.
+-- Cuda->Float conversion) while book-keeping still uses the original.
+function File:writeObject(object, debugname, hook)
+   -- define a default hook function if not provided
+   hook = hook or function(object) return object end
+   -- we use an environment to keep a record of written objects
+   if not torch.getenv(self).writeObjects then
+      torch.setenv(self, {
+            writeObjects={}, writeObjectsRef={},
+            readObjects={},
+            objectNameStack={},
+            upvalueRefToId={}, upvalueIdToClosure={},
+         })
+   end
+   -- That guy is used for references' book-keeping
+   local sobject = object
+   -- That guy is the object that is actually persisted
+   -- hook(object) can be used to modify the object before writing it to the file.
+   -- Useful for serializing objects under a config
+   -- that we want to deserialize safely under another config.
+   -- (e.g. Cuda to Float tensors, cudnn to nn, ...)
+   object = hook(object)
+   local force = torch.getenv(self).force
+
+   -- if nil object, only write the type and return
+   if type(object) ~= 'boolean' and not object then
+      self:writeInt(TYPE_NIL)
+      return
+   end
+
+   local objectNameStack = torch.getenv(self).objectNameStack
+   table.insert(objectNameStack, debugname or '<?>')
+
+   -- check the type we are dealing with
+   local typeidx = self:isWritableObject(object)
+   if not typeidx then
+      error(string.format('Unwritable object <%s> at %s', type(object), formatStack(objectNameStack)))
+   end
+   self:writeInt(typeidx)
+
+   if typeidx == TYPE_NUMBER then
+      self:writeDouble(object)
+   elseif typeidx == TYPE_BOOLEAN then
+      self:writeBool(object)
+   elseif typeidx == TYPE_STRING then
+      -- strings are written as length followed by raw chars
+      local stringStorage = torch.CharStorage():string(object)
+      self:writeInt(#stringStorage)
+      self:writeChar(stringStorage)
+   elseif typeidx == TYPE_TORCH or typeidx == TYPE_TABLE or  typeidx == TYPE_RECUR_FUNCTION then
+      -- check it exists already (we look at the pointer!)
+      local objects = torch.getenv(self).writeObjects
+      local objectsRef = torch.getenv(self).writeObjectsRef
+      local index = objects[torch.pointer(sobject)]
+
+      if index and (not force) then
+         -- if already exists, write only its index
+         self:writeInt(index)
+      else
+         -- else write the object itself
+         index = objects.nWriteObject or 0
+         index = index + 1
+         if not force then
+            objects[torch.pointer(sobject)] = index
+            objectsRef[object] = index -- we make sure the object is not going to disappear
+         end
+         self:writeInt(index)
+         objects.nWriteObject = index
+         if typeidx == TYPE_RECUR_FUNCTION then
+            -- functions: dump bytecode, then serialize their upvalues as a
+            -- table of {name, id, value}; ids let shared upvalues be
+            -- rejoined on load (see readObject).
+            local upvalueRefToId = torch.getenv(self).upvalueRefToId
+            -- Unique ID for each ref since lightuserdata are not serializable
+            local nextId = 1
+            for _ in pairs(upvalueRefToId) do nextId=nextId+1 end
+            local upvalues = {}
+            local counter = 0
+            while true do
+               counter = counter + 1
+               local name,value = debug.getupvalue(object, counter)
+               if not name then break end
+               if name == '_ENV' then value = nil end
+               local id=nil
+               -- debug.upvalueid exists only for lua>=5.2 and luajit
+               if debug.upvalueid then
+                  local upvalueRef = debug.upvalueid(object, counter)
+                  if not upvalueRefToId[upvalueRef] then
+                     upvalueRefToId[upvalueRef] = nextId
+                     nextId = nextId + 1
+                  end
+                  id = upvalueRefToId[upvalueRef]
+               end
+               table.insert(upvalues, {name=name, id=id, value=value})
+            end
+            local dumped = string.dump(object)
+            local stringStorage = torch.CharStorage():string(dumped)
+            self:writeInt(#stringStorage)
+            self:writeChar(stringStorage)
+            self:writeObject(upvalues, UPVALUES_TOKEN, hook)
+         elseif typeidx == TYPE_TORCH then
+            -- torch objects: version string, class name, then either the
+            -- class's own write() hook or a plain table of writable fields.
+            local version   = torch.CharStorage():string('V ' .. torch.version(object))
+            local className = torch.CharStorage():string(torch.typename(object))
+            self:writeInt(#version)
+            self:writeChar(version)
+            self:writeInt(#className)
+            self:writeChar(className)
+            local write = getmetamethod(object, 'write')
+            if write then
+               write(object, self)
+            elseif type(object) == 'table' then
+               local var = {}
+               for k,v in pairs(object) do
+                  if self:isWritableObject(v) then
+                     var[k] = v
+                  else
+                     print(string.format('$ Warning: cannot write object field <%s> of <%s> %s', k, torch.typename(object), formatStack(objectNameStack)))
+                  end
+               end
+               self:writeObject(var, torch.typename(object), hook)
+            else
+               error(string.format('<%s> is a non-serializable Torch object %s', torch.typename(object), formatStack(objectNameStack)))
+            end
+         else -- it is a table
+            local size = 0; for k,v in pairs(object) do size = size + 1 end
+            self:writeInt(size)
+            for k,v in pairs(object) do
+               self:writeObject(k, nil, hook)
+               local name = (type(k) == 'string' or type(k) == 'number') and tostring(k) or nil
+               -- special case name for upvalues
+               if objectNameStack[#objectNameStack-1] == UPVALUES_TOKEN and
+                  name == 'value' and type(object.name) == 'string' then
+                  name = object.name
+               end
+               self:writeObject(v, name, hook)
+            end
+         end
+      end
+   else
+      error('Unwritable object')
+   end
+   table.remove(objectNameStack)
+end
+
+-- Deserialize the next object from this file; exact inverse of
+-- writeObject. Back-referenced objects are returned from the readObjects
+-- cache unless reference tracking is disabled (env.force).
+function File:readObject()
+   -- we use an environment to keep a record of read objects
+   if not torch.getenv(self).writeObjects then
+      torch.setenv(self, {
+            writeObjects={}, writeObjectsRef={},
+            readObjects={},
+            objectNameStack={},
+            upvalueRefToId={}, upvalueIdToClosure={},
+         })
+   end
+
+   local force = torch.getenv(self).force
+
+   -- read the typeidx
+   local typeidx = self:readInt()
+
+   -- is it nil?
+   if typeidx == TYPE_NIL then
+      return nil
+   end
+
+   if typeidx == TYPE_NUMBER then
+      return self:readDouble()
+   elseif typeidx == TYPE_BOOLEAN then
+      return self:readBool()
+   elseif typeidx == TYPE_STRING then
+      local size = self:readInt()
+      return self:readChar(size):string()
+   elseif typeidx == TYPE_FUNCTION then
+       -- legacy (pre-recursive) function format: bytecode + flat upvalues
+       local size = self:readInt()
+       local dumped = self:readChar(size):string()
+       local func, err = loadstring(dumped)
+       if not func then
+          error(string.format('Failed to load function from bytecode: %s', err))
+       end
+       local upvalues = self:readObject()
+       for index,upvalue in ipairs(upvalues) do
+          debug.setupvalue(func, index, upvalue)
+       end
+       return func
+   elseif typeidx == TYPE_TABLE or typeidx == TYPE_TORCH or typeidx == TYPE_RECUR_FUNCTION or typeidx == LEGACY_TYPE_RECUR_FUNCTION then
+      -- read the index
+      local index = self:readInt()
+
+      -- check it is loaded already
+      local objects = torch.getenv(self).readObjects
+      if objects[index] and not force then
+         return objects[index]
+      end
+
+      -- otherwise read it
+      if typeidx == TYPE_RECUR_FUNCTION or typeidx == LEGACY_TYPE_RECUR_FUNCTION then
+         local size = self:readInt()
+         local dumped = self:readChar(size):string()
+         local func, err = loadstring(dumped)
+         if not func then
+            error(string.format('Failed to load function from bytecode: %s', err))
+         end
+         -- register before reading upvalues so recursive refs resolve
+         if not force then
+             objects[index] = func
+         end
+         local upvalueIdToClosure = torch.getenv(self).upvalueIdToClosure
+         local upvalues = self:readObject()
+         for index,upvalue in ipairs(upvalues) do
+            if typeidx == LEGACY_TYPE_RECUR_FUNCTION then
+               debug.setupvalue(func, index, upvalue)
+            elseif upvalue.name == '_ENV' then
+               debug.setupvalue(func, index, _ENV)
+            else
+               debug.setupvalue(func, index, upvalue.value)
+               -- debug.upvaluejoin exists only for lua>=5.2 and luajit
+               if debug.upvaluejoin and upvalue.id then
+                  if upvalueIdToClosure[upvalue.id] then
+                     -- This upvalue is linked to another one
+                     local otherClosure = upvalueIdToClosure[upvalue.id]
+                     debug.upvaluejoin(func, index, otherClosure.func, otherClosure.index)
+                  else
+                     -- Save this closure for next time
+                     upvalueIdToClosure[upvalue.id] = {
+                        func = func,
+                        index = index,
+                     }
+                  end
+               end
+            end
+         end
+         return func
+      elseif typeidx == TYPE_TORCH then
+         -- torch objects: optional "V <n>" version header, class name,
+         -- then the class's read() hook or a plain field table.
+         local version, className, versionNumber
+         version = self:readChar(self:readInt()):string()
+         versionNumber = tonumber(string.match(version, '^V (.*)$'))
+         if not versionNumber then
+            className = version
+            versionNumber = 0 -- file created before existence of versioning system
+         else
+            className = self:readChar(self:readInt()):string()
+         end
+         if not torch.factory(className) then
+            error(string.format('unknown Torch class <%s>', tostring(className)))
+         end
+         local object = torch.factory(className)(self)
+         if not force then
+             objects[index] = object
+         end
+         local read = getmetamethod(object, 'read')
+         if read then
+            read(object, self, versionNumber)
+         elseif type(object) == 'table' then
+            local var = self:readObject()
+            for k,v in pairs(var) do
+               object[k] = v
+            end
+         else
+            error(string.format('Cannot load object class <%s>', tostring(className)))
+         end
+         return object
+      else -- it is a table
+         local size = self:readInt()
+         local object = {}
+         if not force then
+             objects[index] = object
+         end
+         for i = 1,size do
+            local k = self:readObject()
+            local v = self:readObject()
+            object[k] = v
+         end
+         return object
+      end
+   else
+      error('unknown object')
+   end
+end
+
+-- simple helpers to save/load arbitrary objects/tables
+-- mode: 'binary' (default) or 'ascii'; referenced: keep back-references
+-- for shared objects (default true).
+function torch.save(filename, object, mode, referenced)
+   assert(mode == nil or mode == 'binary' or mode == 'ascii', '"binary" or "ascii" (or nil) expected for mode')
+   assert(referenced == nil or referenced == true or referenced == false, 'true or false (or nil) expected for referenced')
+   mode = mode or 'binary'
+   referenced = referenced == nil and true or referenced
+   local file = torch.DiskFile(filename, 'w')
+   file[mode](file)
+   file:referenced(referenced)
+   file:writeObject(object)
+   file:close()
+end
+
+-- Load an object saved by torch.save. Extra modes 'b32'/'b64' read
+-- binary files whose longs were written with a 4- or 8-byte width
+-- (see DiskFile:longSize), for files produced on another architecture.
+function torch.load(filename, mode, referenced)
+   assert(mode == 'binary' or mode == 'b32' or mode == 'b64' or
+          mode == nil or mode == 'ascii',
+          '"binary", "b32", "b64" or "ascii" (or nil) expected for mode')
+   assert(referenced == nil or referenced == true or referenced == false,
+          'true or false (or nil) expected for referenced')
+   local longSize
+   if mode == 'b32' or mode == 'b64' then
+      longSize = tonumber(mode:match('%d+')) / 8
+      mode = 'binary'
+   end
+   mode = mode or 'binary'
+   referenced = referenced == nil and true or referenced
+   local file = torch.DiskFile(filename, 'r')
+   file[mode](file)
+   file:referenced(referenced)
+   if longSize then file:longSize(longSize) end
+   local object = file:readObject()
+   file:close()
+   return object
+end
+
+-- simple helpers to serialize/deserialize arbitrary objects/tables
+function torch.serialize(object, mode)
+   local storage = torch.serializeToStorage(object, mode)
+   return storage:string()
+end
+
+-- Serialize to a CharStorage, not a lua string. This avoids creating an
+-- intermediate Lua string copy of a potentially very large blob.
+function torch.serializeToStorage(object, mode)
+   mode = mode or 'binary'
+   local f = torch.MemoryFile()
+   f = f[mode](f)
+   f:writeObject(object)
+   local storage = f:storage()
+   -- the storage includes an extra NULL character: get rid of it
+   storage:resize(storage:size()-1)
+   f:close()
+   return storage
+end
+
+-- Inverse of serializeToStorage: re-append the NUL terminator that
+-- MemoryFile expects, then read one object back out.
+function torch.deserializeFromStorage(storage, mode)
+   mode = mode or 'binary'
+   local tx = torch.CharTensor(storage)
+   local xp = torch.CharStorage(tx:size(1)+1)
+   local txp = torch.CharTensor(xp)
+   txp:narrow(1,1,tx:size(1)):copy(tx)
+   txp[tx:size(1)+1] = 0
+   local f = torch.MemoryFile(xp)
+   f = f[mode](f)
+   local object = f:readObject()
+   f:close()
+   return object
+end
+
+function torch.deserialize(str, mode)
+   local storage = torch.CharStorage():string(str)
+   return torch.deserializeFromStorage(storage, mode)
+end
+
+-- public API (saveobj/loadobj are safe for global import)
+torch.saveobj = torch.save
+torch.loadobj = torch.load
diff --git a/Generator.c b/Generator.c
new file mode 100644
index 0000000..8cf5ba6
--- /dev/null
+++ b/Generator.c
@@ -0,0 +1,50 @@
+#include <general.h>
+
+/* Lua bindings for THGenerator, TH's random number generator state.
+   NOTE(review): this file uses #include <general.h> while MemoryFile.c and
+   PipeFile.c use #include "general.h" -- confirm which form is intended. */
+
+/* torch.Generator() -- allocate a fresh generator and push it as userdata. */
+int torch_Generator_new(lua_State *L)
+{
+  THGenerator *gen = THGenerator_new();
+  luaT_pushudata(L, gen, torch_Generator);
+  return 1;
+}
+
+/* Destructor: release the THGenerator owned by the userdata. */
+int torch_Generator_free(lua_State *L)
+{
+  THGenerator *gen= luaT_checkudata(L, 1, torch_Generator);
+  THGenerator_free(gen);
+  return 0;
+}
+
+/* gen:write(file) -- dump the generator state as raw bytes.
+   NOTE(review): writing sizeof(THGenerator) raw bytes ties the stream to
+   the host's struct layout; presumably it is only read back by a
+   compatible build -- confirm. */
+static int torch_Generator_write(lua_State *L)
+{
+  THGenerator *gen = luaT_checkudata(L, 1, torch_Generator);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  THFile_writeByteRaw(file, (unsigned char *)gen, sizeof(THGenerator));
+  return 0;
+}
+
+/* gen:read(file) -- restore state written by torch_Generator_write. */
+static int torch_Generator_read(lua_State *L)
+{
+  THGenerator *gen = luaT_checkudata(L, 1, torch_Generator);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  THFile_readByteRaw(file, (unsigned char *)gen, sizeof(THGenerator));
+  return 0;
+}
+
+
+/* methods exposed on the torch.Generator metatable */
+static const struct luaL_Reg torch_Generator_table_ [] = {
+  {"write", torch_Generator_write},
+  {"read", torch_Generator_read},
+  {NULL, NULL}
+};
+
+#define torch_Generator_factory torch_Generator_new
+
+/* Register the torch.Generator metatable and its methods. */
+void torch_Generator_init(lua_State *L)
+{
+  luaT_newmetatable(L, torch_Generator, NULL,
+                    torch_Generator_new, torch_Generator_free, torch_Generator_factory);
+  luaT_setfuncs(L, torch_Generator_table_, 0);
+  lua_pop(L, 1);
+}
diff --git a/MemoryFile.c b/MemoryFile.c
new file mode 100644
index 0000000..a22dc17
--- /dev/null
+++ b/MemoryFile.c
@@ -0,0 +1,70 @@
+#include "general.h"
+
+/* Lua bindings for torch.MemoryFile: a torch.File backed by an in-memory
+   CharStorage instead of a file descriptor. */
+
+/* torch.MemoryFile([storage,] [mode]) -- wrap an existing CharStorage, or
+   allocate a fresh one; mode defaults to "rw". */
+static int torch_MemoryFile_new(lua_State *L)
+{
+  const char *mode;
+  THCharStorage *storage = luaT_toudata(L, 1, "torch.CharStorage");
+  THFile *self;
+
+  if(storage)
+  {
+    mode = luaL_optstring(L, 2, "rw");
+    self = THMemoryFile_newWithStorage(storage, mode);
+  }
+  else
+  {
+    mode = luaL_optstring(L, 1, "rw");
+    self = THMemoryFile_new(mode);
+  }
+
+  luaT_pushudata(L, self, "torch.MemoryFile");
+  return 1;
+}
+
+/* f:storage() -- return the underlying CharStorage, retained so that the
+   Lua side holds its own reference.  (Fetch the storage once instead of
+   calling THMemoryFile_storage twice.) */
+static int torch_MemoryFile_storage(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.MemoryFile");
+  THCharStorage *storage = THMemoryFile_storage(self);
+  THCharStorage_retain(storage);
+  luaT_pushudata(L, storage, "torch.CharStorage");
+  return 1;
+}
+
+/* f:longSize(n) -- set the on-stream size (in bytes) of 'long' values and
+   return the file itself for chaining.  (Renamed from torch_longSize to
+   follow the torch_MemoryFile_* convention used by every other binding
+   in this file.) */
+static int torch_MemoryFile_longSize(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.MemoryFile");
+  THMemoryFile_longSize(self, lua_tointeger(L, 2));
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* Destructor: release the underlying THFile. */
+static int torch_MemoryFile_free(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.MemoryFile");
+  THFile_free(self);
+  return 0;
+}
+
+/* tostring() -- e.g. "torch.MemoryFile [status: open -- mode: rw]" */
+static int torch_MemoryFile___tostring__(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.MemoryFile");
+  lua_pushfstring(L, "torch.MemoryFile [status: %s -- mode: %c%c]",
+                  (THFile_isOpened(self) ? "open" : "closed"),
+                  (THFile_isReadable(self) ? 'r' : ' '),
+                  (THFile_isWritable(self) ? 'w' : ' '));
+  return 1;
+}
+
+static const struct luaL_Reg torch_MemoryFile__ [] = {
+  {"storage", torch_MemoryFile_storage},
+  {"longSize", torch_MemoryFile_longSize},
+  {"__tostring__", torch_MemoryFile___tostring__},
+  {NULL, NULL}
+};
+
+/* Register torch.MemoryFile, inheriting from torch.File. */
+void torch_MemoryFile_init(lua_State *L)
+{
+  luaT_newmetatable(L, "torch.MemoryFile", "torch.File",
+                    torch_MemoryFile_new, torch_MemoryFile_free, NULL);
+  luaT_setfuncs(L, torch_MemoryFile__, 0);
+  lua_pop(L, 1);
+}
diff --git a/PipeFile.c b/PipeFile.c
new file mode 100644
index 0000000..a47c90d
--- /dev/null
+++ b/PipeFile.c
@@ -0,0 +1,43 @@
+#include "general.h"
+
+/* Lua bindings for torch.PipeFile: a torch.File reading from / writing to
+   a command opened as a pipe. */
+
+/* torch.PipeFile(command [, mode [, quiet]]) -- mode defaults to "r". */
+static int torch_PipeFile_new(lua_State *L)
+{
+  const char *name = luaL_checkstring(L, 1);
+  const char *mode = luaL_optstring(L, 2, "r");
+  int isQuiet = luaT_optboolean(L, 3, 0);
+  THFile *self = THPipeFile_new(name, mode, isQuiet);
+
+  luaT_pushudata(L, self, "torch.PipeFile");
+  return 1;
+}
+
+/* Destructor: release the underlying THFile. */
+static int torch_PipeFile_free(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.PipeFile");
+  THFile_free(self);
+  return 0;
+}
+
+/* tostring() -- THDiskFile_name is applicable here because torch.PipeFile
+   derives from torch.DiskFile (see torch_PipeFile_init below). */
+static int torch_PipeFile___tostring__(lua_State *L)
+{
+  THFile *self = luaT_checkudata(L, 1, "torch.PipeFile");
+  lua_pushfstring(L, "torch.PipeFile on <%s> [status: %s -- mode: %c%c]",
+                  THDiskFile_name(self),
+                  (THFile_isOpened(self) ? "open" : "closed"),
+                  (THFile_isReadable(self) ? 'r' : ' '),
+                  (THFile_isWritable(self) ? 'w' : ' '));
+  return 1;
+}
+
+static const struct luaL_Reg torch_PipeFile__ [] = {
+  {"__tostring__", torch_PipeFile___tostring__},
+  {NULL, NULL}
+};
+
+/* Register torch.PipeFile, inheriting from torch.DiskFile. */
+void torch_PipeFile_init(lua_State *L)
+{
+  luaT_newmetatable(L, "torch.PipeFile", "torch.DiskFile",
+                    torch_PipeFile_new, torch_PipeFile_free, NULL);
+  luaT_setfuncs(L, torch_PipeFile__, 0);
+  lua_pop(L, 1);
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..96f0dd8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,45 @@
+[![Join the chat at https://gitter.im/torch/torch7](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/torch/torch7?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Build Status](https://travis-ci.org/torch/torch7.svg)](https://travis-ci.org/torch/torch7)
+
+## Need help? ##
+
+* Questions, Support, Install issues: [Google groups](https://groups.google.com/forum/#!forum/torch7)
+* Reporting bugs: [torch7](https://github.com/torch/torch7/issues) [nn](https://github.com/torch/nn/issues) [cutorch](https://github.com/torch/cutorch/issues) [cunn](https://github.com/torch/cunn/issues) [optim](https://github.com/torch/optim/issues) [threads](https://github.com/torch/threads/issues)
+* Hanging out with other developers and users (strictly no install issues, no large blobs of text): [Gitter Chat](https://gitter.im/torch/torch7)
+
+<a name="torch.reference.dok"/>
+# Torch Package Reference Manual #
+
+__Torch__ is the main package in [Torch7](http://torch.ch) where data
+structures for multi-dimensional tensors and mathematical operations
+over these are defined. Additionally, it provides many utilities for
+accessing files, serializing objects of arbitrary types and other
+useful utilities.
+
+<a name="torch.overview.dok"/>
+## Torch Packages ##
+
+  * Tensor Library
+    * [Tensor](doc/tensor.md) defines the _all powerful_ tensor object that provides multi-dimensional numerical arrays with type templating.
+    * [Mathematical operations](doc/maths.md) that are defined for the tensor object types.
+    * [Storage](doc/storage.md) defines a simple storage interface that controls the underlying storage for any tensor object.
+  * File I/O Interface Library
+    * [File](doc/file.md) is an abstract interface for common file operations.
+    * [Disk File](doc/diskfile.md) defines operations on files stored on disk.
+    * [Memory File](doc/memoryfile.md) defines operations on files stored in RAM.
+    * [Pipe File](doc/pipefile.md) defines operations for using piped commands.
+    * [High-Level File operations](doc/serialization.md) defines higher-level serialization functions.
+  * Useful Utilities
+    * [Timer](doc/timer.md) provides functionality for _measuring time_.
+    * [Tester](doc/tester.md) is a generic tester framework.
+    * [CmdLine](doc/cmdline.md) is a command line argument parsing utility.
+    * [Random](doc/random.md) defines a random number generator package with various distributions.
+    * Finally useful [utility](doc/utility.md) functions are provided for easy handling of torch tensor types and class inheritance.
+
+<a name="torch.links.dok"/>
+## Useful Links ##
+
+  * [Community packages](https://github.com/torch/torch7/wiki/Cheatsheet)
+  * [Torch Blog](http://torch.ch/blog/)
+  * [Torch Slides](https://github.com/soumith/cvpr2015/blob/master/cvpr-torch.pdf)
+
diff --git a/ROADMAP.md b/ROADMAP.md
new file mode 100644
index 0000000..cb9c5ad
--- /dev/null
+++ b/ROADMAP.md
@@ -0,0 +1,144 @@
+
+# Torch Roadmap (August 2015 - March 2016)
+
+This roadmap document is intended to serve as a loose plan of our vision for Torch in the short term.  
+It is open to community feedback and contribution and only intends to serve as an initial draft.  
+After community feedback, we shall freeze it and work on it.  
+
+The roadmap focuses on five separate things
+
+- Core development: improving the core technically. Design changes, code refactors, performance, they go here.
+- Documentation and Accessibility: Outlining the changes in documentation, and improving general user and developer documentation in various ways.
+- Versioning and Packaging: Planned and much needed changes to the packaging of Torch are discussed here.
+- Continuous Build Infrastructure: Making our continuous builds more robust, introducing CUDA and OpenCL contbuilds etc.
+- Other improvements
+
+
+## Torch Core Project Development
+
+ - New class system:
+   - **[definite]** with no global side-effects (i.e. the class constructor should be scoped into its parent package)
+     Get rid of every statement/system that has a global effect on the environment (torch.setdefaulttensortype => dangerous and not clean)
+   - **[needs discussion]** fully serializable (i.e. when deserializing/reloading a model, there shouldn't be a need to load libraries that defined the class originally, like nn; the class definition should be serialized as well: this would remove a lot of backward compatibility hacks that we have to add to class definitions currently
+       - **koray**: I like this, but wouldn't it break backward compatibility?
+		            Currently, whatever we serialize, it is just the data and implementation is defined
+					at load time, so if a bug is fixed (or introduced) you use that.
+					And it starts being ambiguous, what if I load a layer from file and
+					create a new one and their implementation is inconsistent...)
+ - **[definite]** Get rid of non-tensor-related stuff (like serialization) in TH, and move it to lua side
+ - **[needs discussion]** OpenMP: Should it stay or go? Is Threads sufficient?
+       - **Ronan**: I really wonder about this guy, especially now that I have been using threads intensively. I am not sure that fine-grained threading is necessary.
+	   - **koray**: I guess you mean with threading, there is no need for OpenMP, but I disagree.
+	          Our convolution layer will use multiple threads and then if we run a ReLu over a huge state space, it would become embarrassingly slow.
+			  We shouldn't expect everyone to run their experiments in a threading framework. It is more work than necessary sometimes.)
+ - **[needs discussion]** Templated C++ in TH Core?
+                    - **Ronan**: Should I cleanup TH core? In the end, I am scared to move to C++, but some iterators based taking a closure could be nice (I have some of those that I could add easily).
+					         I could move to C++ if it was only template + keeping pointers (and not C++11/14/17, because that would limit the number of users that it can reach because of the latest compilers needed etc.).
+ - **[definite]** Migrate to a single, better/modern testing support
+              - **koray**: like some aspects of Totem, but should be in core Tester
+ - **[definite]** Benchmarking support in Tester
+ - **[definite]** Consistent testing scripts across all core projects
+ - **[definite]** 'nn' container unified interface between containers and graph
+ - **[mostly definite]** Switch to batch only assumption in 'nn'. Right now, the code is unnecessarily complicated for stochastic/batch confusion, we needed extra functions like nInputDims and such.
+ - **[needs discussion]** Support named arguments in the constructor for all 'nn' layers.
+ - **[definite]** 'rnn' package.
+      - **Soumith**: Nicholas Leonard's seems to be a good one.
+ - **[mostly definite]** argcheck for all core functions in torch. Get rid of cwrap's ugliness.
+ - **[definite]** improve paths to support more file system operations
+       - **Clement**: could lfs and penlight be made more standard? penlight is a heavy package but provides so much utility
+	   - **Soumith**: I think penlight is lightweight and provides strong utility, definitely consider dependence.
+ - **[definite]** JIT/Lua/FFI/GC:
+   - **koray**: I think Torch should be agnostic to whatever is the backend;
+   - **clement**: yes!
+   - at this point, we need to have all core packages use the regular Lua api (almost the case)
+     - **Ronan**: agreed.
+
+- **[definite]** plan to have standalone FFI?
+  - Facebook releases their puc LUA based FFI package mostly improved by Sam Gross
+  - [needs discussion] **Ronan** improves it a bit more to use Leon's C99 parser
+                         - **Koray**: I am not opposed to Leon's C99 parser, but we should not have the QT like situation where
+						       it relies mostly on Leon to maintain it.
+							   And, still we need to have FFI since there are people and packages that rely on it now.
+- **[definite]** Lua 5.2 migration (I think it's already finished ;) ).
+- **[mostly definite]** Lua 5.3 migration
+- **[mostly definite]** Optionally replace GC by Ref-counting (existing version in luajit-rocks; but completely broken but will need to be fixed)
+- **[needs discussion]** Make OpenCL support more visible under torch/opencl (**Soumith**: Hugh Perkins will maintain it of course ;) ).
+- **[definite]** Split nn into THNN and nn. THNN would be NN package using TH as backend and nn would be the lua layer. THNN can be used as a standalone C library. Same for cunn
+- **[Definite]** CUDA typed tensor support - CudaHalfTensor CudaDoubleTensor etc.
+- **[Definite]** better plotting support
+- **[needs discussion]** UI package that doesn't suck?
+  - **Ronan**: something based on cairo?
+    - **clement**: not sure if this would have much adoption
+    - **Ronan**: yes, it is a worry. I started to do some fancy stuff there, it is not that hard.
+	         However, I would need quite some time to polish it.
+			 I think having something fully customizable from lua really 
+                         makes a difference (rather than something like Qt, for example). 
+  - something based on a web client?
+      - **clement**: i like the idea of itorch but could never easily build it, build process is too big.
+      - **Ronan**: I cannot use something which forces me to use global variables.
+      - **koray**: I think at the end of the day, we need to have both a GUI client and a web based client.
+		   My main problem with web based clients is that I can't easily create 
+                   custom displays to play an animation or such.
+		   It is an offline process that I need to generate a movie and then load it in.
+		   This and similar things make it hard to use for me.
+		   Also, I agree, I actually could not install iTorch on my laptop 
+                   before cvpr tutorial somehow, it did not want to work :).
+  - **soumith**: I think we should propose a common display API that any interface can implement, 
+                 that way the users dont need to change scripts across different UI backends.
+	         Also, szym/display is a good candidate for the Web UI, ITorch is indeed a bit of a pain to install.
+
+  - Should we endorse iTorch for everyone to use? 
+    - **Ronan**: I know **Soumith** likes it, but I am not a big fan. 
+    -            Heavy+encourages the use of global variables. Excellent for tutorials, though.
+ 	   - This ties to the first question in **Other Questions** section.
+ 	   - Can we/community do pull requests on iTorch? ( **Soumith**: Yes )
+ 	   - First step would be to leanify dependencies and/or install procedure (**Soumith**: agreed)
+- **[needs discussion]** How about Penlight? It has many crucial things that people use.
+   Should we endorse it, use some things from it? Replicate some things in penlight in torch?
+   - **clement**: upvoting this! we use it extensively.
+   - **Ronan**: I live better with less abstractions, but I can be convinced there.
+          However, I find penlight quite big.
+          There are things like the classes that I do not like as well (because of the way they chose for creating classes).
+- **[needs discussion]** how about Moses? New lean functional package that's pretty useful
+- **[definite]** A style guide
+  - Guidelines are super important:
+    - for Lua: at least impose strict camel case + 3 spaces (no tab)
+    - for C: camel case + use of underscore to represent namespace scoping + 2 spaces
+
+## Documentation + Accessibility
+
+ - Tutorials: provide guidelines and basic framework/standard to write and publish tutorials?
+ - Universal dataset API
+   - Dataset classes for several popular datasets
+   - high performance, thread support etc.
+   - support CPU and GPU
+ - Model Zoo + Training scripts, with training scripts we can highlight Torch's strengths
+  - How do we build a super friendly model zoo? git repo of pre-trained models?
+    - Better documentation support, have a doc server
+ 	- Documentation for TH/THC interface and design
+ 	- Inline documentation parser
+ - doc/shell integration (maybe this is still working but needs redoing?)
+
+## Versioning + Packaging
+ - Package owners need to start releasing frequent versions (i.e. torch v7.0.1, 7.0.2, ...)
+ - scm packages should become deprecated
+ - Packages need to avoid global side effects, and return themselves as simple tables (Lua 5.2 started enforcing this on the C side)
+ - Provide standard AMI instances that people can launch (already loosely done by the community). We can load it with many standard+optional packages and/or provide one line option to update to latest.
+
+## Build Infrastructure Requirements
+ - Prepare core distro release
+ - Professional Continuous build for distro and individual core projects
+ - Continuous build for GPU
+ 	- continuous build should include testing
+ - The distro should be built and tested at every pull into any of the member projects
+ - CI for Linux and OSX
+
+## Other Questions?
+ - If there is a project that seems good from outside or consortium, how do we endorse/improve/modify that?
+ 	- do we put some technical criteria to do that?
+ 	- being able to do pull requests?
+	- Licensing?
+ 	- or maybe maintain a list of suggested packages?
+ 	- when does existence of a package stop us from developing the same in core torch?
+	- **Soumith**: I think this should largely be community driven and by popularity. Top starred or watched repos in the ecosystem would be a good start.
+ 	
diff --git a/Storage.c b/Storage.c
new file mode 100644
index 0000000..28c4e87
--- /dev/null
+++ b/Storage.c
@@ -0,0 +1,9 @@
+#include "general.h"
+
+#define torch_Storage_(NAME) TH_CONCAT_4(torch_,Real,Storage_,NAME)
+#define THFile_readRealRaw TH_CONCAT_3(THFile_read, Real, Raw)
+#define THFile_writeRealRaw TH_CONCAT_3(THFile_write, Real, Raw)
+#define torch_Storage TH_CONCAT_STRING_3(torch.,Real,Storage)
+
+#include "generic/Storage.c"
+#include "THGenerateAllTypes.h"
diff --git a/Tensor.c b/Tensor.c
new file mode 100644
index 0000000..4bfbc6a
--- /dev/null
+++ b/Tensor.c
@@ -0,0 +1,9 @@
+#include "general.h"
+
+#define torch_Storage_(NAME) TH_CONCAT_4(torch_,Real,Storage_,NAME)
+#define torch_Storage TH_CONCAT_STRING_3(torch.,Real,Storage)
+#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,Real,Tensor_,NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor)
+
+#include "generic/Tensor.c"
+#include "THGenerateAllTypes.h"
diff --git a/Tensor.lua b/Tensor.lua
new file mode 100644
index 0000000..0d573aa
--- /dev/null
+++ b/Tensor.lua
@@ -0,0 +1,561 @@
+-- additional methods for Storage; merged into every Storage metatable below
+local Storage = {}
+
+-- additional methods for Tensor
+local Tensor = {}
+
+-- types
+local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Double'}
+
+-- Lua 5.2 compatibility (math.log10 was removed in 5.2)
+local log10 = math.log10 or function(x) return math.log(x, 10) end
+
+-- tostring() functions for Tensor and Storage
+-- Compute a printf-style format string, an optional common scale factor
+-- and the field width used to pretty-print the values of Storage `self`.
+-- Returns format, scale (nil when not needed), width.
+local function Storage__printformat(self)
+   if self:size() == 0 then 
+     return "", nil, 0
+   end
+   -- intMode: true when every element is integral, so a "%d" format fits
+   local intMode = true
+   local type = torch.typename(self)
+--   if type == 'torch.FloatStorage' or type == 'torch.DoubleStorage' then
+      for i=1,self:size() do
+         if self[i] ~= math.ceil(self[i]) then
+            intMode = false
+            break
+         end
+      end
+--   end
+   -- copy into a DoubleTensor to compute min/max magnitudes
+   local tensor = torch.DoubleTensor(torch.DoubleStorage(self:size()):copy(self), 1, self:size()):abs()
+   local expMin = tensor:min()
+   if expMin ~= 0 then
+      expMin = math.floor(log10(expMin)) + 1
+   else
+      expMin = 1
+   end
+   local expMax = tensor:max()
+   if expMax ~= 0 then
+      expMax = math.floor(log10(expMax)) + 1
+   else
+      expMax = 1
+   end
+
+   local format
+   local scale
+   local sz
+   if intMode then
+      if expMax > 9 then
+         format = "%11.4e"
+         sz = 11
+      else
+         format = "%SZd"
+         sz = expMax + 1
+      end
+   else
+      if expMax-expMin > 4 then
+         format = "%SZ.4e"
+         sz = 11
+         if math.abs(expMax) > 99 or math.abs(expMin) > 99 then
+            sz = sz + 1
+         end
+      else
+         if expMax > 5 or expMax < 0 then
+            format = "%SZ.4f"
+            sz = 7
+            scale = math.pow(10, expMax-1)
+         else
+            format = "%SZ.4f"
+            if expMax == 0 then
+               sz = 7
+            else
+               sz = expMax+6
+            end
+         end
+      end
+   end
+   -- 'SZ' is a placeholder for the computed field width
+   format = string.gsub(format, 'SZ', sz)
+   if scale == 1 then
+      scale = nil
+   end
+   return format, scale, sz
+end
+
+-- Human-readable rendering of a Storage: one value per line, preceded by
+-- a common scale factor when Storage__printformat decided one is needed,
+-- followed by a "[torch.XStorage of size N]" footer.
+function Storage.__tostring__(self)
+   local strt = {}
+   local format,scale = Storage__printformat(self)
+   if format:sub(2,4) == 'nan' then format = '%f' end
+   if scale then
+      table.insert(strt, string.format('%g', scale) .. ' *\n')
+      for i = 1,self:size() do
+         table.insert(strt, string.format(format, self[i]/scale) .. '\n')
+      end
+   else
+      for i = 1,self:size() do
+         table.insert(strt, string.format(format, self[i]) .. '\n')
+      end
+   end
+   table.insert(strt, '[' .. torch.typename(self) .. ' of size ' .. self:size() .. ']\n')
+   local str = table.concat(strt)
+   return str
+end
+
+-- install the Storage helpers above on every concrete Storage metatable
+for _,type in ipairs(types) do
+   local metatable = torch.getmetatable('torch.' .. type .. 'Storage')
+   for funcname, func in pairs(Storage) do
+      rawset(metatable, funcname, func)
+   end
+end
+
+-- Pretty-print a 2D tensor, wrapping at 80 columns; wide matrices are
+-- split into "Columns a to b" sections.  `indent` prefixes every row.
+local function Tensor__printMatrix(self, indent)
+   local format,scale,sz = Storage__printformat(self:storage())
+   if format:sub(2,4) == 'nan' then format = '%f' end
+--   print('format = ' .. format)
+   scale = scale or 1
+   indent = indent or ''
+   local strt = {indent}
+   -- how many columns of width sz (plus a separating space) fit in 80 chars
+   local nColumnPerLine = math.floor((80-#indent)/(sz+1))
+--   print('sz = ' .. sz .. ' and nColumnPerLine = ' .. nColumnPerLine)
+   local firstColumn = 1
+   local lastColumn = -1
+   while firstColumn <= self:size(2) do
+      if firstColumn + nColumnPerLine - 1 <= self:size(2) then
+         lastColumn = firstColumn + nColumnPerLine - 1
+      else
+         lastColumn = self:size(2)
+      end
+      if nColumnPerLine < self:size(2) then
+         if firstColumn ~= 1 then
+            table.insert(strt, '\n')
+         end
+         table.insert(strt, 'Columns ' .. firstColumn .. ' to ' .. lastColumn .. '\n' .. indent)
+      end
+      if scale ~= 1 then
+         table.insert(strt, string.format('%g', scale) .. ' *\n ' .. indent)
+      end
+      for l=1,self:size(1) do
+         local row = self:select(1, l)
+         for c=firstColumn,lastColumn do
+            table.insert(strt, string.format(format, row[c]/scale))
+            if c == lastColumn then
+               table.insert(strt, '\n')
+               if l~=self:size(1) then
+                  if scale ~= 1 then
+                     table.insert(strt, indent .. ' ')
+                  else
+                     table.insert(strt, indent)
+                  end
+               end
+            else
+               table.insert(strt, ' ')
+            end
+         end
+      end
+      firstColumn = lastColumn + 1
+   end
+   local str = table.concat(strt)
+   return str
+end
+
+-- Pretty-print a tensor with more than two dimensions: iterate over every
+-- 2D slice (odometer-style counter over the first nDim-2 indices) and
+-- print each slice via Tensor__printMatrix under a "(i,j,.,.) =" header.
+local function Tensor__printTensor(self)
+   local counter = torch.LongStorage(self:nDimension()-2)
+   local strt = {''}
+   local finished
+   counter:fill(1)
+   counter[1] = 0
+   while true do
+      -- advance the multi-index; sets `finished` after the last slice
+      for i=1,self:nDimension()-2 do
+         counter[i] = counter[i] + 1
+         if counter[i] > self:size(i) then
+            if i == self:nDimension()-2 then
+               finished = true
+               break
+            end
+            counter[i] = 1
+         else
+            break
+         end
+      end
+      if finished then
+         break
+      end
+--      print(counter)
+      if #strt > 1 then
+         table.insert(strt, '\n')
+      end
+      table.insert(strt, '(')
+      local tensor = self
+      for i=1,self:nDimension()-2 do
+         tensor = tensor:select(1, counter[i])
+         table.insert(strt, counter[i] .. ',')
+      end
+      table.insert(strt, '.,.) = \n')
+      table.insert(strt, Tensor__printMatrix(tensor, ' '))
+   end
+   return table.concat(strt)
+end
+
+-- Human-readable rendering of a Tensor of any dimensionality, with a
+-- "[torch.XTensor of size AxBx...]" footer.  Values are converted to a
+-- DoubleTensor copy for formatting.
+function Tensor.__tostring__(self)
+   local strt = {''}
+   if self:nDimension() == 0 then
+      table.insert(strt, '[' .. torch.typename(self) .. ' with no dimension]\n')
+   else
+      local tensor = torch.DoubleTensor():resize(self:size()):copy(self)
+      if tensor:nDimension() == 1 then
+         local format,scale,sz = Storage__printformat(tensor:storage())
+         if format:sub(2,4) == 'nan' then format = '%f' end
+         if scale then
+            table.insert(strt, string.format('%g', scale) .. ' *\n')
+            for i = 1,tensor:size(1) do
+               table.insert(strt, string.format(format, tensor[i]/scale) .. '\n')
+            end
+         else
+            for i = 1,tensor:size(1) do
+               table.insert(strt, string.format(format, tensor[i]) .. '\n')
+            end
+         end
+         table.insert(strt, '[' .. torch.typename(self) .. ' of size ' .. tensor:size(1) .. ']\n')
+      elseif tensor:nDimension() == 2 then
+         table.insert(strt, Tensor__printMatrix(tensor))
+         table.insert(strt, '[' .. torch.typename(self) .. ' of size ' .. tensor:size(1) .. 'x' .. tensor:size(2) .. ']\n')
+      else
+         table.insert(strt, Tensor__printTensor(tensor))
+         table.insert(strt, '[' .. torch.typename(self) .. ' of size ')
+         for i=1,tensor:nDimension() do
+            table.insert(strt, tensor:size(i))
+            if i ~= tensor:nDimension() then
+               table.insert(strt, 'x')
+            end
+         end
+         table.insert(strt, ']\n')
+      end
+   end
+   return table.concat(strt)
+end
+
+-- Convert the tensor to the given type name (e.g. 'torch.FloatTensor').
+-- With no argument, returns the current type name.  Returns self when the
+-- type already matches; otherwise returns a new tensor with copied data.
+function Tensor.type(self,type)
+   local current = torch.typename(self)
+   if not type then return current end
+   if type ~= current then
+      local new = torch.getmetatable(type).new()
+      if self:nElement() > 0 then
+         new:resize(self:size()):copy(self)
+      end
+      return new
+   else
+      return self
+   end
+end
+
+-- Shorthand conversions, all implemented on top of Tensor.type above.
+function Tensor.typeAs(self,tensor)
+   return self:type(tensor:type())
+end
+
+function Tensor.byte(self)
+   return self:type('torch.ByteTensor')
+end
+
+function Tensor.char(self)
+   return self:type('torch.CharTensor')
+end
+
+function Tensor.short(self)
+   return self:type('torch.ShortTensor')
+end
+
+function Tensor.int(self)
+   return self:type('torch.IntTensor')
+end
+
+function Tensor.long(self)
+   return self:type('torch.LongTensor')
+end
+
+function Tensor.float(self)
+   return self:type('torch.FloatTensor')
+end
+
+function Tensor.double(self)
+   return self:type('torch.DoubleTensor')
+end
+
+-- convert to the current default tensor type
+function Tensor.real(self)
+   return self:type(torch.getdefaulttensortype())
+end
+
+-- Expand singleton dimensions of `tensor` to the requested sizes by
+-- setting their stride to 0 (no data is copied -- the result is a view).
+-- Callable as result:expand(tensor, sizes...) or tensor:expand(sizes...);
+-- sizes may be numbers or a single torch.LongStorage.
+function Tensor.expand(result,tensor,...)
+   -- get sizes
+   local sizes = {...}
+
+   local t = torch.type(tensor)
+   -- no result tensor given: shift arguments and allocate a fresh view
+   if (t == 'number' or t == 'torch.LongStorage') then
+      table.insert(sizes,1,tensor)
+      tensor = result
+      result = tensor.new()
+   end
+
+   -- check type
+   local size
+   if torch.type(sizes[1])=='torch.LongStorage' then
+      size = sizes[1]
+   else
+      size = torch.LongStorage(#sizes)
+      for i,s in ipairs(sizes) do
+         size[i] = s
+      end
+   end
+
+   -- get dimensions
+   local tensor_dim = tensor:dim()
+   local tensor_stride = tensor:stride()
+   local tensor_size = tensor:size()
+
+   -- check nb of dimensions
+   if #size ~= tensor:dim() then
+      error('the number of dimensions provided must equal tensor:dim()')
+   end
+
+   -- create a new geometry for tensor:
+   for i = 1,tensor_dim do
+      if tensor_size[i] == 1 then
+         tensor_size[i] = size[i]
+         tensor_stride[i] = 0
+      elseif tensor_size[i] ~= size[i] then
+         error('incorrect size: only supporting singleton expansion (size=1)')
+      end
+   end
+
+   -- create new view, with singleton expansion:
+   result:set(tensor:storage(), tensor:storageOffset(),
+                         tensor_size, tensor_stride)
+   return result
+end
+torch.expand = Tensor.expand
+
+-- Expand to the size of `template` (see Tensor.expand for the semantics).
+function Tensor.expandAs(result,tensor,template)
+   if template then
+      return result:expand(tensor,template:size())
+   end
+   return result:expand(tensor:size())
+end
+torch.expandAs = Tensor.expandAs
+
+-- Tile `tensor` the given number of times along each dimension, copying
+-- the data into `result`.  Callable as result:repeatTensor(tensor, n...)
+-- or tensor:repeatTensor(n...); n... may also be a torch.LongStorage.
+function Tensor.repeatTensor(result,tensor,...)
+   -- get sizes
+   local sizes = {...}
+
+   local t = torch.type(tensor)
+   -- no result tensor given: shift arguments and allocate one
+   if (t == 'number' or t == 'torch.LongStorage') then
+      table.insert(sizes,1,tensor)
+      tensor = result
+      result = tensor.new()
+   end
+   -- if not contiguous, then force the tensor to be contiguous
+   if not tensor:isContiguous() then tensor = tensor:clone() end
+
+   -- check type
+   local size
+   if torch.type(sizes[1])=='torch.LongStorage' then
+      size = sizes[1]
+   else
+      size = torch.LongStorage(#sizes)
+      for i,s in ipairs(sizes) do
+         size[i] = s
+      end
+   end
+   if size:size() < tensor:dim() then
+      error('Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor')
+   end
+   -- pad the tensor's shape with leading singleton dimensions, then
+   -- compute the result shape as elementwise product of shape and repeats
+   local xtensor = tensor.new():set(tensor)
+   local xsize = xtensor:size():totable()
+   for i=1,size:size()-tensor:dim() do
+      table.insert(xsize,1,1)
+   end
+   size = torch.DoubleTensor(xsize):cmul(torch.DoubleTensor(size:totable())):long():storage()
+   xtensor:resize(torch.LongStorage(xsize))
+   result:resize(size)
+   -- unfold the result into tiles of the source's shape and broadcast-copy
+   local urtensor = result.new(result)
+   for i=1,xtensor:dim() do
+      urtensor = urtensor:unfold(i,xtensor:size(i),xtensor:size(i))
+   end
+   for i=1,urtensor:dim()-xtensor:dim() do
+      table.insert(xsize,1,1)
+   end
+   xtensor:resize(torch.LongStorage(xsize))
+   local xxtensor = xtensor:expandAs(urtensor)
+   urtensor:copy(xxtensor)
+   return result
+end
+torch.repeatTensor = Tensor.repeatTensor
+
+--- One of the size elements can be -1;
+--- a new LongStorage is then returned.
+--- The length of the unspecified dimension
+--- is inferred from the number of remaining elements.
+local function specifyFully(size, nElements)
+    local nCoveredElements = 1
+    local remainingDim = nil
+    local sizes = size:totable()
+    for i = 1, #sizes do
+        local wantedDimSize = sizes[i]
+        if wantedDimSize == -1 then
+            if remainingDim then
+                error("Only one of torch.view dimensions can be -1.")
+            end
+            remainingDim = i
+        else
+            nCoveredElements = nCoveredElements * wantedDimSize
+        end
+    end
+
+    -- no -1 present: the size is already fully specified
+    if not remainingDim then
+        return size
+    end
+
+    assert(nElements % nCoveredElements == 0, "The number of covered elements is not a multiple of all elements.")
+    local copy = torch.LongStorage(sizes)
+    copy[remainingDim] = nElements / nCoveredElements
+    return copy
+end
+
+-- TODO : This should be implemented in TH and and wrapped.
+-- Return a view of `src` with the given sizes (numbers or a LongStorage;
+-- one size may be -1, see specifyFully).  The tensor must be contiguous
+-- and the new shape must preserve the number of elements.  Several call
+-- signatures are accepted; see the error message below for the full list.
+function Tensor.view(result, src, ...)
+   local size = ...
+   local view, tensor
+   local function istensor(tensor)
+      return torch.typename(tensor) and torch.typename(tensor):find('torch.*Tensor')
+   end
+   local function isstorage(storage)
+      return torch.typename(storage) and torch.typename(storage) == 'torch.LongStorage'
+   end
+   -- dispatch on the calling convention to find (view, tensor, size)
+   if istensor(result) and istensor(src) and type(size) == 'number' then
+      size = torch.LongStorage{...}
+      view = result
+      tensor = src
+   elseif istensor(result) and istensor(src) and isstorage(size) then
+      size = size
+      view = result
+      tensor = src
+   elseif istensor(result) and isstorage(src) and size == nil then
+      size = src
+      tensor = result
+      view = tensor.new()
+   elseif istensor(result) and type(src) == 'number' then
+      size = {...}
+      table.insert(size,1,src)
+      size = torch.LongStorage(size)
+      tensor = result
+      view = tensor.new()
+   else
+      local t1 = 'torch.Tensor, torch.Tensor, number [, number ]*'
+      local t2 = 'torch.Tensor, torch.Tensor, torch.LongStorage'
+      local t3 = 'torch.Tensor, torch.LongStorage'
+      local t4 = 'torch.Tensor, number [, number ]*'
+      error(string.format('torch.view, expected (%s) or\n (%s) or\n (%s)\n or (%s)', t1, t2, t3, t4))
+   end
+   local origNElement = tensor:nElement()
+   size = specifyFully(size, origNElement)
+
+   assert(tensor:isContiguous(), "expecting a contiguous tensor")
+   view:set(tensor:storage(), tensor:storageOffset(), size)
+   if view:nElement() ~= origNElement then
+      local inputSize = table.concat(tensor:size():totable(), "x")
+      local outputSize = table.concat(size:totable(), "x")
+      error(string.format("Wrong size for view. Input size: %s. Output size: %s",
+      inputSize, outputSize))
+   end
+   return view
+end
+torch.view = Tensor.view
+
+--- View a tensor with the same shape as `template`.
+--- Two-argument form viewAs(tensor, template) allocates the result.
+function Tensor.viewAs(result, src, template)
+   if template == nil then
+      -- viewAs(tensor, template): shift arguments and allocate the result
+      template = src
+      src = result
+      result = src.new()
+      return result:view(src, template:size())
+   end
+   if torch.typename(template) then
+      return result:view(src, template:size())
+   end
+   local t1 = 'torch.Tensor, torch.Tensor, torch.LongStorage'
+   local t2 = 'torch.Tensor, torch.LongStorage'
+   error(string.format('expecting (%s) or (%s)', t1, t2))
+end
+torch.viewAs = Tensor.viewAs
+
+--- Split `tensor` along dimension `dim` (default 1) into narrowed views of
+--- at most `splitSize` elements each, collected into `result` (a table,
+--- emptied first) or a fresh table when none is given.
+function Tensor.split(result, tensor, splitSize, dim)
+   if torch.type(result) ~= 'table' then
+      -- called without an explicit result table: shift the arguments
+      result, tensor, splitSize, dim = {}, result, tensor, splitSize
+   else
+      -- reuse the caller's table, but clear any previous contents
+      for key in pairs(result) do
+         result[key] = nil
+      end
+   end
+   dim = dim or 1
+   local dimSize = tensor:size(dim)
+   local offset = 1
+   while offset <= dimSize do
+      local pieceSize = math.min(splitSize, dimSize - offset + 1)
+      result[#result + 1] = tensor:narrow(dim, offset, pieceSize)
+      offset = offset + pieceSize
+   end
+   return result
+end
+torch.split = Tensor.split
+
+--- Split `tensor` into `nChunk` roughly equal pieces along `dim`
+--- (default 1); delegates to torch.split with a computed chunk size.
+function Tensor.chunk(result, tensor, nChunk, dim)
+   if torch.type(result) ~= 'table' then
+      -- no result table supplied: shift the arguments
+      result, tensor, nChunk, dim = {}, result, tensor, nChunk
+   end
+   dim = dim or 1
+   local pieceSize = math.ceil(tensor:size(dim) / nChunk)
+   return torch.split(result, tensor, pieceSize, dim)
+end
+torch.chunk = Tensor.chunk
+
+--- Convert a tensor to a (possibly nested) Lua table of numbers.
+--- A 0-dimensional tensor yields an empty table.
+function Tensor.totable(tensor)
+  local out = {}
+  local nDim = tensor:dim()
+  if nDim == 1 then
+    -- 1-d: collect the scalar entries directly
+    tensor:apply(function(v) out[#out + 1] = v end)
+  elseif nDim > 0 then
+    -- recurse over slices along the first dimension
+    for k = 1, tensor:size(1) do
+      out[#out + 1] = tensor[k]:totable()
+    end
+  end
+  return out
+end
+torch.totable = Tensor.totable
+
+-- Return `tensor` with its dimensions permuted according to the given
+-- indices (one per dimension). Implemented as a sequence of transposes
+-- that follows each permutation cycle; entries of `perm` are zeroed as
+-- they are consumed (visited marker) and restored when a cycle closes.
+function Tensor.permute(tensor, ...)
+  local perm = {...}
+  local nDims = tensor:dim()
+  assert(#perm == nDims, 'Invalid permutation')
+  local j
+  for i, p in ipairs(perm) do
+    if p ~= i and p ~= 0 then -- 0 marks an already-visited position
+      j = i
+      repeat
+        assert(0 < perm[j] and perm[j] <= nDims, 'Invalid permutation')
+        tensor = tensor:transpose(j, perm[j])
+        j, perm[j] = perm[j], 0
+      until perm[j] == i
+      perm[j] = j
+    end
+  end
+  return tensor
+end
+torch.permute = Tensor.permute
+
+-- Install every helper from the local `Tensor` table onto the metatable
+-- of each concrete tensor type listed in `types`.
+for _, typename in ipairs(types) do
+   local mt = torch.getmetatable('torch.' .. typename .. 'Tensor')
+   for methodName, impl in pairs(Tensor) do
+      rawset(mt, methodName, impl)
+   end
+end
diff --git a/TensorMath.lua b/TensorMath.lua
new file mode 100644
index 0000000..5a37b12
--- /dev/null
+++ b/TensorMath.lua
@@ -0,0 +1,1443 @@
+local wrap = require 'cwrap'
+
+require 'torchcwrap'
+
+-- Two code generators: `interface` emits the free functions registered in
+-- the `torch` table, `method` emits the per-tensor metatable methods.
+local interface = wrap.CInterface.new()
+local method = wrap.CInterface.new()
+
+-- Common includes for the generated C source.
+interface:print([[
+#include "TH.h"
+#include "THMath.h"
+#include "luaT.h"
+#include "utils.h"
+]])
+
+-- specific to torch: we generate a 'dispatch' function
+-- first we create a helper function
+-- note that it let the "torch" table on the stack
+-- (On success the emitted helper leaves the metatable and its "torch"
+-- table on the Lua stack; on failure it pops both and returns NULL.
+-- The trailing unreachable `return NULL;` after the if/else — both
+-- branches of which return — has been dropped from the generated C.)
+interface:print([[
+static const void* torch_istensortype(lua_State *L, const char *tname)
+{
+  if(!tname)
+    return NULL;
+
+  if(!luaT_pushmetatable(L, tname))
+    return NULL;
+
+  lua_pushstring(L, "torch");
+  lua_rawget(L, -2);
+  if(lua_istable(L, -1))
+    return tname;
+  else
+  {
+    lua_pop(L, 2);
+    return NULL;
+  }
+}
+]])
+
+-- Emitted helper: returns 1 iff the value at `idx` is a table whose first
+-- array slot is non-nil, i.e. a non-empty array-style table.
+interface:print([[
+static int torch_isnonemptytable(lua_State *L, int idx)
+{
+  int empty;
+  if (!lua_istable(L, idx)) return 0;
+
+  lua_rawgeti(L, idx, 1);
+  empty = lua_isnil(L, -1);
+  lua_pop(L, 1);
+  return !empty;
+}
+]])
+
+
+-- Emitted helper: if the value at `idx` is a non-empty table whose first
+-- element is a tensor, return that tensor's type name; otherwise NULL/0.
+-- The probed element is removed from the stack before returning.
+interface:print([[
+static const void* torch_istensorarray(lua_State *L, int idx)
+{
+  const char* tname;
+  int tensor_idx;
+  if (!torch_isnonemptytable(L, idx)) return 0;
+
+  lua_checkstack(L, 3);
+  lua_rawgeti(L, idx, 1);
+  tensor_idx = lua_gettop(L);  
+  tname = (torch_istensortype(L, luaT_typename(L, -1)));
+  lua_remove(L, tensor_idx);
+  return tname;
+}
+]])
+
+interface.dispatchregistry = {}
+-- Wrap `name` as usual, and additionally (once per name) emit a torch_NAME
+-- dispatcher in C. The dispatcher locates the proper tensor metatable from
+-- the call arguments (a tensor in position 1 or 2, a tensor array, or a
+-- trailing type-name string, falling back to the default tensor type) and
+-- forwards the call to that type's implementation of NAME.
+function interface:wrap(name, ...)
+   -- usual stuff
+   wrap.CInterface.wrap(self, name, ...)
+
+   -- dispatch function
+   if not interface.dispatchregistry[name] then
+      interface.dispatchregistry[name] = true
+      table.insert(interface.dispatchregistry, {name=name, wrapname=string.format("torch_%s", name)})
+
+      interface:print(string.gsub([[
+static int torch_NAME(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  const void *tname;
+  if(narg >= 1 && (tname = torch_istensortype(L, luaT_typename(L, 1)))) /* first argument is tensor? */
+  {
+  }
+  else if(narg >= 2 && (tname = torch_istensortype(L, luaT_typename(L, 2)))) /* second? */
+  {
+  }
+  else if(narg >= 1 && (tname = torch_istensorarray(L, 1))) /* torch table argument? */
+  {
+  }
+  else if(narg >= 1 && lua_type(L, narg) == LUA_TSTRING
+	  && (tname = torch_istensortype(L, lua_tostring(L, narg)))) /* do we have a valid tensor type string then? */
+  {
+    lua_remove(L, -2);
+  }
+  else if(!(tname = torch_istensortype(L, torch_getdefaulttensortype(L))))
+    luaL_error(L, "internal error: the default tensor type does not seem to be an actual tensor");
+
+  lua_pushstring(L, "NAME");
+  lua_rawget(L, -2);
+  if(lua_isfunction(L, -1))
+  {
+    lua_insert(L, 1);
+    lua_pop(L, 2); /* the two tables we put on the stack above */
+    lua_call(L, lua_gettop(L)-1, LUA_MULTRET);
+  }
+  else
+    return luaL_error(L, "%s does not implement the torch.NAME() function", tname);
+
+  return lua_gettop(L);
+}
+]], 'NAME', name))
+  end
+end
+
+-- Emit a luaL_Reg array named `name` that lists every dispatcher
+-- registered so far, then reset the registry for the next batch.
+function interface:dispatchregister(name)
+   local out = self.txt
+   out[#out + 1] = string.format('static const struct luaL_Reg %s [] = {', name)
+   for _, entry in ipairs(self.dispatchregistry) do
+      out[#out + 1] = string.format('{"%s", %s},', entry.name, entry.wrapname)
+   end
+   out[#out + 1] = '{NULL, NULL}'
+   out[#out + 1] = '};'
+   out[#out + 1] = ''
+   self.dispatchregistry = {}
+end
+
+-- Banner for the generated C file.
+interface:print('/* WARNING: autogenerated file */')
+interface:print('')
+
+-- Wrap `name` for both generators. The interface variant receives the
+-- specs as-is; for the method variant each argument spec may carry a
+-- `method` table whose entries override fields (the string 'nil' is a
+-- sentinel meaning "erase this field").
+local function wrap(...)
+   local args = {...}
+
+   -- interface
+   interface:wrap(...)
+
+   -- method: apply the per-argument `method` overrides in place
+   for _, spec in ipairs(args) do
+      if type(spec) == 'table' then -- a list of argument descriptors
+         for _, argdef in ipairs(spec) do
+            local overrides = argdef.method
+            if overrides then
+               for field, value in pairs(overrides) do
+                  if value == 'nil' then
+                     argdef[field] = nil
+                  else
+                     argdef[field] = value
+                  end
+               end
+            end
+         end
+      end
+   end
+   local unpack = unpack or table.unpack
+   method:wrap(unpack(args))
+end
+
+-- C scalar type backing each tensor type.
+local reals = {ByteTensor='unsigned char',
+               CharTensor='char',
+               ShortTensor='short',
+               IntTensor='int',
+               LongTensor='long',
+               FloatTensor='float',
+               DoubleTensor='double'}
+
+-- Wider accumulator type used by reductions (sum, prod, dot, ...).
+local accreals = {ByteTensor='long',
+               CharTensor='long',
+               ShortTensor='long',
+               IntTensor='long',
+               LongTensor='long',
+               FloatTensor='double',
+               DoubleTensor='double'}
+
+for _,Tensor in ipairs({"ByteTensor", "CharTensor",
+                        "ShortTensor", "IntTensor", "LongTensor",
+                        "FloatTensor", "DoubleTensor"}) do
+
+   local real = reals[Tensor]
+   local accreal = accreals[Tensor]
+
+   -- Name mangling for the generated wrapper functions of this type.
+   function interface.luaname2wrapname(self, name)
+      return string.format('torch_%s_%s', Tensor, name)
+   end
+
+   function method.luaname2wrapname(self, name)
+      return string.format('m_torch_%s_%s', Tensor, name)
+   end
+
+   -- TH C function name for the current tensor type.
+   local function cname(name)
+      return string.format('TH%s_%s', Tensor, name)
+   end
+
+   -- Default-argument helpers: number of dimensions of the argn-th
+   -- argument (a tensor, or the first element of a tensor array),
+   -- expanded into the generated C at wrap time.
+   local function lastdim(argn)
+      return function(arg)
+                return string.format("TH%s_nDimension(%s)", Tensor, arg.args[argn]:carg())
+             end
+   end
+
+   local function lastdimarray(argn)
+      return function(arg)
+                return string.format("TH%s_nDimension(arg%d_data[0])", Tensor, arg.args[argn].i)
+             end
+   end
+
+   wrap("zero",
+        cname("zero"),
+        {{name=Tensor, returned=true}})
+
+   wrap("fill",
+        cname("fill"),
+        {{name=Tensor, returned=true},
+         {name=real}})
+
+   wrap("zeros",
+        cname("zeros"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name="LongArg"}})
+
+   wrap("ones",
+        cname("ones"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name="LongArg"}})
+
+   wrap("reshape",
+        cname("reshape"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="LongArg"}})
+
+   wrap("gather",
+        cname("gather"),
+        {{name=Tensor, default=true, returned=true,
+          init=function(arg)
+                  return table.concat(
+                     {
+                        arg.__metatable.init(arg),
+                        string.format("THLongStorage* %s_size = THLongTensor_newSizeOf(%s);", arg:carg(), arg.args[4]:carg()),
+                        string.format("TH%s_resize(%s, %s_size, NULL);", Tensor, arg:carg(), arg:carg()),
+                        string.format("THLongStorage_free(%s_size);", arg:carg())
+                     }, '\n')
+               end
+         },
+         {name=Tensor},
+         {name="index"},
+         {name="IndexTensor", noreadadd=true}})
+
+   wrap("scatter",
+        cname("scatter"),
+        {{name=Tensor, returned=true},
+         {name="index"},
+         {name="IndexTensor", noreadadd=true},
+         {name=Tensor}},
+        cname("scatterFill"),
+        {{name=Tensor, returned=true},
+         {name="index"},
+         {name="IndexTensor", noreadadd=true},
+         {name=real}})
+
+   wrap("dot",
+        cname("dot"),
+        {{name=Tensor},
+         {name=Tensor},
+         {name=accreal, creturned=true}})
+
+   wrap("equal",
+        cname("equal"),
+        {{name=Tensor},
+         {name=Tensor},
+         {name="boolean", creturned=true}})
+
+   wrap("add",
+        cname("add"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}},
+        cname("cadd"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real, default=1},
+         {name=Tensor}})
+
+   wrap("csub",
+     cname("sub"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+       {name=Tensor, method={default=1}},
+       {name=real}},
+     cname("csub"),
+     {{name=Tensor, default=true, returned=true, method={default='nil'}},
+       {name=Tensor, method={default=1}},
+       {name=real, default=1},
+       {name=Tensor}})
+
+   wrap("mul",
+        cname("mul"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("div",
+        cname("div"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("fmod",
+        cname("fmod"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("remainder",
+        cname("remainder"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+ 
+   -- mod alias
+   wrap("mod",
+        cname("fmod"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real}})
+
+   wrap("clamp",
+        cname("clamp"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real},
+         {name=real}})
+
+
+   wrap("match",
+        cname("match"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor},
+         {name=Tensor},
+         {name=real, default=1}
+        })
+
+   wrap("cmul",
+        cname("cmul"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cpow",
+        cname("cpow"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cdiv",
+        cname("cdiv"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cfmod",
+        cname("cfmod"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("cremainder",
+        cname("cremainder"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   -- cmod alias
+   wrap("cmod",
+        cname("cfmod"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=Tensor}})
+
+   wrap("addcmul",
+        cname("addcmul"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real, default=1},
+         {name=Tensor},
+         {name=Tensor}})
+
+   wrap("addcdiv",
+        cname("addcdiv"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}},
+         {name=real, default=1},
+         {name=Tensor},
+         {name=Tensor}})
+
+   wrap("mv",
+        cname("addmv"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+                  return table.concat(
+                     {
+                        arg.__metatable.init(arg),
+                        string.format("TH%s_resize1d(%s, %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg())
+                     }, '\n')
+               end,
+          precall=function(arg)
+                  return table.concat(
+                     {
+                        string.format("TH%s_zero(%s);", Tensor, arg:carg()),
+                        arg.__metatable.precall(arg)
+                     }, '\n')
+               end,
+       },
+         {name=real, default=0, invisible=true},
+         {name=Tensor, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=2},
+         {name=Tensor, dim=1}}
+     )
+
+   wrap("mm",
+        cname("addmm"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+                  return table.concat(
+                     {
+                        arg.__metatable.init(arg),
+                        string.format("TH%s_resize2d(%s, %s->size[0], %s->size[1]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                     }, '\n')
+               end,
+          precall=function(arg)
+                  return table.concat(
+                     {
+                        string.format("TH%s_zero(%s);", Tensor, arg:carg()),
+                        arg.__metatable.precall(arg)
+                     }, '\n')
+               end,
+       },
+         {name=real, default=0, invisible=true},
+         {name=Tensor, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=2},
+         {name=Tensor, dim=2}}
+     )
+
+   wrap("bmm",
+        cname("baddbmm"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+                  return table.concat(
+                     {
+                        arg.__metatable.init(arg),
+                        string.format("TH%s_resize3d(%s, %s->size[0], %s->size[1], %s->size[2]);",
+                                      Tensor, arg:carg(), arg.args[5]:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                     }, '\n')
+               end,
+          precall=function(arg)
+                  return table.concat(
+                     {
+                        string.format("TH%s_zero(%s);", Tensor, arg:carg()),
+                        arg.__metatable.precall(arg)
+                     }, '\n')
+               end,
+       },
+         {name=real, default=0, invisible=true},
+         {name=Tensor, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=3}}
+     )
+
+   wrap("ger",
+        cname("addr"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          init=function(arg)
+                  return table.concat(
+                     {
+                        arg.__metatable.init(arg),
+                        string.format("TH%s_resize2d(%s, %s->size[0], %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+                     }, '\n')
+               end,
+          precall=function(arg)
+                     return table.concat(
+                        {
+                           string.format("TH%s_zero(%s);", Tensor, arg:carg()),
+                           arg.__metatable.precall(arg)
+                        }, '\n')
+                  end
+       },
+        {name=real, default=1, invisible=true},
+        {name=Tensor, default=1, invisible=true},
+        {name=real, default=1, invisible=true},
+        {name=Tensor, dim=1},
+        {name=Tensor, dim=1}}
+     )
+
+   for _,f in ipairs({
+                        {name="addmv",   dim1=1, dim2=2, dim3=1},
+                        {name="addmm",   dim1=2, dim2=2, dim3=2},
+                        {name="addr",    dim1=2, dim2=1, dim3=1},
+                        {name="addbmm",  dim1=2, dim2=3, dim3=3},
+                        {name="baddbmm", dim1=3, dim2=3, dim3=3},
+                     }
+                  ) do
+
+      interface:wrap(f.name,
+                     cname(f.name),
+                     {{name=Tensor, default=true, returned=true},
+                      {name=real, default=1},
+                      {name=Tensor, dim=f.dim1},
+                      {name=real, default=1},
+                      {name=Tensor, dim=f.dim2},
+                      {name=Tensor, dim=f.dim3}})
+
+      -- there is an ambiguity here, hence the more complicated setup
+      method:wrap(f.name,
+                  cname(f.name),
+                  {{name=Tensor, returned=true, dim=f.dim1},
+                   {name=real, default=1, invisible=true},
+                   {name=Tensor, default=1, dim=f.dim1},
+                   {name=real, default=1},
+                   {name=Tensor, dim=f.dim2},
+                   {name=Tensor, dim=f.dim3}},
+                  cname(f.name),
+                  {{name=Tensor, returned=true, dim=f.dim1},
+                   {name=real},
+                   {name=Tensor, default=1, dim=f.dim1},
+                   {name=real},
+                   {name=Tensor, dim=f.dim2},
+                   {name=Tensor, dim=f.dim3}})
+   end
+
+   wrap("numel",
+        cname("numel"),
+        {{name=Tensor},
+         {name="long", creturned=true}})
+
+   for _,name in ipairs({"cumsum", "cumprod"}) do
+      wrap(name,
+           cname(name),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="index", default=1}})
+   end
+
+   wrap("sum",
+        cname("sumall"),
+        {{name=Tensor},
+         {name=accreal, creturned=true}},
+        cname("sum"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="index"}})
+
+   wrap("prod",
+        cname("prodall"),
+        {{name=Tensor},
+         {name=accreal, creturned=true}},
+        cname("prod"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="index"}})
+
+   for _,name in ipairs({"min", "max"}) do
+      wrap(name,
+           cname(name .. "all"),
+           {{name=Tensor},
+            {name=real, creturned=true}},
+           cname(name),
+           {{name=Tensor, default=true, returned=true},
+            {name="IndexTensor", default=true, returned=true, noreadadd=true},
+            {name=Tensor},
+            {name="index"}})
+   end
+
+   for _,name in ipairs({"cmin", "cmax"}) do
+      wrap(name,
+           cname(name),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor, method={default=1}},
+            {name=Tensor}},
+           cname(name .. "Value"),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor, method={default=1}},
+            {name=real}})
+   end
+
+   wrap("trace",
+        cname("trace"),
+        {{name=Tensor},
+         {name=accreal, creturned=true}})
+
+   wrap("cross",
+        cname("cross"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name=Tensor},
+         {name="index", default=0}})
+
+   wrap("diag",
+        cname("diag"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="long", default=0}})
+
+   wrap("eye",
+        cname("eye"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name="long"},
+         {name="long", default=0}})
+
+   wrap("range",
+        cname("range"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=accreal},
+         {name=accreal},
+         {name=accreal, default=1}})
+
+   wrap("randperm",
+        cname("randperm"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'},
+          postcall=function(arg)
+                      return table.concat(
+                         {
+                            arg.__metatable.postcall(arg),
+                            string.format("TH%s_add(%s, %s, 1);", Tensor, arg:carg(), arg:carg())
+                         }, '\n')
+                   end},
+         {name="Generator", default=true},
+         {name="long"}})
+
+   wrap("sort",
+        cname("sort"),
+        {{name=Tensor, default=true, returned=true},
+         {name="IndexTensor", default=true, returned=true, noreadadd=true},
+         {name=Tensor},
+         {name="index", default=lastdim(3)},
+         {name="boolean", default=0}})
+
+wrap("topk",
+     cname("topk"),
+     {{name=Tensor, default=true, returned=true},
+        {name="IndexTensor", default=true, returned=true, noreadadd=true},
+        {name=Tensor},
+        {name="long", default=1},
+        {name="index", default=lastdim(3)},
+        {name="boolean", default=0},
+        {name="boolean", default=0}})
+
+   wrap("kthvalue",
+        cname("kthvalue"),
+        {{name=Tensor, default=true, returned=true},
+         {name="IndexTensor", default=true, returned=true, noreadadd=true},
+         {name=Tensor},
+         {name="index"},
+         {name="index", default=lastdim(3)}})
+
+   wrap("mode",
+       cname("mode"),
+       {{name=Tensor, default=true, returned=true},
+           {name="IndexTensor", default=true, returned=true, noreadadd=true},
+           {name=Tensor},
+           {name="index", default=lastdim(3)}})
+
+   wrap("median",
+        cname("median"),
+        {{name=Tensor, default=true, returned=true},
+         {name="IndexTensor", default=true, returned=true, noreadadd=true},
+         {name=Tensor},
+         {name="index", default=lastdim(3)}})
+
+   wrap("tril",
+        cname("tril"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="int", default=0}})
+
+   wrap("triu",
+        cname("triu"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name="int", default=0}})
+
+   wrap("cat",
+        cname("cat"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor},
+         {name=Tensor},
+         {name="index", default=lastdim(2)}},
+        cname("catArray"),
+        {{name=Tensor, default=true, returned=true},
+         {name=Tensor .. "Array"},
+         {name="index", default=lastdimarray(2)}})
+
+   if Tensor == 'ByteTensor' then -- we declare this only once
+      -- Emitted helpers for ranged random numbers.
+      -- NOTE(review): `THRandom_random(gen) % range` has modulo bias for
+      -- ranges that do not divide the generator's output range; upstream
+      -- appears to accept this — confirm before relying on uniformity.
+      interface:print(
+         [[
+static long THRandom_random2__(THGenerator *gen, long a, long b)
+{
+  THArgCheck(b >= a, 2, "upper bound must be larger than lower bound");
+  return((THRandom_random(gen) % (b+1-a)) + a);
+}
+
+static long THRandom_random1__(THGenerator *gen, long b)
+{
+  THArgCheck(b > 0, 1, "upper bound must be strictly positive");
+  return(THRandom_random(gen) % b + 1);
+}
+         ]])
+   end
+
+   -- Per-type emitted helpers filling a whole tensor with ranged random
+   -- values ('Tensor' and 'real' are substituted per tensor type below).
+   -- NOTE(review): same modulo-bias caveat as the scalar helpers above.
+   interface:print(string.gsub(
+                      [[
+static void THTensor_random2__(THTensor *self, THGenerator *gen, long a, long b)
+{
+  THArgCheck(b >= a, 2, "upper bound must be larger than lower bound");
+  TH_TENSOR_APPLY(real, self, *self_data = ((THRandom_random(gen) % (b+1-a)) + a);)
+}
+
+static void THTensor_random1__(THTensor *self, THGenerator *gen, long b)
+{
+  THArgCheck(b > 0, 1, "upper bound must be strictly positive");
+  TH_TENSOR_APPLY(real, self, *self_data = (THRandom_random(gen) % b + 1);)
+}
+]], 'Tensor', Tensor):gsub('real', real))
+
+   wrap('random',
+        'THRandom_random2__',
+        {{name='Generator', default=true},
+         {name='long'},
+         {name='long'},
+         {name='long', creturned=true}},
+        'THRandom_random1__',
+        {{name='Generator', default=true},
+         {name='long'},
+         {name='long', creturned=true}},
+        'THRandom_random',
+        {{name='Generator', default=true},
+         {name='long', creturned=true}},
+        cname("random2__"),
+        {{name=Tensor, returned=true},
+         {name='Generator', default=true},
+         {name='long'},
+         {name='long'}},
+        cname("random1__"),
+        {{name=Tensor, returned=true},
+         {name='Generator', default=true},
+         {name='long'}},
+        cname("random"),
+        {{name=Tensor, returned=true},
+         {name='Generator', default=true}})
+
+   wrap("geometric",
+     "THRandom_geometric",
+     {{name="Generator", default=true},
+      {name="double"},
+      {name="double", creturned=true}},
+     cname("geometric"),
+     {{name=Tensor, returned=true},
+      {name="Generator", default=true},
+      {name="double"}})
+
+   wrap("bernoulli",
+      "THRandom_bernoulli",
+      {{name="Generator", default=true},
+       {name="double", default=0.5},
+       {name="double", creturned=true}},
+      cname("bernoulli"),
+      {{name=Tensor, returned=true},
+       {name="Generator", default=true},
+       {name="double", default=0.5}},
+      cname("bernoulli_FloatTensor"),
+      {{name=Tensor, returned=true},
+       {name="Generator", default=true},
+       {name="FloatTensor"}},
+      cname("bernoulli_DoubleTensor"),
+      {{name=Tensor, returned=true},
+       {name="Generator", default=true},
+       {name="DoubleTensor"}})
+
+   wrap("squeeze",
+        cname("squeeze"),
+        {{name=Tensor, default=true, returned=true, postcall=function(arg)
+                                                                local txt = {}
+                                                                if arg.returned then
+                                                                   table.insert(txt, string.format('if(arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+                                                                   table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(*TH%s_data(arg%d)));', Tensor, arg.i))
+                                                                end
+                                                                return table.concat(txt, '\n')
+                                                             end},
+         {name=Tensor}},
+        cname("squeeze1d"),
+        {{name=Tensor, default=true, returned=true,
+
+          postcall=
+             function(arg)
+                local txt = {}
+                if arg.returned then
+                   table.insert(txt, string.format('if(!hasdims && arg%d->nDimension == 1 && arg%d->size[0] == 1)', arg.i, arg.i)) -- number
+                   table.insert(txt, string.format('lua_pushnumber(L, (lua_Number)(*TH%s_data(arg%d)));}', Tensor, arg.i))
+                end
+                return table.concat(txt, '\n')
+             end},
+
+         {name=Tensor,
+
+          precall=
+             function(arg)
+                return string.format('{int hasdims = arg%d->nDimension > 1;', arg.i)
+             end},
+
+         {name="index"}})
+
+   wrap("sign",
+        cname("sign"),
+        {{name=Tensor, default=true, returned=true, method={default='nil'}},
+         {name=Tensor, method={default=1}}})
+
+   wrap("conv2",
+        cname("conv2Dmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=2},
+         {name=Tensor, dim=2},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}},
+        cname("conv2Dcmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=3},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}},
+        cname("conv2Dmv"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=4},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}}
+     )
+
+   wrap("xcorr2",
+        cname("conv2Dmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=2},
+         {name=Tensor, dim=2},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}},
+        cname("conv2Dcmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=3},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}},
+        cname("conv2Dmv"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=4},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}}
+     )
+
+   wrap("conv3",
+        cname("conv3Dmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=3},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}},
+        cname("conv3Dcmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=4},
+         {name=Tensor, dim=4},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}},
+        cname("conv3Dmv"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=4},
+         {name=Tensor, dim=5},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="C", invisible=true}}
+     )
+
+   wrap("xcorr3",
+        cname("conv3Dmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=3},
+         {name=Tensor, dim=3},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}},
+        cname("conv3Dcmul"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=4},
+         {name=Tensor, dim=4},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}},
+        cname("conv3Dmv"),
+        {{name=Tensor, default=true, returned=true},
+         {name=real, default=0, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=Tensor, dim=4},
+         {name=Tensor, dim=5},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name=real, default=1, invisible=true},
+         {name='charoption', values={'V', 'F'}, default='V'},
+         {name='charoption', default="X", invisible=true}}
+     )
+
+   for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do
+      wrap(name,
+           cname(name .. 'Value'),
+           {{name='ByteTensor',default=true, returned=true},
+            {name=Tensor},
+            {name=real}},
+           cname(name .. 'ValueT'),
+           {{name=Tensor, returned=true},
+            {name=Tensor},
+            {name=real}},
+           cname(name .. 'Tensor'),
+           {{name='ByteTensor',default=true, returned=true},
+            {name=Tensor},
+            {name=Tensor}},
+           cname(name .. 'TensorT'),
+           {{name=Tensor, returned=true},
+            {name=Tensor},
+            {name=Tensor}})
+   end
+
+   wrap("nonzero",
+        cname("nonzero"),
+        {{name="IndexTensor", default=true, returned=true},
+         {name=Tensor}})
+
+   if Tensor == 'ByteTensor' then
+     -- Logical accumulators only apply to ByteTensor
+      for _,name in ipairs({'all', 'any'}) do
+        wrap(name,
+             cname('logical' .. name),
+             {{name=Tensor},
+		{name="boolean", creturned=true}})
+      end
+   end
+
+   if Tensor == 'IntTensor' then
+         wrap("abs",
+              cname("abs"),
+              {{name=Tensor, default=true, returned=true, method={default='nil'}},
+               {name=Tensor, method={default=1}}},
+              "abs",
+              {{name=real},
+               {name=real, creturned=true}})
+   elseif Tensor == 'LongTensor' then
+         wrap("abs",
+              cname("abs"),
+              {{name=Tensor, default=true, returned=true, method={default='nil'}},
+               {name=Tensor, method={default=1}}},
+              "labs",
+              {{name=real},
+               {name=real, creturned=true}})
+   end
+
+   if Tensor == 'FloatTensor' or Tensor == 'DoubleTensor' then
+
+      wrap("mean",
+           cname("meanall"),
+           {{name=Tensor},
+            {name=accreal, creturned=true}},
+           cname("mean"),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="index"}})
+
+      for _,name in ipairs({"var", "std"}) do
+         wrap(name,
+              cname(name .. "all"),
+              {{name=Tensor},
+               {name=accreal, creturned=true}},
+              cname(name),
+              {{name=Tensor, default=true, returned=true},
+               {name=Tensor},
+               {name="index"},
+               {name="boolean", default=false}})
+      end
+      wrap("histc",
+           cname("histc"),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name="long",default=100},
+            {name="double",default=0},
+            {name="double",default=0}})
+
+      wrap("norm",
+           cname("normall"),
+           {{name=Tensor},
+            {name=real, default=2},
+            {name=accreal, creturned=true}},
+           cname("norm"),
+           {{name=Tensor, default=true, returned=true},
+            {name=Tensor},
+            {name=real},
+            {name="index"}})
+
+      wrap("renorm",
+           cname("renorm"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real},
+            {name="index"},
+            {name=real}})
+
+      wrap("dist",
+           cname("dist"),
+           {{name=Tensor},
+            {name=Tensor},
+            {name=real, default=2},
+            {name=accreal, creturned=true}})
+
+      wrap("linspace",
+           cname("linspace"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=real},
+            {name=real},
+            {name="long", default=100}})
+
+      wrap("logspace",
+           cname("logspace"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=real},
+            {name=real},
+            {name="long", default=100}})
+
+      for _,name in ipairs({"log", "log1p", "exp",
+                            "cos", "acos", "cosh",
+                            "sin", "asin", "sinh",
+                            "tan", "atan", "tanh",
+                            "sqrt", "round", "ceil",
+                            "floor", "trunc", }) do
+         wrap(name,
+              cname(name),
+              {{name=Tensor, default=true, returned=true, method={default='nil'}},
+               {name=Tensor, method={default=1}}},
+              name,
+              {{name=real},
+               {name=real, creturned=true}})
+      end
+
+      wrap("abs",
+           cname("abs"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}},
+           "fabs",
+           {{name=real},
+            {name=real, creturned=true}})
+
+      wrap("frac",
+           cname("frac"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}},
+           "TH_frac",
+           {{name=real},
+            {name=real, creturned=true}})
+
+      wrap("rsqrt",
+           cname("rsqrt"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}},
+           "TH_rsqrt",
+           {{name=real},
+            {name=real, creturned=true}})
+
+      wrap("sigmoid",
+           cname("sigmoid"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}},
+           "TH_sigmoid",
+           {{name=real},
+            {name=real, creturned=true}})
+
+      wrap("neg",
+           cname("neg"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}})
+
+      wrap("cinv",
+           cname("cinv"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}}})
+
+      wrap("lerp",
+           cname("lerp"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=Tensor},
+            {name=real}},
+           "TH_lerp",
+           {{name=real},
+            {name=real},
+            {name=real},
+            {name=real, creturned=true}})
+
+      wrap("atan2",
+           cname("atan2"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=Tensor}},
+           "atan2",
+           {{name=real},
+            {name=real},
+            {name=real, creturned=true}})
+
+      wrap("pow",
+           cname("pow"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=Tensor, method={default=1}},
+            {name=real}},
+           cname("tpow"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name=real},
+            {name=Tensor, method={default=1}}},
+           "pow",
+           {{name=real},
+            {name=real},
+            {name=real, creturned=true}})
+
+      wrap("rand",
+           cname("rand"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name='Generator', default=true},
+            {name="LongArg"}})
+
+      wrap("randn",
+           cname("randn"),
+           {{name=Tensor, default=true, returned=true, method={default='nil'}},
+            {name='Generator', default=true},
+            {name="LongArg"}})
+
+      wrap("multinomial",
+           cname("multinomial"),
+           {{name="IndexTensor", default=true, returned=true, method={default='nil'}},
+            {name='Generator', default=true},
+            {name=Tensor},
+            {name="int"},
+            {name="boolean", default=false}})
+
+      for _,f in ipairs({{name='uniform', a=0, b=1},
+                         {name='normal', a=0, b=1},
+                         {name='cauchy', a=0, b=1},
+                         {name='logNormal', a=1, b=2}}) do
+
+         wrap(f.name,
+              string.format("THRandom_%s", f.name),
+              {{name='Generator', default=true},
+               {name="double", default=f.a},
+               {name="double", default=f.b},
+               {name="double", creturned=true}},
+              cname(f.name),
+              {{name=Tensor, returned=true},
+               {name='Generator', default=true},
+               {name=real, default=f.a},
+               {name=real, default=f.b}})
+      end
+
+      for _,f in ipairs({{name='exponential'}}) do
+
+         wrap(f.name,
+              string.format("THRandom_%s", f.name),
+              {{name='Generator', default=true},
+               {name="double", default=f.a},
+               {name="double", creturned=true}},
+              cname(f.name),
+              {{name=Tensor, returned=true},
+               {name='Generator', default=true},
+               {name=real, default=f.a}})
+      end
+
+      for _,name in ipairs({"gesv","gels"}) do
+         interface:wrap(name,
+                        cname(name),
+                        {{name=Tensor, returned=true},
+                         {name=Tensor, returned=true},
+                         {name=Tensor},
+                         {name=Tensor}},
+                        cname(name),
+                        {{name=Tensor, default=true, returned=true, invisible=true},
+                         {name=Tensor, default=true, returned=true, invisible=true},
+                         {name=Tensor},
+                         {name=Tensor}}
+                     )
+      end
+      interface:wrap("trtrs",
+                     cname("trtrs"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'},  -- uplo
+                      {name='charoption', values={'N', 'T'}, default='N'},  -- trans
+                      {name='charoption', values={'N', 'U'}, default='N'}}, -- diag
+                     cname("trtrs"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'},  -- uplo
+                      {name='charoption', values={'N', 'T'}, default='N'},  -- trans
+                      {name='charoption', values={'N', 'U'}, default='N'}}  -- diag
+                  )
+
+      interface:wrap("symeig",
+                     cname("syev"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'N', 'V'}, default='N'},
+                      {name='charoption', values={'U', 'L'}, default='U'}},
+                     cname("syev"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'N', 'V'}, default='N'},
+                      {name='charoption', values={'U', 'L'}, default='U'}}
+                  )
+      interface:wrap("eig",
+                     cname("geev"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'N', 'V'}, default='N'}},
+                     cname("geev"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'N', 'V'}, default='N'}}
+                  )
+
+      interface:wrap("svd",
+                     cname("gesvd"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'A', 'S'}, default='S'}},
+                     cname("gesvd"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'A', 'S'}, default='S'}}
+                  )
+      interface:wrap("inverse",
+                     cname("getri"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor}},
+                     cname("getri"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor}}
+                  )
+      interface:wrap("potrf",
+                     cname("potrf"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}}, -- uplo
+                     cname("potrf"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}}
+                  )
+      interface:wrap("potrs",
+                     cname("potrs"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}}, -- uplo
+                     cname("potrs"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}}
+                  )
+      interface:wrap("potri",
+                     cname("potri"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}}, -- uplo
+                     cname("potri"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'}} -- uplo
+                    )
+      interface:wrap("pstrf",
+                     cname("pstrf"),
+                     {{name=Tensor, returned=true},
+                      {name='IntTensor', returned=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'},  -- uplo
+                      {name=real, default=-1}},
+                     cname("pstrf"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name='IntTensor', default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name='charoption', values={'U', 'L'}, default='U'},  -- uplo
+                      {name=real, default=-1}}
+                  )
+      interface:wrap("qr",
+                     cname("qr"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor}},
+                     cname("qr"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor}}
+                  )
+      interface:wrap("geqrf",
+                     cname("geqrf"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor, returned=true},
+                      {name=Tensor}},
+                     cname("geqrf"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor}}
+                  )
+      interface:wrap("orgqr",
+                     cname("orgqr"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name=Tensor}},
+                     cname("orgqr"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name=Tensor}}
+                  )
+      interface:wrap("ormqr",
+                     cname("ormqr"),
+                     {{name=Tensor, returned=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'L', 'R'}, default='L'},
+                      {name='charoption', values={'N', 'T'}, default='N'}},
+                     cname("ormqr"),
+                     {{name=Tensor, default=true, returned=true, invisible=true},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name=Tensor},
+                      {name='charoption', values={'L', 'R'}, default='L'},
+                      {name='charoption', values={'N', 'T'}, default='N'}}
+                  )
+   end
+
+   method:register(string.format("m_torch_%sMath__", Tensor))
+   interface:print(method:tostring())
+   method:clearhistory()
+   interface:register(string.format("torch_%sMath__", Tensor))
+
+   interface:print(string.gsub([[
+static void torch_TensorMath_init(lua_State *L)
+{
+  luaT_pushmetatable(L, "torch.Tensor");
+
+  /* register methods */
+  luaT_setfuncs(L, m_torch_TensorMath__, 0);
+
+  /* register functions into the "torch" field of the tensor metaclass */
+  lua_pushstring(L, "torch");
+  lua_newtable(L);
+  luaT_setfuncs(L, torch_TensorMath__, 0);
+  lua_rawset(L, -3);
+  lua_pop(L, 1);
+}
+]], 'Tensor', Tensor))
+end
+
+interface:dispatchregister("torch_TensorMath__")
+
+interface:print([[
+void torch_TensorMath_init(lua_State *L)
+{
+  torch_ByteTensorMath_init(L);
+  torch_CharTensorMath_init(L);
+  torch_ShortTensorMath_init(L);
+  torch_IntTensorMath_init(L);
+  torch_LongTensorMath_init(L);
+  torch_FloatTensorMath_init(L);
+  torch_DoubleTensorMath_init(L);
+  luaT_setfuncs(L, torch_TensorMath__, 0);
+}
+]])
+
+if arg[1] then
+   interface:tofile(arg[1])
+else
+   print(interface:tostring())
+end
diff --git a/TensorOperator.c b/TensorOperator.c
new file mode 100644
index 0000000..8986ff7
--- /dev/null
+++ b/TensorOperator.c
@@ -0,0 +1,8 @@
+#include "general.h"
+
+#define torch_TensorOperator_(NAME) TH_CONCAT_4(torch_,Real,TensorOperator_,NAME)
+#define torch_Tensor_id TH_CONCAT_3(torch_,Real,Tensor_id)
+#define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor)
+
+#include "generic/TensorOperator.c"
+#include "THGenerateAllTypes.h"
diff --git a/TestSuite.lua b/TestSuite.lua
new file mode 100644
index 0000000..630c2c9
--- /dev/null
+++ b/TestSuite.lua
@@ -0,0 +1,30 @@
+function torch.TestSuite()
+   local obj = {
+      __tests = {},
+      __isTestSuite = true
+   }
+
+   local metatable = {}
+
+   function metatable:__index(key)
+      return self.__tests[key]
+   end
+
+   function metatable:__newindex(key, value)
+      if self.__tests[key] ~= nil then
+         error("Test " .. tostring(key) .. " is already defined.")
+      end
+      if type(value) ~= "function" then
+         if type(value) == "table" then
+            error("Nested tables of tests are not supported")
+         else
+            error("Only functions are supported as members of a TestSuite")
+         end
+      end
+      self.__tests[key] = value
+   end
+
+   setmetatable(obj, metatable)
+
+   return obj
+end
diff --git a/Tester.lua b/Tester.lua
new file mode 100644
index 0000000..a3b3ff3
--- /dev/null
+++ b/Tester.lua
@@ -0,0 +1,878 @@
+
+-- Lua 5.2 compatibility
+local unpack = unpack or table.unpack
+
+local check = {} -- helper functions, defined at the bottom of the file
+
+local Tester = torch.class('torch.Tester')
+
+function Tester:__init()
+   self.errors = {}          -- accumulated failure/error report strings
+   self.tests = {}           -- test name -> test function
+   self.warnings = {}        -- accumulated warning report strings
+   self._warningCount = {}   -- test name -> number of warnings emitted
+   self.disabledTests = {}   -- test name -> true if skipped via :disable
+   self._currentTestName = ''
+
+   -- To maintain backwards compatibility (at least for a short while),
+   -- disable exact dimension checking of tensors when :assertTensorEq is
+   -- called. Thus {{1}} == {1} when this flag is true.
+   --
+   -- Note that other methods that support tensor checking (such as
+   -- :assertGeneralEq) ignore this flag, since previously they didn't
+   -- exist or support tensor equality checks at all, so there is no
+   -- old code that uses these functions and relies on the behaviour.
+   --
+   -- Note also that if the dimension check fails while this flag is true,
+   -- a warning is shown.
+   self._assertTensorEqIgnoresDims = true
+end
+
+-- If true, abort the whole run after the first failing or erroring test.
+function Tester:setEarlyAbort(earlyAbort)
+   self.earlyAbort = earlyAbort
+end
+
+-- If true, assertion failures raise a Lua error at the failure point
+-- instead of being recorded (useful for getting a stack trace).
+function Tester:setRethrowErrors(rethrow)
+   self.rethrow = rethrow
+end
+
+-- If true, the final report omits the per-failure detail sections.
+function Tester:setSummaryOnly(summaryOnly)
+   self.summaryOnly = summaryOnly
+end
+
+-- Add a success to the test.
+function Tester:_success()
+   local name = self._currentTestName
+   self.assertionPass[name] = self.assertionPass[name] + 1
+   return true
+end
+
+-- Prefix `message` with the current test name and a trimmed traceback
+-- pointing at the assertion call site (the pattern strips the xpcall
+-- frames added by the test runner).
+function Tester:_addDebugInfo(message)
+   local ss = debug.traceback('tester', 3) or ''
+   ss = ss:match('.-\n([^\n]+\n[^\n]+)\n[^\n]+xpcall') or ''
+   local name = self._currentTestName
+   return (name ~= '' and name .. '\n' or '') .. message .. '\n' .. ss
+end
+
+-- Add a failure to the test.
+function Tester:_failure(message)
+   if self.rethrow then error(message, 2) end
+   local name = self._currentTestName
+   self.assertionFail[name] = self.assertionFail[name] + 1
+   self.errors[#self.errors + 1] = self:_addDebugInfo(message)
+   return false
+end
+
+-- Add a warning to the test
+function Tester:_warning(message)
+   local name = self._currentTestName
+   self._warningCount[name] = (self._warningCount[name] or 0) + 1
+   self.warnings[#self.warnings + 1] = self:_addDebugInfo(message)
+end
+
+-- Call this during a test run with `condition = true` to log a success, or with
+-- `condition = false` to log a failure (using `message`).
+function Tester:_assert_sub(condition, message)
+   if condition then
+      return self:_success()
+   else
+      return self:_failure(message)
+   end
+end
+
+-- Validates the trailing varargs of an assertion: at most one optional
+-- string message is accepted; anything further is an error.
+local function getMessage(message, ...)
+   assert(next{...} == nil, "Unexpected arguments passed to test function")
+   if message then
+      assert(type(message) == 'string', 'message parameter must be a string')
+      if message ~= '' then
+         return message .. '\n'
+      end
+   end
+   return ''
+end
+
+--[[ Historically, some test functions have accepted both a message and a
+tolerance, and some just a message (e.g., assertTableEq). Now assertTableEq
+accepts both a tolerance and a message, so allow the two arguments to be passed
+in either order to maintain backwards compatibility (and more generally,
+for convenience). (We still document the ordering as "tolerance, message" for
+clarity.) This function also sanitizes them (ensures they are non-nil, etc).
+]]
+-- Parses trailing varargs into (tolerance, message), accepting them in
+-- either order; missing values default to '' and `defaultTolerance`.
+local function getToleranceAndMessage(defaultTolerance, ...)
+   local args = {...}
+   local message = nil
+   local tolerance = nil
+   for _, a in ipairs(args) do
+      if type(a) == 'string' then
+         if message then
+            -- error()'s second argument is a stack *level* (a number); the
+            -- original code passed the offending value there, which itself
+            -- raised "bad argument #2 to 'error'". Embed the value in the
+            -- message instead.
+            error("Unexpected string argument; already have message: "
+                  .. tostring(a))
+         end
+         message = a .. '\n'
+      elseif type(a) == 'number' then
+         if tolerance then
+            error("Unexpected number argument; already have tolerance: "
+                  .. tostring(a))
+         end
+         tolerance = a
+         assert(tolerance >= 0, "tolerance cannot be negative")
+      else
+         error("Unrecognized argument; should be a tolerance or message, got: "
+               .. tostring(a))
+      end
+   end
+   message = message or ''
+   tolerance = tolerance or defaultTolerance
+   return tolerance, message
+end
+
+-- Asserts that `condition` is true; warns when it is not a boolean, since
+-- a truthy non-boolean usually indicates a misuse of this method.
+function Tester:assert(condition, ...)
+   local message = getMessage(...)
+   if type(condition) ~= 'boolean' then
+      self:_warning(" :assert should only be used for boolean conditions. "
+                    .. "To check for non-nil variables, do this explicitly: "
+                    .. "Tester:assert(var ~= nil).")
+   end
+   return self:_assert_sub(condition,
+                           string.format('%sBOOL violation condition=%s',
+                                         message, tostring(condition)))
+end
+
+-- Deep equality assertion; see check.areEq for the comparison rules.
+function Tester:assertGeneralEq(got, expected, ...)
+   return self:_eqOrNeq(got, expected, false, ...)
+end
+
+-- Alias for assertGeneralEq.
+function Tester:eq(got, expected, ...)
+   return self:assertGeneralEq(got, expected, ...)
+end
+
+-- Deep inequality assertion; see check.areEq for the comparison rules.
+function Tester:assertGeneralNe(got, unexpected, ...)
+   return self:_eqOrNeq(got, unexpected, true, ...)
+end
+
+-- Alias for assertGeneralNe.
+function Tester:ne(got, unexpected, ...)
+   return self:assertGeneralNe(got, unexpected, ...)
+end
+
+-- Shared implementation for eq/ne; `negate` selects inequality mode.
+function Tester:_eqOrNeq(got, expected, negate, ...)
+   local tolerance, message = getToleranceAndMessage(0, ...)
+   local success, subMessage = check.areEq(got, expected, tolerance, negate)
+   subMessage = subMessage or ''
+   return self:_assert_sub(success, message .. subMessage)
+end
+
+-- Asserts a < b.
+function Tester:assertlt(a, b, ...)
+   local message = getMessage(...)
+   return self:_assert_sub(a < b,
+                           string.format('%sLT failed: %s >= %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts a > b.
+function Tester:assertgt(a, b, ...)
+   local message = getMessage(...)
+   return self:_assert_sub(a > b,
+                           string.format('%sGT failed: %s <= %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts a <= b.
+function Tester:assertle(a, b, ...)
+   local message = getMessage(...)
+   return self:_assert_sub(a <= b,
+                           string.format('%sLE failed: %s > %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts a >= b.
+function Tester:assertge(a, b, ...)
+   local message = getMessage(...)
+   return self:_assert_sub(a >= b,
+                           string.format('%sGE failed: %s < %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts |a - b| <= tolerance (default 1e-16).
+function Tester:assertalmosteq(a, b, ...)
+   local tolerance, message = getToleranceAndMessage(1e-16, ...)
+   local diff = math.abs(a - b)
+   return self:_assert_sub(
+         diff <= tolerance,
+         string.format(
+               '%sALMOST_EQ failed: %s ~= %s with tolerance=%s',
+               message, tostring(a), tostring(b), tostring(tolerance)))
+end
+
+-- Asserts a == b with Lua's plain equality operator.
+function Tester:asserteq(a, b, ...)
+   local message = getMessage(...)
+   return self:_assert_sub(a == b,
+                           string.format('%sEQ failed: %s ~= %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts a ~= b with Lua's plain inequality operator. Warns when both
+-- arguments are tables or both are userdata, since reference inequality
+-- is rarely what the caller wants for those types.
+function Tester:assertne(a, b, ...)
+   local message = getMessage(...)
+   -- Parenthesized: `and` binds tighter than `or`, so the original
+   -- condition warned whenever `a` was userdata, even if the two
+   -- arguments had different types.
+   if type(a) == type(b) and (type(a) == 'table' or type(a) == 'userdata') then
+      self:_warning(" :assertne should only be used to compare basic lua "
+                    .. "objects (numbers, booleans, etc). Consider using "
+                    .. "either :assertGeneralNe or :assert(a ~= b).")
+   end
+   return self:_assert_sub(a ~= b,
+                           string.format('%sNE failed: %s == %s',
+                                         message, tostring(a), tostring(b)))
+end
+
+-- Asserts elementwise tensor equality within an optional tolerance.
+function Tester:assertTensorEq(ta, tb, ...)
+  return self:_assertTensorEqOrNeq(ta, tb, false, ...)
+end
+
+-- Asserts elementwise tensor inequality within an optional tolerance.
+function Tester:assertTensorNe(ta, tb, ...)
+  return self:_assertTensorEqOrNeq(ta, tb, true, ...)
+end
+
+-- Shared implementation for assertTensorEq/Ne; honours the
+-- _assertTensorEqIgnoresDims backwards-compatibility flag (see __init).
+function Tester:_assertTensorEqOrNeq(ta, tb, negate, ...)
+   assert(torch.isTensor(ta), "First argument should be a Tensor")
+   assert(torch.isTensor(tb), "Second argument should be a Tensor")
+
+   local tolerance, message = getToleranceAndMessage(0, ...)
+   local success, subMessage =
+         check.areTensorsEq(ta, tb, tolerance, negate,
+                            self._assertTensorEqIgnoresDims)
+   subMessage = subMessage or ''
+
+   if self._assertTensorEqIgnoresDims and (not negate) and success
+         and not ta:isSameSizeAs(tb) then
+     self:_warning("Tensors have the same content but different dimensions. "
+                   .. "For backwards compatability, they are considered equal, "
+                   .. "but this may change in the future. Consider using :eq "
+                   .. "to check for equality instead.")
+   end
+
+   return self:_assert_sub(success, message .. subMessage)
+end
+
+-- Asserts deep table equality (recursive, with optional tolerance).
+function Tester:assertTableEq(ta, tb, ...)
+   return self:_assertTableEqOrNeq(ta, tb, false, ...)
+end
+
+-- Asserts deep table inequality (recursive, with optional tolerance).
+function Tester:assertTableNe(ta, tb, ...)
+   return self:_assertTableEqOrNeq(ta, tb, true, ...)
+end
+
+-- Shared implementation for assertTableEq/Ne; validates argument types
+-- and defers to the generic deep-equality machinery.
+function Tester:_assertTableEqOrNeq(ta, tb, negate, ...)
+   assert(type(ta) == 'table', "First argument should be a Table")
+   assert(type(tb) == 'table', "Second argument should be a Table")
+   return self:_eqOrNeq(ta, tb, negate, ...)
+end
+
+-- Asserts that calling f() raises any error.
+function Tester:assertError(f, ...)
+   return self:assertErrorObj(f, function() return true end, ...)
+end
+
+-- Asserts that calling f() does not raise an error.
+function Tester:assertNoError(f, ...)
+   local message = getMessage(...)
+   local status, err = pcall(f)
+   return self:_assert_sub(status,
+                           string.format('%sERROR violation: err=%s', message,
+                                         tostring(err)))
+end
+
+-- Asserts that f() raises an error exactly equal to `errmsg`.
+function Tester:assertErrorMsg(f, errmsg, ...)
+   return self:assertErrorObj(f, function(err) return err == errmsg end, ...)
+end
+
+-- Asserts that f() raises an error matching the Lua pattern `errPattern`.
+function Tester:assertErrorPattern(f, errPattern, ...)
+   local function errcomp(err)
+      return string.find(err, errPattern) ~= nil
+   end
+   return self:assertErrorObj(f, errcomp, ...)
+end
+
+-- Asserts that f() raises an error for which errcomp(err) is true.
+function Tester:assertErrorObj(f, errcomp, ...)
+   local message = getMessage(...)
+   local status, err = pcall(f)
+   return self:_assert_sub((not status) and errcomp(err),
+                           string.format('%sERROR violation: err=%s', message,
+                                         tostring(err)))
+end
+
+-- Registers a test function under `name`, or every entry of a TestSuite /
+-- plain table of functions. The special names '_setUp' and '_tearDown'
+-- install per-test hooks instead of tests. Returns self for chaining.
+function Tester:add(f, name)
+   if type(f) == "table" then
+      assert(name == nil, "Name parameter is forbidden for a table of tests, "
+                          .. "since its use is ambiguous")
+      if f.__isTestSuite then
+         f = f.__tests
+      else
+         self:_warning("Should use TestSuite rather than plain lua table")
+      end
+      for i, v in pairs(f) do
+         -- We forbid nested tests because the "expected" behaviour when a named
+         -- test is run in the case that the named test is in fact a table of
+         -- tests is not supported. Similar issue with _setUp and _tearDown
+         -- functions inside nested tests.
+         assert(type(v) ~= 'table', "Nested sets of tests are not supported")
+         self:add(v, i)
+      end
+      return self
+   end
+
+   assert(type(f) == 'function',
+          "Only tables of functions and functions supported")
+
+   if name == '_setUp' then
+      assert(not self._setUp, "Only one set-up function allowed")
+      self._setUp = f
+   elseif name == '_tearDown' then
+      assert(not self._tearDown, "Only one tear-down function allowed")
+      self._tearDown = f
+   else
+      name = name or 'unknown'
+      if self.tests[name] ~= nil then
+         error('Test with name ' .. name .. ' already exists!')
+      end
+      self.tests[name] = f
+   end
+   return self
+end
+
+-- Marks one test name (or a list of names) as disabled; disabled tests
+-- are reported as SKIP instead of being executed. Returns self.
+function Tester:disable(testNames)
+   if type(testNames) == 'string' then
+      testNames = {testNames}
+   end
+   assert(type(testNames) == 'table', "Expecting name or list for disable")
+   for _, name in ipairs(testNames) do
+      assert(self.tests[name], "Unrecognized test '" .. name .. "'")
+      self.disabledTests[name] = true
+   end
+   return self
+end
+
+-- Runs the selected tests (all by default; `testNames` may be a name, a
+-- Lua pattern, or a list of either), prints a report, and raises an error
+-- if any test failed or errored so scripts exit with nonzero status.
+function Tester:run(testNames)
+   local tests = self:_getTests(testNames)
+   self.assertionPass = {}
+   self.assertionFail = {}
+   self.haveWarning = {}
+   self.testError = {}
+   for name in pairs(tests) do
+      self.assertionPass[name] = 0
+      self.assertionFail[name] = 0
+      self.testError[name] = 0
+      self._warningCount[name] = 0
+   end
+   self:_run(tests)
+   self:_report(tests)
+
+   -- Throws an error on test failure/error, so that test script returns
+   -- with nonzero return value.
+   for name in pairs(tests) do
+      assert(self.assertionFail[name] == 0,
+             'An error was found while running tests!')
+      assert(self.testError[name] == 0,
+             'An error was found while running tests!')
+   end
+
+   return 0
+end
+
+-- Returns "<num> <str>" with a trailing 's' when num ~= 1.
+local function pluralize(num, str)
+   local stem = num .. ' ' .. str
+   if num == 1 then
+      return stem
+   else
+      return stem .. 's'
+   end
+end
+
+-- Terminal output setup: colourize only when running from the command
+-- line (arg is set) and the optional sys.colors module is available.
+local NCOLS = 80
+local coloured
+local enable_colors, c = pcall(require, 'sys.colors')
+if arg and enable_colors then  -- have we been invoked from the commandline?
+   coloured = function(str, colour)
+      return colour .. str .. c.none
+   end
+else
+   c = {}
+   coloured = function(str)
+      return str
+   end
+end
+
+-- Executes every test in `tests`, printing one progress line per test
+-- ([WAIT] rewritten in place to PASS/FAIL/ERROR/SKIP). Honours the
+-- disabledTests set, the _setUp/_tearDown hooks, and the rethrow and
+-- earlyAbort flags.
+function Tester:_run(tests)
+   local ntests = 0
+   for _ in pairs(tests) do
+      ntests = ntests + 1
+   end
+
+   -- cfmt prints "i/ntests" with i right-aligned to the width of ntests.
+   local ntestsAsString = string.format('%u', ntests)
+   local cfmt = string.format('%%%uu/%u ', ntestsAsString:len(), ntestsAsString)
+   local cfmtlen = ntestsAsString:len() * 2 + 2
+
+   local function bracket(str)
+      return '[' .. str .. ']'
+   end
+
+   io.write('Running ' .. pluralize(ntests, 'test') .. '\n')
+   local i = 1
+   for name, fn in pairs(tests) do
+      self._currentTestName = name
+
+      -- TODO: compute max length of name and cut it down to size if needed
+      local strinit = coloured(string.format(cfmt, i), c.cyan)
+                      .. self._currentTestName .. ' '
+                      .. string.rep('.',
+                                    NCOLS - 6 - 2 -
+                                    cfmtlen - self._currentTestName:len())
+                      .. ' '
+      io.write(strinit .. bracket(coloured('WAIT', c.cyan)))
+      io.flush()
+
+      local status, message, pass, skip
+      if self.disabledTests[name] then
+         skip = true
+      else
+         skip = false
+         if self._setUp then
+            self._setUp(name)
+         end
+         if self.rethrow then
+            -- In rethrow mode failures error out immediately, so "pass"
+            -- just means no new entries were appended to self.errors.
+            status = true
+            local nerr = #self.errors
+            message = fn()
+            pass = nerr == #self.errors
+         else
+            status, message, pass = self:_pcall(fn)
+         end
+         if self._tearDown then
+            self._tearDown(name)
+         end
+      end
+
+      -- Rewrite the progress line with the final verdict.
+      io.write('\r')
+      io.write(strinit)
+
+      if skip then
+         io.write(bracket(coloured('SKIP', c.yellow)))
+      elseif not status then
+         self.testError[name] = 1
+         io.write(bracket(coloured('ERROR', c.magenta)))
+      elseif not pass then
+         io.write(bracket(coloured('FAIL', c.red)))
+      else
+         io.write(bracket(coloured('PASS', c.green)))
+         if self._warningCount[name] > 0 then
+            io.write('\n' .. string.rep(' ', NCOLS - 10))
+            io.write(bracket(coloured('+warning', c.yellow)))
+         end
+      end
+      io.write('\n')
+      io.flush()
+
+      if self.earlyAbort and (i < ntests) and (not status or not pass)
+            and (not skip) then
+         io.write('Aborting on first error, not all tests have been executed\n')
+         break
+      end
+
+      i = i + 1
+
+      collectgarbage()
+   end
+end
+
+-- Runs f under xpcall with a traceback handler. Returns (status, result,
+-- pass) where pass is true when f ran cleanly and logged no new failures.
+function Tester:_pcall(f)
+   local nerr = #self.errors
+   local stat, result = xpcall(f, debug.traceback)
+   if not stat then
+      self.errors[#self.errors + 1] =
+         self._currentTestName .. '\n Function call failed\n' .. result .. '\n'
+   end
+   return stat, result, stat and (nerr == #self.errors)
+end
+
+-- Resolves `testNames` (nil for all; a string or list of strings, each
+-- treated as a Lua pattern) into a name -> function table of tests.
+-- Errors if a pattern matches nothing.
+function Tester:_getTests(testNames)
+   if testNames == nil then
+      return self.tests
+   end
+   if type(testNames) == 'string' then
+      testNames = {testNames}
+   end
+   assert(type(testNames) == 'table',
+          "Only accept a name or table of test names (or nil for all tests)")
+
+   local function getMatchingNames(pattern)
+      local matchingNames = {}
+      for name in pairs(self.tests) do
+         if string.match(name, pattern) then
+            table.insert(matchingNames, name)
+         end
+      end
+      return matchingNames
+   end
+
+   local tests = {}
+   for _, pattern in ipairs(testNames) do
+      local matchingNames = getMatchingNames(pattern)
+      assert(#matchingNames > 0, "Couldn't find test '" .. pattern .. "'")
+      for _, name in ipairs(matchingNames) do
+         tests[name] = self.tests[name]
+      end
+   end
+   return tests
+end
+
+-- Prints the summary line (assert/test/failure/error/warning/skip counts)
+-- and, unless summaryOnly is set, the detailed failure and warning
+-- sections collected during the run.
+function Tester:_report(tests)
+   local ntests = 0
+   local nfailures = 0
+   local nerrors = 0
+   local nskipped = 0
+   local nwarnings = 0
+   self.countasserts = 0
+   for name in pairs(tests) do
+      ntests = ntests + 1
+      self.countasserts = self.countasserts + self.assertionFail[name]
+                          + self.assertionPass[name]
+      if self.assertionFail[name] > 0 then
+         nfailures = nfailures + 1
+      end
+      if self.testError[name] > 0 then
+         nerrors = nerrors + 1
+      end
+      if self._warningCount[name] > 0 then
+         nwarnings = nwarnings + 1
+      end
+      if self.disabledTests[name] then
+         nskipped = nskipped + 1
+      end
+   end
+   -- Warnings logged outside any test run under the '' pseudo-name.
+   if self._warningCount[''] then
+      nwarnings = nwarnings + self._warningCount['']
+   end
+
+   io.write('Completed ' .. pluralize(self.countasserts, 'assert'))
+   io.write(' in ' .. pluralize(ntests, 'test') .. ' with ')
+   io.write(coloured(pluralize(nfailures, 'failure'),
+                     nfailures == 0 and c.green or c.red))
+   io.write(' and ')
+   io.write(coloured(pluralize(nerrors, 'error'),
+                     nerrors == 0 and c.green or c.magenta))
+   if nwarnings > 0 then
+      io.write(' and ')
+      io.write(coloured(pluralize(nwarnings, 'warning'), c.yellow))
+   end
+   if nskipped > 0 then
+      io.write(' and ')
+      io.write(coloured(nskipped .. ' disabled', c.yellow))
+   end
+   io.write('\n')
+
+   -- Prints off a message separated by -----
+   local haveSection = false
+   local function addSection(text)
+      local function printDashes()
+         io.write(string.rep('-', NCOLS) .. '\n')
+      end
+      if not haveSection then
+         printDashes()
+         haveSection = true
+      end
+      io.write(text .. '\n')
+      printDashes()
+   end
+
+   if not self.summaryOnly then
+      for _, v in ipairs(self.errors) do
+         addSection(v)
+      end
+      for _, v in ipairs(self.warnings) do
+         addSection(v)
+      end
+   end
+end
+
+
+--[[ Tests for tensor equality between two tensors of matching sizes and types.
+
+Tests whether the maximum element-wise difference between `ta` and `tb` is less
+than or equal to `tolerance`.
+
+Arguments:
+* `ta` (tensor)
+* `tb` (tensor)
+* `tolerance` (number) maximum elementwise difference between `ta` and `tb`.
+* `negate` (boolean) if true, we invert success and failure.
+* `storage` (boolean) if true, we print an error message referring to Storages
+    rather than Tensors.
+
+Returns:
+1. success, boolean that indicates success
+2. failure_message, string or nil
+]]
+function check.areSameFormatTensorsEq(ta, tb, tolerance, negate, storage)
+   local function ensureHasAbs(t)
+      -- Byte, Char and Short Tensors don't have abs
+      return t.abs and t or t:double()
+   end
+
+   ta = ensureHasAbs(ta)
+   tb = ensureHasAbs(tb)
+
+   -- Maximum absolute elementwise difference decides equality.
+   local diff = ta:clone():add(-1, tb):abs()
+   local err = diff:max()
+   local success = err <= tolerance
+   if negate then
+      success = not success
+   end
+
+   local errMessage
+   if not success then
+      local prefix = storage and 'Storage' or 'Tensor'
+      local violation = negate and 'NE(==)' or 'EQ(==)'
+      errMessage = string.format('%s%s violation: max diff=%s, tolerance=%s',
+                                 prefix,
+                                 violation,
+                                 tostring(err),
+                                 tostring(tolerance))
+   end
+
+   return success, errMessage
+end
+
+--[[ Tests for tensor equality.
+
+Tests whether the maximum element-wise difference between `ta` and `tb` is less
+than or equal to `tolerance`.
+
+Arguments:
+* `ta` (tensor)
+* `tb` (tensor)
+* `tolerance` (number) maximum elementwise difference between `ta` and `tb`.
+* `negate` (boolean) if negate is true, we invert success and failure.
+* `ignoreTensorDims` (boolean, default false) if true, then tensors of the same
+    size but different dimensions can still be considered equal, e.g.,
+    {{1}} == {1}. For backwards compatibility.
+
+Returns:
+1. success, boolean that indicates success
+2. failure_message, string or nil
+]]
+function check.areTensorsEq(ta, tb, tolerance, negate, ignoreTensorDims)
+   ignoreTensorDims = ignoreTensorDims or false
+
+   if not ignoreTensorDims and ta:dim() ~= tb:dim() then
+      return negate, 'The tensors have different dimensions'
+   end
+
+   if ta:type() ~= tb:type() then
+      return negate, 'The tensors have different types'
+   end
+
+   -- If we are comparing two empty tensors, return true.
+   -- This is needed because some functions below cannot be applied to tensors
+   -- of dimension 0.
+   if ta:dim() == 0 and tb:dim() == 0 then
+      return not negate, 'Both tensors are empty'
+   end
+
+   local sameSize
+   if ignoreTensorDims then
+      sameSize = ta:nElement() == tb:nElement()
+   else
+      sameSize = ta:isSameSizeAs(tb)
+   end
+   if not sameSize then
+      return negate, 'The tensors have different sizes'
+   end
+
+   return check.areSameFormatTensorsEq(ta, tb, tolerance, negate, false)
+end
+
+-- Maps each Storage type name to the Tensor constructor that can wrap it,
+-- so storages can be compared via the tensor equality machinery.
+local typesMatching = {
+      ['torch.ByteStorage'] = torch.ByteTensor,
+      ['torch.CharStorage'] = torch.CharTensor,
+      ['torch.ShortStorage'] = torch.ShortTensor,
+      ['torch.IntStorage'] = torch.IntTensor,
+      ['torch.LongStorage'] = torch.LongTensor,
+      ['torch.FloatStorage'] = torch.FloatTensor,
+      ['torch.DoubleStorage'] = torch.DoubleTensor,
+}
+
+--[[ Tests for storage equality.
+
+Tests whether the maximum element-wise difference between `sa` and `sb` is less
+than or equal to `tolerance`.
+
+Arguments:
+* `sa` (storage)
+* `sb` (storage)
+* `tolerance` (number) maximum elementwise difference between `a` and `b`.
+* `negate` (boolean) if negate is true, we invert success and failure.
+
+Returns:
+1. success, boolean that indicates success
+2. failure_message, string or nil
+]]
+function check.areStoragesEq(sa, sb, tolerance, negate)
+   if sa:size() ~= sb:size() then
+      return negate, 'The storages have different sizes'
+   end
+
+   local typeOfsa = torch.type(sa)
+   local typeOfsb = torch.type(sb)
+
+   if typeOfsa ~= typeOfsb then
+      return negate, 'The storages have different types'
+   end
+
+   -- Wrap both storages in same-typed tensors and reuse tensor comparison.
+   local ta = typesMatching[typeOfsa](sa)
+   local tb = typesMatching[typeOfsb](sb)
+
+   return check.areSameFormatTensorsEq(ta, tb, tolerance, negate, true)
+end
+
+--[[ Tests for general (deep) equality.
+
+The types of `got` and `expected` must match.
+Tables are compared recursively. Keys and types of the associated values must
+match, recursively. Numbers are compared with the given tolerance.
+Torch tensors and storages are compared with the given tolerance on their
+elementwise difference. Other types are compared for strict equality with the
+regular Lua == operator.
+
+Arguments:
+* `got`
+* `expected`
+* `tolerance` (number) maximum elementwise difference between `a` and `b`.
+* `negate` (boolean) if negate is true, we invert success and failure.
+
+Returns:
+1. success, boolean that indicates success
+2. failure_message, string or nil
+]]
+function check.areEq(got, expected, tolerance, negate)
+   local errMessage
+   if type(got) ~= type(expected) then
+      if not negate then
+         errMessage = 'EQ failed: values have different types (first: '
+                      .. type(got) .. ', second: ' .. type(expected) .. ')'
+      end
+      return negate, errMessage
+   elseif type(got) == 'number' then
+      -- Numbers: compare within tolerance.
+      local diff = math.abs(got - expected)
+      local ok = (diff <= tolerance)
+      if negate then
+         ok = not ok
+      end
+      if not ok then
+         if negate then
+            errMessage = string.format("NE failed: %s == %s",
+                                       tostring(got), tostring(expected))
+         else
+            errMessage = string.format("EQ failed: %s ~= %s",
+                                       tostring(got), tostring(expected))
+         end
+         if tolerance > 0 then
+            errMessage = errMessage .. " with tolerance=" .. tostring(tolerance)
+         end
+      end
+      return ok, errMessage
+   elseif type(expected) == "table" then
+     -- Tables: recursive comparison (breadth-first; see areTablesEq).
+     return check.areTablesEq(got, expected, tolerance, negate)
+   elseif torch.isTensor(got) then
+     return check.areTensorsEq(got, expected, tolerance, negate)
+   elseif torch.isStorage(got) then
+     return check.areStoragesEq(got, expected, tolerance, negate)
+   else
+     -- Below: we have the same type which is either userdata or a lua type
+     -- which is not a number.
+     local ok = (got == expected)
+     if negate then
+        ok = not ok
+     end
+     if not ok then
+        if negate then
+           errMessage = string.format("NE failed: %s (%s) == %s (%s)",
+                                      tostring(got), type(got),
+                                      tostring(expected), type(expected))
+        else
+           errMessage = string.format("EQ failed: %s (%s) ~= %s (%s)",
+                                      tostring(got), type(got),
+                                      tostring(expected), type(expected))
+        end
+     end
+     return ok, errMessage
+   end
+end
+
+--[[ Tests for (deep) table equality.
+
+Tables are compared recursively. Keys and types of the associated values must
+match, recursively. Numbers are compared with the given tolerance.
+Torch tensors and storages are compared with the given tolerance on their
+elementwise difference. Other types are compared for strict equality with the
+regular Lua == operator.
+
+Arguments:
+* `t1` (table)
+* `t2` (table)
+* `tolerance` (number) maximum elementwise difference between `a` and `b`.
+* `negate` (boolean) if negate is true, we invert success and failure.
+
+Returns:
+1. success, boolean that indicates success
+2. failure_message, string or nil
+]]
+function check.areTablesEq(t1, t2, tolerance, negate)
+   -- Implementation detail: Instead of doing a depth-first table comparison
+   -- check (for example, using recursion), let's do a breadth-first search
+   -- using a queue. Why? Because if we have two tables that are quite deep
+   -- (e.g., a gModule from nngraph), then if they are different then it's
+   -- more useful to the user to show how they differ at as-shallow-a-depth
+   -- as possible.
+   local queue = {}
+   queue._head = 1
+   queue._tail = 1
+   function queue.isEmpty()
+      return queue._tail == queue._head
+   end
+   function queue.pop()
+      queue._head = queue._head + 1
+      return queue[queue._head - 1]
+   end
+   function queue.push(value)
+      queue[queue._tail] = value
+      queue._tail = queue._tail + 1
+   end
+
+   queue.push({t1, t2})
+   while not queue.isEmpty() do
+      -- Each queue entry is {subtable1, subtable2, dotted-path-or-nil}.
+      local location
+      t1, t2, location = unpack(queue.pop())
+
+      local function toSublocation(key)
+         local keyAsString = tostring(key)
+         return (location and location .. "." .. keyAsString) or keyAsString
+      end
+
+      -- Pass 1: every key of t1 must exist in t2 with an equal value.
+      for key, value1 in pairs(t1) do
+         local sublocation = toSublocation(key)
+         if t2[key] == nil then
+            return negate, string.format(
+                  "Entry %s missing in second table (is %s in first)",
+                  sublocation, tostring(value1))
+         end
+         local value2 = t2[key]
+         if type(value1) == 'table' and type(value2) == 'table' then
+            queue.push({value1, value2, sublocation})
+         else
+            local ok, message = check.areEq(value1, value2, tolerance, false)
+            if not ok then
+               message = 'At table location ' .. sublocation .. ': ' .. message
+               return negate, message
+            end
+         end
+      end
+
+      -- Pass 2: t2 must not contain keys absent from t1.
+      for key, value2 in pairs(t2) do
+         local sublocation = toSublocation(key)
+         if t1[key] == nil then
+             return negate, string.format(
+                   "Entry %s missing in first table (is %s in second)",
+                   sublocation, tostring(value2))
+         end
+      end
+   end
+   return not negate, 'The tables are equal'
+end
diff --git a/Timer.c b/Timer.c
new file mode 100644
index 0000000..796190b
--- /dev/null
+++ b/Timer.c
@@ -0,0 +1,170 @@
+#include "general.h"
+
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+#include <time.h>
+#else
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+/* Wall/user/system stopwatch exposed to Lua as torch.Timer.
+   total* hold time accumulated across stop/resume cycles; start* hold the
+   timestamps captured when the timer was last (re)started. */
+typedef struct _Timer
+{
+    int isRunning;
+
+    double totalrealtime;
+    double totalusertime;
+    double totalsystime;
+
+    double startrealtime;
+    double startusertime;
+    double startsystime;
+
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+  time_t base_time;
+#endif
+
+} Timer;
+
+/* NOTE(review): the struct and includes are guarded by
+   (_MSC_VER || __MINGW32__) but the three helpers below test WIN32 —
+   confirm the two guard families agree on every Windows toolchain. */
+
+/* Wall-clock time in seconds (1-second resolution on the WIN32 path). */
+static double torch_Timer_realtime()
+{
+#ifdef WIN32
+  time_t ltime;
+  time(&ltime);
+  return (double)(ltime);
+#else
+  struct timeval current;
+  gettimeofday(&current, NULL);
+  return (current.tv_sec + current.tv_usec/1000000.0);
+#endif
+}
+
+/* User CPU time in seconds; falls back to wall time on WIN32. */
+static double torch_Timer_usertime()
+{
+#ifdef WIN32
+  return torch_Timer_realtime();
+#else
+  struct rusage current;
+  getrusage(RUSAGE_SELF, &current);
+  return (current.ru_utime.tv_sec + current.ru_utime.tv_usec/1000000.0);
+#endif
+}
+
+/* System CPU time in seconds; always 0 on WIN32. */
+static double torch_Timer_systime()
+{
+#ifdef WIN32
+  return 0;
+#else
+  struct rusage current;
+  getrusage(RUSAGE_SELF, &current);
+  return (current.ru_stime.tv_sec + current.ru_stime.tv_usec/1000000.0);
+#endif
+}
+
+/* Allocates a torch.Timer userdata, starts it immediately, and pushes it
+   onto the Lua stack. Returns 1 (the new userdata). */
+static int torch_Timer_new(lua_State *L)
+{
+  Timer *timer = luaT_alloc(L, sizeof(Timer));
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+  /* Guard widened to match the base_time field declaration in the Timer
+     struct: previously only _MSC_VER initialized it, leaving the field
+     uninitialized on MinGW builds. Spin until time() yields nonzero. */
+  timer->base_time = 0;
+  while(!timer->base_time)
+    time(&timer->base_time);
+#endif
+  timer->isRunning = 1;
+  timer->totalrealtime = 0;
+  timer->totalusertime = 0;
+  timer->totalsystime = 0;
+  timer->startrealtime = torch_Timer_realtime();
+  timer->startusertime = torch_Timer_usertime();
+  timer->startsystime = torch_Timer_systime();
+  luaT_pushudata(L, timer, "torch.Timer");
+  return 1;
+}
+
+/* Zeroes accumulated totals and restarts the start timestamps; returns
+   the timer itself so calls can be chained from Lua. */
+static int torch_Timer_reset(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  timer->totalrealtime = 0;
+  timer->totalusertime = 0;
+  timer->totalsystime = 0;
+  timer->startrealtime = torch_Timer_realtime();
+  timer->startusertime = torch_Timer_usertime();
+  timer->startsystime = torch_Timer_systime();
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* __gc metamethod: releases the Timer allocation. */
+static int torch_Timer_free(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  luaT_free(L, timer);
+  return 0;
+}
+
+/* Stops the timer, folding the elapsed interval into the totals; no-op if
+   already stopped. Returns the timer for chaining. */
+static int torch_Timer_stop(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  if(timer->isRunning)  
+  {
+    double realtime = torch_Timer_realtime() - timer->startrealtime;
+    double usertime = torch_Timer_usertime() - timer->startusertime;
+    double systime = torch_Timer_systime() - timer->startsystime;
+    timer->totalrealtime += realtime;
+    timer->totalusertime += usertime;
+    timer->totalsystime += systime;
+    timer->isRunning = 0;
+  }
+  lua_settop(L, 1);
+  return 1;  
+}
+
+/* Restarts a stopped timer from fresh start timestamps; no-op if already
+   running. Returns the timer for chaining. */
+static int torch_Timer_resume(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  if(!timer->isRunning)
+  {
+    timer->isRunning = 1;
+    timer->startrealtime = torch_Timer_realtime();
+    timer->startusertime = torch_Timer_usertime();
+    timer->startsystime = torch_Timer_systime();
+  }
+  lua_settop(L, 1);
+  return 1;  
+}
+
+/* Returns a Lua table {real=, user=, sys=} of elapsed seconds; includes
+   the currently running interval when the timer is running. */
+static int torch_Timer_time(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  double realtime = (timer->isRunning ? (timer->totalrealtime + torch_Timer_realtime() - timer->startrealtime) : timer->totalrealtime);
+  double usertime = (timer->isRunning ? (timer->totalusertime + torch_Timer_usertime() - timer->startusertime) : timer->totalusertime);
+  double systime = (timer->isRunning ? (timer->totalsystime + torch_Timer_systime() - timer->startsystime) : timer->totalsystime);
+  lua_createtable(L, 0, 3);
+  lua_pushnumber(L, realtime);
+  lua_setfield(L, -2, "real");
+  lua_pushnumber(L, usertime);
+  lua_setfield(L, -2, "user");
+  lua_pushnumber(L, systime);
+  lua_setfield(L, -2, "sys");
+  return 1;
+}
+
+/* __tostring__ metamethod: "torch.Timer [status: running|stopped]". */
+static int torch_Timer___tostring__(lua_State *L)
+{
+  Timer *timer = luaT_checkudata(L, 1, "torch.Timer");
+  lua_pushfstring(L, "torch.Timer [status: %s]", (timer->isRunning ? "running" : "stopped"));
+  return 1;
+}
+
+/* Method table registered on the torch.Timer metatable. */
+static const struct luaL_Reg torch_Timer__ [] = {
+  {"reset", torch_Timer_reset},
+  {"stop", torch_Timer_stop},
+  {"resume", torch_Timer_resume},
+  {"time", torch_Timer_time},
+  {"__tostring__", torch_Timer___tostring__},
+  {NULL, NULL}
+};
+
+/* Registers the torch.Timer metatable (constructor torch_Timer_new,
+   finalizer torch_Timer_free) and installs the methods above. */
+void torch_Timer_init(lua_State *L)
+{
+  luaT_newmetatable(L, "torch.Timer", NULL, torch_Timer_new, torch_Timer_free, NULL);
+  luaT_setfuncs(L, torch_Timer__, 0);
+  lua_pop(L, 1);
+}
diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in
new file mode 100644
index 0000000..3d85eb1
--- /dev/null
+++ b/cmake/TorchConfig.cmake.in
@@ -0,0 +1,35 @@
+# This (ugly) setup assumes:
+#  CMAKE_PREFIX_PATH = LUA_BINDIR
+#  CMAKE_INSTALL_PREFIX = PREFIX
+
+# Define Torch basic subpaths
+SET(Torch_INSTALL_PREFIX "@Torch_INSTALL_PREFIX@")
+
+SET(Torch_INSTALL_BIN_SUBDIR "@Torch_INSTALL_BIN_SUBDIR@")
+SET(Torch_INSTALL_MAN_SUBDIR "@Torch_INSTALL_MAN_SUBDIR@")
+SET(Torch_INSTALL_LIB_SUBDIR "@Torch_INSTALL_LIB_SUBDIR@")
+SET(Torch_INSTALL_SHARE_SUBDIR "@Torch_INSTALL_SHARE_SUBDIR@")
+SET(Torch_INSTALL_INCLUDE_SUBDIR "@Torch_INSTALL_INCLUDE_SUBDIR@")
+SET(Torch_INSTALL_CMAKE_SUBDIR "@Torch_INSTALL_CMAKE_SUBDIR@")
+SET(Torch_INSTALL_LUA_PATH_SUBDIR "@Torch_INSTALL_LUA_PATH_SUBDIR@")
+SET(Torch_INSTALL_LUA_CPATH_SUBDIR "@Torch_INSTALL_LUA_CPATH_SUBDIR@")
+SET(Torch_INSTALL_CMAKE_RIDBUS "@Torch_INSTALL_CMAKE_RIDBUS@")
+
+FILE(RELATIVE_PATH Torch_INSTALL_LUA_PATH_SUBDIR "${Torch_INSTALL_PREFIX}" "${CMAKE_INSTALL_PREFIX}/lua")
+FILE(RELATIVE_PATH Torch_INSTALL_LUA_CPATH_SUBDIR "${Torch_INSTALL_PREFIX}" "${CMAKE_INSTALL_PREFIX}/lib")
+
+SET(CMAKE_MODULE_PATH "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_CMAKE_SUBDIR}" "${CMAKE_MODULE_PATH}")
+SET(CMAKE_INSTALL_PREFIX "${Torch_INSTALL_PREFIX}") # override
+
+INCLUDE(TorchPathsInit)
+INCLUDE(TorchPackage)
+INCLUDE(TorchWrap)
+
+# Define Torch basic targets
+INCLUDE(TorchExports)
+
+INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}")
+INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/TH")
+LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
+
+MESSAGE(STATUS "Found Torch7 in ${Torch_INSTALL_PREFIX}")
diff --git a/cmake/TorchExports.cmake b/cmake/TorchExports.cmake
new file mode 100644
index 0000000..8443cee
--- /dev/null
+++ b/cmake/TorchExports.cmake
@@ -0,0 +1,14 @@
+INSTALL(EXPORT TH-exports
+  DESTINATION "${Torch_INSTALL_CMAKE_SUBDIR}"
+  FILE "TorchExports.cmake")
+
+CONFIGURE_FILE("cmake/TorchConfig.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/TorchConfig.cmake" @ONLY)
+CONFIGURE_FILE("cmake/TorchWrap.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/TorchWrap.cmake" @ONLY)
+
+INSTALL(
+  FILES
+  "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/TorchConfig.cmake"
+  "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/TorchWrap.cmake"
+  "cmake/TorchPathsInit.cmake"
+  "cmake/TorchPackage.cmake"
+  DESTINATION "${Torch_INSTALL_CMAKE_SUBDIR}")
diff --git a/cmake/TorchPackage.cmake b/cmake/TorchPackage.cmake
new file mode 100644
index 0000000..7fcbdff
--- /dev/null
+++ b/cmake/TorchPackage.cmake
@@ -0,0 +1,53 @@
+# -*- cmake -*-
+
+MACRO(ADD_TORCH_PACKAGE package src luasrc)
+  INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+  INCLUDE_DIRECTORIES(${Torch_LUA_INCLUDE_DIR})
+
+ ### C/C++ sources
+ # As per CMake doc, macro arguments are not variables, so simple test syntax not working
+  IF(NOT "${src}" STREQUAL "")
+
+    if ("${src}" MATCHES "cu$" OR "${src}" MATCHES "cu;")
+      CUDA_ADD_LIBRARY(${package} MODULE ${src})
+      if(BUILD_STATIC)
+        CUDA_ADD_LIBRARY(${package}_static STATIC ${src})
+      endif()
+    else()
+      ADD_LIBRARY(${package} MODULE ${src})
+      if(BUILD_STATIC)
+        ADD_LIBRARY(${package}_static STATIC ${src})
+      endif()
+    endif()
+
+    ### Torch packages supposes libraries prefix is "lib"
+    SET_TARGET_PROPERTIES(${package} PROPERTIES
+      PREFIX "lib"
+      IMPORT_PREFIX "lib"
+      INSTALL_NAME_DIR "@executable_path/${Torch_INSTALL_BIN2CPATH}")
+
+    IF(APPLE)
+      SET_TARGET_PROPERTIES(${package} PROPERTIES
+        LINK_FLAGS "-undefined dynamic_lookup")
+    ENDIF()
+
+    if(BUILD_STATIC)
+      SET_TARGET_PROPERTIES(${package}_static PROPERTIES
+        COMPILE_FLAGS "-fPIC")
+      SET_TARGET_PROPERTIES(${package}_static PROPERTIES
+        PREFIX "lib" IMPORT_PREFIX "lib" OUTPUT_NAME "${package}")
+    endif()
+
+    INSTALL(TARGETS ${package}
+      RUNTIME DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR}
+      LIBRARY DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR})
+
+  ENDIF(NOT "${src}" STREQUAL "")
+
+  ### lua sources
+  IF(NOT "${luasrc}" STREQUAL "")
+    INSTALL(FILES ${luasrc}
+      DESTINATION ${Torch_INSTALL_LUA_PATH_SUBDIR}/${package})
+  ENDIF(NOT "${luasrc}" STREQUAL "")
+
+ENDMACRO(ADD_TORCH_PACKAGE)
diff --git a/cmake/TorchPaths.cmake b/cmake/TorchPaths.cmake
new file mode 100644
index 0000000..b0417aa
--- /dev/null
+++ b/cmake/TorchPaths.cmake
@@ -0,0 +1,32 @@
+# workaround another annoying cmake bug
+# http://public.kitware.com/Bug/view.php?id=14462
+# https://awesome.naquadah.org/bugs/index.php?do=details&task_id=869
+MACRO(NORMALIZE_PATH _path_)
+  get_filename_component(${_path_}_abs "${${_path_}}" ABSOLUTE)
+  SET(${_path_} "${${_path_}_abs}")
+ENDMACRO()
+
+NORMALIZE_PATH(LUA_BINDIR)
+NORMALIZE_PATH(LUA_LIBDIR)
+NORMALIZE_PATH(LUA_INCDIR)
+NORMALIZE_PATH(LUADIR)
+NORMALIZE_PATH(LIBDIR)
+
+GET_FILENAME_COMPONENT(CMAKE_INSTALL_PREFIX "${LUA_BINDIR}" PATH)
+
+SET(Torch_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+FILE(RELATIVE_PATH Torch_INSTALL_BIN_SUBDIR "${CMAKE_INSTALL_PREFIX}" "${LUA_BINDIR}")
+FILE(RELATIVE_PATH Torch_INSTALL_LIB_SUBDIR "${CMAKE_INSTALL_PREFIX}" "${LUA_LIBDIR}")
+FILE(RELATIVE_PATH Torch_INSTALL_INCLUDE_SUBDIR "${CMAKE_INSTALL_PREFIX}" "${LUA_INCDIR}")
+
+SET(Torch_INSTALL_MAN_SUBDIR "share/man" CACHE PATH
+  "Install dir for man pages (relative to Torch_INSTALL_PREFIX)")
+
+SET(Torch_INSTALL_SHARE_SUBDIR "share" CACHE PATH
+  "Install dir for data (relative to Torch_INSTALL_PREFIX)")
+
+SET(Torch_INSTALL_CMAKE_SUBDIR "share/cmake/torch" CACHE PATH
+  "Install dir for .cmake files (relative to Torch_INSTALL_PREFIX)")
+
+FILE(RELATIVE_PATH Torch_INSTALL_LUA_PATH_SUBDIR "${CMAKE_INSTALL_PREFIX}" "${LUADIR}")
+FILE(RELATIVE_PATH Torch_INSTALL_LUA_CPATH_SUBDIR "${CMAKE_INSTALL_PREFIX}" "${LIBDIR}")
diff --git a/cmake/TorchPathsInit.cmake b/cmake/TorchPathsInit.cmake
new file mode 100644
index 0000000..2b59c4a
--- /dev/null
+++ b/cmake/TorchPathsInit.cmake
@@ -0,0 +1,42 @@
+SET(Torch_INSTALL_BIN "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_BIN_SUBDIR}")
+SET(Torch_INSTALL_MAN "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_MAN_SUBDIR}")
+SET(Torch_INSTALL_LIB "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_LIB_SUBDIR}")
+SET(Torch_INSTALL_SHARE "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_SHARE_SUBDIR}")
+SET(Torch_INSTALL_INCLUDE "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_INCLUDE_SUBDIR}")
+#SET(Torch_INSTALL_DOK "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_DOK_SUBDIR}")
+#SET(Torch_INSTALL_HTML "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_HTML_SUBDIR}")
+SET(Torch_INSTALL_CMAKE "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_CMAKE_SUBDIR}")
+SET(Torch_INSTALL_LUA_PATH "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_LUA_PATH_SUBDIR}")
+#SET(Torch_INSTALL_LUA_PKG_PATH "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_LUA_PKG_PATH_SUBDIR}")
+SET(Torch_INSTALL_LUA_CPATH "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_LUA_CPATH_SUBDIR}")
+#SET(Torch_INSTALL_LUAROCKS_SYSCONF "${Torch_INSTALL_PREFIX}/${Torch_INSTALL_LUAROCKS_SYSCONF_SUBDIR}")
+
+# reverse relative path to prefix (ridbus is the palindrom of subdir)
+FILE(RELATIVE_PATH Torch_INSTALL_BIN_RIDBUS "${Torch_INSTALL_BIN}" "${Torch_INSTALL_PREFIX}/.")
+FILE(RELATIVE_PATH Torch_INSTALL_CMAKE_RIDBUS "${Torch_INSTALL_CMAKE}" "${Torch_INSTALL_PREFIX}/.")
+GET_FILENAME_COMPONENT(Torch_INSTALL_BIN_RIDBUS "${Torch_INSTALL_BIN_RIDBUS}" PATH)
+GET_FILENAME_COMPONENT(Torch_INSTALL_CMAKE_RIDBUS "${Torch_INSTALL_CMAKE_RIDBUS}" PATH)
+
+IF(UNIX)
+  OPTION(WITH_RPATH "Build libraries with executable rpaths" ON)
+
+  IF(WITH_RPATH)
+    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    FILE(RELATIVE_PATH Torch_INSTALL_BIN2LIB
+      "${Torch_INSTALL_BIN}" "${Torch_INSTALL_LIB}")
+    IF(APPLE)
+      SET(CMAKE_MACOSX_RPATH TRUE) # @rpath in libs
+      SET(CMAKE_INSTALL_RPATH "@executable_path/${Torch_INSTALL_BIN2LIB}") # exec
+    ELSE()
+      SET(CMAKE_INSTALL_RPATH "\$ORIGIN/${Torch_INSTALL_BIN2LIB}")
+    ENDIF()
+  ELSE()
+    SET(CMAKE_MACOSX_RPATH FALSE) # no @rpath in libs
+  ENDIF()
+
+ENDIF(UNIX)
+
+IF (WIN32)
+  SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+  SET(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+ENDIF (WIN32)
diff --git a/cmake/TorchWrap.cmake b/cmake/TorchWrap.cmake
new file mode 100644
index 0000000..05bd978
--- /dev/null
+++ b/cmake/TorchWrap.cmake
@@ -0,0 +1,18 @@
+MACRO(ADD_TORCH_WRAP target luafile)
+  INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+  GET_FILENAME_COMPONENT(_file_ "${luafile}" NAME_WE)
+  SET(cfile "${_file_}.c")
+  IF (DEFINED CWRAP_CUSTOM_LUA)
+    ADD_CUSTOM_COMMAND(
+	OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+	COMMAND ${CWRAP_CUSTOM_LUA} ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${luafile}" "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+    	WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    	DEPENDS "${luafile}")
+  ELSE (DEFINED CWRAP_CUSTOM_LUA)
+    ADD_CUSTOM_COMMAND(
+	OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+      	COMMAND ${LUA} ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${luafile}" "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+      	WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+      	DEPENDS "${luafile}")
+  ENDIF (DEFINED CWRAP_CUSTOM_LUA)
+ENDMACRO(ADD_TORCH_WRAP)
diff --git a/cmake/TorchWrap.cmake.in b/cmake/TorchWrap.cmake.in
new file mode 100644
index 0000000..5c20445
--- /dev/null
+++ b/cmake/TorchWrap.cmake.in
@@ -0,0 +1,19 @@
+MACRO(ADD_TORCH_WRAP target luafile)
+  INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+  GET_FILENAME_COMPONENT(_file_ "${luafile}" NAME_WE)
+  SET(cfile "${_file_}.c")
+  IF (DEFINED CWRAP_CUSTOM_LUA)
+    ADD_CUSTOM_COMMAND(
+	OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+	COMMAND ${CWRAP_CUSTOM_LUA} ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${luafile}" "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+    	WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    	DEPENDS "${luafile}")
+  ELSE (DEFINED CWRAP_CUSTOM_LUA)
+    ADD_CUSTOM_COMMAND(
+	OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+      	COMMAND @LUA@ ARGS "${CMAKE_CURRENT_SOURCE_DIR}/${luafile}" "${CMAKE_CURRENT_BINARY_DIR}/${cfile}"
+      	WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+      	DEPENDS "${luafile}")
+  ENDIF (DEFINED CWRAP_CUSTOM_LUA)
+  ADD_CUSTOM_TARGET(${target} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${cfile}")
+ENDMACRO(ADD_TORCH_WRAP)
diff --git a/doc/cmdline.md b/doc/cmdline.md
new file mode 100644
index 0000000..3f33439
--- /dev/null
+++ b/doc/cmdline.md
@@ -0,0 +1,148 @@
+<a name="torch.CmdLine.dok"></a>
+# CmdLine #
+
+This class provides a parameter parsing framework which is very
+useful when one needs to run several experiments that rely on
+different parameter settings that are passed in the command line.
+This class will also override the default print function to direct
+all the output to a log file as well as screen at the same time.
+
+A sample `lua` file is given below that makes use of `CmdLine`
+class.
+
+```lua
+
+cmd = torch.CmdLine()
+cmd:text()
+cmd:text()
+cmd:text('Training a simple network')
+cmd:text()
+cmd:text('Options')
+cmd:option('-seed',123,'initial random seed')
+cmd:option('-booloption',false,'boolean option')
+cmd:option('-stroption','mystring','string option')
+cmd:text()
+
+-- parse input params
+params = cmd:parse(arg)
+
+params.rundir = cmd:string('experiment', params, {dir=true})
+paths.mkdir(params.rundir)
+
+-- create log file
+cmd:log(params.rundir .. '/log', params)
+
+```
+
+When this file is run on the th command line as follows
+```shell
+# th myscript.lua
+```
+
+It will produce the following output:
+
+```
+[program started on Tue Jan 10 15:33:49 2012]
+[command line arguments]
+booloption	false
+seed	123
+rundir	experiment
+stroption	mystring
+[----------------------]
+booloption	false
+seed	123
+rundir	experiment
+stroption	mystring
+```
+
+The same output will also be written to file
+`experiment/log`. Whenever one of the options are passed on the
+command line and is different from the default value, the `rundir`
+name is produced to reflect the parameter setting.
+
+```shell
+# th myscript.lua -seed 456 -stroption mycustomstring
+```
+
+This will produce the following output:
+
+```
+[program started on Tue Jan 10 15:36:55 2012]
+[command line arguments]
+booloption	false
+seed	456
+rundir	experiment,seed=456,stroption=mycustomstring
+stroption	mycustomstring
+[----------------------]
+booloption	false
+seed	456
+rundir	experiment,seed=456,stroption=mycustomstring
+stroption	mycustomstring
+```
+
+and the output will be logged in
+`experiment,seed=456,stroption=mycustomstring/log`
+
+<a name="torch.CmdLine.addtime"></a>
+### addTime([name] [,format]) ###
+
+Adds a prefix to every line in the log file with the date/time in the
+given format with an optional name argument. The date/time format is
+the same as `os.date()`. Note that the prefix is only added to the
+log file, not the screen output. The default value for name is empty
+and the default format is '%F %T'.
+
+The final produced output for the following command is:
+
+```lua
+> cmd:addTime('your project name','%F %T')
+> print('Your log message')
+```
+
+```
+2012-02-07 08:21:56[your project name]: Your log message
+```
+
+<a name="torch.CmdLine.log"></a>
+### log(filename, parameter_table) ###
+
+It sets the log filename to `filename` and prints the values of
+parameters in the `parameter_table`. If filename is an open file
+descriptor, it will write to the file instead of creating a new one.
+
+<a name="torch.CmdLine.option"></a>
+### option(name, default, help) ###
+
+Stores an option argument. The name should always start with '-'.
+
+<a name="torch.CmdLine.parse"></a>
+### [table] parse(arg) ###
+
+Parses a given table, `arg` is by default the argument table that 
+is created by `lua` using the command line arguments passed to the 
+executable. Returns a table of option values.
+
+<a name="torch.CmdLine.silent"></a>
+### silent() ###
+
+Silences the output to standard output. The only output is written to
+the log file.
+
+<a name="torch.CmdLine.string"></a>
+### [string] string(prefix, params, ignore) ###
+
+Returns a string representation of the options by concatenating the
+non-default options. `ignore` is a table `{dir=true}`, which will
+ensure that option named `dir` will be ignored while creating the
+string representation.
+
+This function is useful for creating unique experiment directories that
+depend on the parameter settings.
+
+<a name="torch.CmdLine.text"></a>
+### text(string) ###
+
+Logs a custom text message.
+
+
+
diff --git a/doc/diskfile.md b/doc/diskfile.md
new file mode 100644
index 0000000..f00a0ef
--- /dev/null
+++ b/doc/diskfile.md
@@ -0,0 +1,74 @@
+<a name="torch.DiskFile.dok"></a>
+# DiskFile #
+
+Parent classes: [File](file.md)
+
+A `DiskFile` is a particular `File` which is able to perform basic read/write operations
+on a file stored on disk. It implements all methods described in [File](file.md), and
+some additional methods relative to _endian_ encoding.
+
+By default, a `DiskFile` is in [ASCII](file.md#torch.File.ascii) mode. If changed to
+the [binary](file.md#torch.File.binary) mode, the default endian encoding is the native
+computer one.
+
+The file might be open in read, write, or read-write mode, depending on the parameter
+`mode` (which can take the value `"r"`, `"w"` or `"rw"` respectively)
+given to the [torch.DiskFile(fileName, mode)](#torch.DiskFile).
+
+<a name="torch.DiskFile"></a>
+### torch.DiskFile(fileName, [mode], [quiet]) ###
+
+_Constructor_ which opens `fileName` on disk, using the given `mode`. Valid `mode` are
+`"r"` (read), `"w"` (write) or `"rw"` (read-write). Default is read mode.
+
+If read-write mode, the file _will be created_ if it does not exist. If it
+exists, it will be positioned at the beginning of the file after opening.
+
+If (and only if) `quiet` is `true`, no error will be raised in case of
+problem opening the file: instead `nil` will be returned.
+
+The file is opened in [ASCII](file.md#torch.File.ascii) mode by default.
+
+<a name="torch.DiskFile.bigEndianEncoding"></a>
+### bigEndianEncoding() ###
+
+In [binary](file.md#torch.File.binary) mode, force encoding in _big endian_.
+(_big end first_: decreasing numeric significance with increasing memory
+addresses)
+
+<a name="torch.DiskFile.isBigEndianCPU"></a>
+### [boolean] isBigEndianCPU() ###
+
+Returns `true` if, and only if, the computer CPU operates in _big endian_.
+_Big end first_: decreasing numeric significance with increasing
+memory addresses.
+
+<a name="torch.DiskFile.isLittleEndianCPU"></a>
+### [boolean] isLittleEndianCPU() ###
+
+Returns `true` if, and only if, the computer CPU operates in _little endian_.
+_Little end first_: increasing numeric significance with increasing
+memory addresses.
+
+<a name="torch.DiskFile.littleEndianEncoding"></a>
+### littleEndianEncoding() ###
+
+In [binary](file.md#torch.File.binary) mode, force encoding in _little endian_.
+(_little end first_: increasing numeric significance with increasing memory
+addresses)
+
+<a name="torch.DiskFile.nativeEndianEncoding"></a>
+### nativeEndianEncoding() ###
+
+In [binary](file.md#torch.File.binary) mode, force encoding in _native endian_.
+
+<a name="torch.DiskFile.longSize"/></a>
+### longSize([size]) ###
+
+Longs will be written and read from the file as `size` bytes long, which
+can be 0, 4 or 8. 0 means system default.
+
+<a name="torch.DiskFile.noBuffer"/></a>
+### noBuffer() ###
+
+Disables read and write buffering on the `DiskFile`.
diff --git a/doc/file.md b/doc/file.md
new file mode 100644
index 0000000..c4aa742
--- /dev/null
+++ b/doc/file.md
@@ -0,0 +1,364 @@
+<a name="torch.File.dok"></a>
+# File #
+
+This is an _abstract_ class. It defines most methods implemented by its
+child classes, like [DiskFile](diskfile.md),
+[MemoryFile](memoryfile.md) and [PipeFile](pipefile.md).
+
+Methods defined here are intended for basic read/write functionalities.
+Read/write methods might write in [ASCII](#torch.File.ascii) mode or
+[binary](#torch.File.binary) mode.
+
+In [ASCII](#torch.File.ascii) mode, numbers are converted in human readable
+format (characters). Booleans are converted into `0` (false) or `1` (true).
+In [binary](#torch.File.binary) mode, numbers and boolean are directly encoded
+as represented in a register of the computer. While not being human
+readable and less portable, the binary mode is obviously faster.
+
+In [ASCII](#torch.File.ascii) mode, if the default option
+[autoSpacing()](#torch.File.autoSpacing) is chosen, a space will be generated
+after each written number or boolean. A carriage return will also be added
+after each call to a write method. With this option, the spaces are
+supposed to exist while reading. This option can be deactivated with
+[noAutoSpacing()](#torch.File.noAutoSpacing).
+
+A `Lua` error might or might not be generated in case of read/write error
+or problem in the file. This depends on the choice made between
+[quiet()](#torch.File.quiet) and [pedantic()](#torch.File.pedantic) options. It
+is possible to query if an error occurred in the last operation by calling
+[hasError()](#torch.File.hasError).
+
+<a name="torch.File.read"></a>
+## Read methods ##
+<a name="torch.File.readByte"></a>
+<a name="torch.File.readBool"></a>
+<a name="torch.File.readShort"></a>
+<a name="torch.File.readChar"></a>
+<a name="torch.File.readLong"></a>
+<a name="torch.File.readInt"></a>
+<a name="torch.File.readDouble"></a>
+<a name="torch.File.readFloat"></a>
+
+There are three types of reading methods:
+
+  - `[number] readTYPE()`
+  - `[TYPEStorage] readTYPE(n)`
+  - `[number] readTYPE(TYPEStorage)`
+
+where `TYPE` can be either `Byte`, `Char`, `Short`, `Int`, `Long`, `Float` or `Double`.
+
+A convenience method also exists for boolean types: `[boolean] readBool()`. It reads
+a value on the file with `readInt()` and returns `true` if and only if this value is `1`. It is not possible
+to read storages of booleans.
+
+All these methods depend on the encoding choice: [ASCII](#torch.File.ascii)
+or [binary](#torch.File.binary) mode.  In [ASCII](#torch.File.ascii) mode, the
+option [autoSpacing()](#torch.File.autoSpacing) and
+[noAutoSpacing()](#torch.File.noAutoSpacing) have also an effect on these
+methods.
+
+If no parameter is given, one element is returned. This element is
+converted to a `Lua` number when reading.
+
+If `n` is given, `n` values of the specified type are read
+and returned in a new [Storage](storage.md) of that particular type.
+The storage size corresponds to the number of elements actually read.
+
+If a `Storage` is given, the method will attempt to read a number of elements
+equals to the size of the given storage, and fill up the storage with these elements.
+The number of elements actually read is returned.
+
+In case of read error, these methods will call the `Lua` error function using the default
+[pedantic](#torch.File.pedantic) option, or stay quiet with the [quiet](#torch.File.quiet)
+option. In the latter case, one can check if an error occurred with
+[hasError()](#torch.File.hasError).
+
+<a name="torch.File.write"></a>
+## Write methods ##
+<a name="torch.File.writeByte"></a>
+<a name="torch.File.writeBool"></a>
+<a name="torch.File.writeShort"></a>
+<a name="torch.File.writeChar"></a>
+<a name="torch.File.writeLong"></a>
+<a name="torch.File.writeInt"></a>
+<a name="torch.File.writeDouble"></a>
+<a name="torch.File.writeFloat"></a>
+
+There are two types of writing methods:
+
+  - `[number] writeTYPE(number)`
+  - `[number] writeTYPE(TYPEStorage)`
+
+where `TYPE` can be either `Byte`, `Char`, `Short`, `Int`, `Long`, `Float` or `Double`.
+
+A convenience method also exists for boolean types: `writeBool(value)`. If `value` is `nil` or
+not `true` it is equivalent to a `writeInt(0)` call, else to `writeInt(1)`. It is not possible
+to write storages of booleans.
+
+All these methods depend on the encoding choice: [ASCII](#torch.File.ascii)
+or [binary](#torch.File.binary) mode.  In [ASCII](#torch.File.ascii) mode, the
+option [autoSpacing()](#torch.File.autoSpacing) and
+[noAutoSpacing()](#torch.File.noAutoSpacing) have also an effect on these
+methods.
+
+If one `Lua` number is given, this number is converted according to the
+name of the method when writing (e.g. `writeInt(3.14)` will write `3`).
+
+If a `Storage` is given, the method will attempt to write all the elements contained
+in the storage.
+
+These methods return the number of elements actually written.
+
+In case of write error, these methods will call the `Lua` error function using the default
+[pedantic](#torch.File.pedantic) option, or stay quiet with the [quiet](#torch.File.quiet)
+option. In the latter case, one can check if an error occurred with
+[hasError()](#torch.File.hasError).
+
+<a name="torch.File.serialization"></a>
+## Serialization methods ##
+
+These methods allow the user to save any serializable objects on disk and
+reload it later in its original state. In other words, it can perform a
+_deep_ copy of an object into a given `File`.
+
+Serializable objects are `Torch` objects having a `read()` and
+`write()` method. `Lua` objects such as `table`, `number` or
+`string` or _pure Lua_ functions are also serializable.
+
+If the object to save contains several other objects (let say it is a tree
+of objects), then objects appearing several times in this tree will be
+_saved only once_. This saves disk space, speeds up loading/saving and
+respects the dependencies between objects.
+
+Interestingly, if the `File` is a [MemoryFile](memoryfile.md), it allows
+the user to easily make a _clone_ of any serializable object:
+```lua
+file = torch.MemoryFile() -- creates a file in memory
+file:writeObject(object) -- writes the object into file
+file:seek(1) -- comes back at the beginning of the file
+objectClone = file:readObject() -- gets a clone of object
+```
+
+<a name="torch.File.readObject"></a>
+### readObject() ###
+
+Returns the next [serializable](#torch.File.serialization) object saved beforehand
+in the file with [writeObject()](#torch.File.writeObject).
+
+Note that objects which were [written](#torch.File.writeObject) with the same
+reference have still the same reference after loading.
+
+Example:
+```lua
+-- creates an array which contains twice the same tensor
+array = {}
+x = torch.Tensor(1)
+table.insert(array, x)
+table.insert(array, x)
+
+-- array[1] and array[2] refer to the same address
+-- x[1] == array[1][1] == array[2][1] == 3.14
+array[1][1] = 3.14
+
+-- write the array on disk
+file = torch.DiskFile('foo.asc', 'w')
+file:writeObject(array)
+file:close() -- make sure the data is written
+
+-- reload the array
+file = torch.DiskFile('foo.asc', 'r')
+arrayNew = file:readObject()
+
+-- arrayNew[1] and arrayNew[2] refer to the same address!
+-- arrayNew[1][1] == arrayNew[2][1] == 3.14
+-- so if we do now:
+arrayNew[1][1] = 2.72
+-- arrayNew[1][1] == arrayNew[2][1] == 2.72 !
+```
+
+<a name="torch.File.writeObject"></a>
+### writeObject(object) ###
+
+Writes `object` into the file. This object can be read later using
+[readObject()](#torch.File.readObject). Serializable objects are `Torch`
+objects having a `read()` and `write()` method. `Lua` objects such as
+`table`, `number` or `string` or pure Lua functions are also serializable.
+
+If the object has been already written in the file, only a _reference_ to
+this already saved object will be written: this saves space and speeds up
+writing; it also allows to keep the dependencies between objects intact.
+
+In return, if one writes an object, modifies its members, and writes the
+object again in the same file, the modifications will not be recorded
+in the file, as only a reference to the original will be written. See
+[readObject()](#torch.File.readObject) for an example.
+
+<a name="torch.File.readString"></a>
+### [string] readString(format) ###
+
+If `format` starts with `"*l"` then returns the next line in the `File`. The end-of-line character is skipped.
+
+If `format` starts with `"*a"` then returns all the remaining contents of the `File`.
+
+If no data is available, then an error is raised, except if `File` is in [quiet()](#torch.File.quiet) mode where
+it then returns an empty string `''` and after that you'll be able to see that last reading failed due to end of file with your_file:[hasError()](#torch.File.hasError).
+
+Because Torch is more precise on number typing, the `Lua` format `"*n"` is not supported:
+instead use one of the [number read methods](#torch.File.read).
+
+<a name="torch.File.writeString"></a>
+### [number] writeString(str) ###
+
+Writes the string `str` in the `File`. If the string cannot be written completely an error is raised, except
+if `File` is in [quiet()](#torch.File.quiet) mode where it returns the number of characters actually written.
+
+## General Access and Control Methods ##
+
+<a name="torch.File.ascii"></a>
+### ascii() [default] ###
+
+The data read or written will be in `ASCII` mode: all numbers are converted
+to characters (human readable format) and boolean are converted to `0`
+(false) or `1` (true). The input-output format in this mode depends on the
+options [autoSpacing()](#torch.File.autoSpacing) and
+[noAutoSpacing()](#torch.File.noAutoSpacing).
+
+<a name="torch.File.autoSpacing"></a>
+### autoSpacing() [default] ###
+
+In [ASCII](#torch.File.ascii) mode, write additional spaces around the elements
+written on disk: if writing a [Storage](storage.md), a space will be
+generated between each _element_ and a _return line_ after the last
+element. If only writing one element, a _return line_ will be generated
+after this element.
+
+Those spaces are supposed to exist while reading in this mode.
+
+This is the default behavior. You can de-activate this option with the
+[noAutoSpacing()](#torch.File.noAutoSpacing) method.
+
+<a name="torch.File.binary"></a>
+### binary() ###
+
+The data read or written will be in binary mode: the representation in the
+`File` is the same that the one in the computer memory/register (not human
+readable).  This mode is faster than [ASCII](#torch.File.ascii) but less
+portable.
+
+<a name="torch.File.clearError"></a>
+### clearError() ###
+
+Clears the error flag returned by [hasError()](#torch.File.hasError).
+
+<a name="torch.File.close"></a>
+### close() ###
+
+Close the file. Any subsequent operation will generate a `Lua` error.
+
+<a name="torch.File.noAutoSpacing"></a>
+### noAutoSpacing() ###
+
+In [ASCII](#torch.File.ascii) mode, do not put extra spaces between element
+written on disk. This is the contrary of the option
+[autoSpacing()](#torch.File.autoSpacing).
+
+<a name="torch.File.synchronize"></a>
+### synchronize() ###
+
+If the child class buffers the data while writing, this ensures that the data
+is actually written.
+
+
+<a name="torch.File.pedantic"></a>
+### pedantic() [default] ###
+
+If this mode is chosen (which is the default), a `Lua` error will be
+generated in case of error (which will cause the program to stop).
+
+It is possible to use [quiet()](#torch.File.quiet) to avoid `Lua` error generation
+and set a flag instead.
+
+<a name="torch.File.position"></a>
+### [number] position() ###
+
+Returns the current position (in bytes) in the file.
+The first position is `1` (following Lua standard indexing).
+
+<a name="torch.File.quiet"></a>
+### quiet() ###
+
+If this mode is chosen instead of [pedantic()](#torch.File.pedantic), no `Lua`
+error will be generated in case of read/write error. Instead, a flag will
+be raised, readable through [hasError()](#torch.File.hasError). This flag can
+be cleared with [clearError()](#torch.File.clearError)
+
+Checking if a file is quiet can be performed using [isQuiet()](#torch.File.isQuiet).
+
+<a name="torch.File.seek"></a>
+### seek(position) ###
+
+Jump into the file at the given `position` (in byte). Might generate/raise
+an error in case of problem. The first position is `1` (following Lua standard indexing).
+
+<a name="torch.File.seekEnd"></a>
+### seekEnd() ###
+
+Jump at the end of the file. Might generate/raise an error in case of
+problem.
+
+## File state query ##
+
+These methods allow the user to query the state of the given `File`.
+
+<a name="torch.File.hasError"></a>
+### [boolean] hasError() ###
+
+Returns if an error occurred since the last [clearError()](#torch.File.clearError) call, or since
+the opening of the file if `clearError()` has never been called.
+
+<a name="torch.File.isQuiet"></a>
+### [boolean] isQuiet() ###
+
+Returns a boolean which tells if the file is in [quiet](#torch.File.quiet) mode or not.
+
+<a name="torch.File.isReadable"></a>
+### [boolean] isReadable() ###
+
+Tells if one can read the file or not.
+
+<a name="torch.File.isWritable"></a>
+### [boolean] isWritable() ###
+
+Tells if one can write in the file or not.
+
+<a name="torch.File.isAutoSpacing"></a>
+### [boolean] isAutoSpacing() ###
+
+Return `true` if [autoSpacing](#torch.File.autoSpacing) has been chosen.
+
+<a name="torch.File.referenced"></a>
+### referenced(ref) ###
+
+Sets the referenced property of the File to `ref`. `ref` has to be `true`
+or `false`.
+
+By default `ref` is true, which means that a File object keeps track of
+objects written (using [writeObject](#torch.File.writeObject) method) or
+read (using [readObject](#torch.File.readObject) method). Objects with the
+same address will be written or read only once, meaning that this approach
+preserves shared memory structures.
+
+Keeping track of references has a cost: every object which is serialized in
+the file is kept alive (even if one discards the object after
+writing/reading) as File needs to track their pointer. This is not always a
+desirable behavior, especially when dealing with large data structures.
+
+Another typical example where one does not want reference tracking is when
+one needs to push the same tensor repeatedly into a file but every time
+changing its contents: calling `referenced(false)` ensures desired
+behaviour.
+
+<a name="torch.File.isReferenced"></a>
+### isReferenced() ###
+
+Return the state set by [referenced](#torch.File.referenced).
diff --git a/doc/gather.png b/doc/gather.png
new file mode 100644
index 0000000..22aa756
Binary files /dev/null and b/doc/gather.png differ
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 0000000..a0ce7d2
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,32 @@
+<a name="torch.reference.dok"></a>
+# Torch Package Reference Manual #
+
+[![Join the chat at https://gitter.im/torch/torch7](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/torch/torch7?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Build Status](https://travis-ci.org/torch/torch7.svg)](https://travis-ci.org/torch/torch7)
+
+__Torch__ is the main package in [Torch7](http://torch.ch) where data
+structures for multi-dimensional tensors and mathematical operations
+over these are defined. Additionally, it provides many utilities for
+accessing files, serializing objects of arbitrary types and other
+useful utilities.
+
+<a name="torch.reference.dok"></a>
+## Torch Packages ##
+
+  * Tensor Library
+    * [Tensor](tensor.md) defines the _all powerful_ tensor object that provides multi-dimensional numerical arrays with type templating.
+    * [Mathematical operations](maths.md) that are defined for the tensor object types.
+    * [Storage](storage.md) defines a simple storage interface that controls the underlying storage for any tensor object.
+  * File I/O Interface Library
+    * [File](file.md) is an abstract interface for common file operations.
+    * [Disk File](diskfile.md) defines operations on files stored on disk.
+    * [Memory File](memoryfile.md) defines operations on files stored in RAM.
+    * [Pipe File](pipefile.md) defines operations for using piped commands.
+    * [High-Level File operations](serialization.md) defines higher-level serialization functions.
+  * Useful Utilities
+    * [Timer](timer.md) provides functionality for _measuring time_.
+    * [Tester](tester.md) is a generic tester framework.
+    * [CmdLine](cmdline.md) is a command line argument parsing utility.
+    * [Random](random.md) defines a random number generator package with various distributions.
+    * Finally useful [utility](utility.md) functions are provided for easy handling of torch tensor types and class inheritance.
+
diff --git a/doc/maths.md b/doc/maths.md
new file mode 100755
index 0000000..5916409
--- /dev/null
+++ b/doc/maths.md
@@ -0,0 +1,2851 @@
+<a name="torch.maths.dok"></a>
+# Math Functions #
+
+Torch provides MATLAB-like functions for manipulating [`Tensor`](tensor.md) objects.  Functions fall into several types of categories:
+
+  * [Constructors](#torch.construction.dok) like [`zeros`](#torch.zeros), [`ones`](#torch.ones);
+  * Extractors like [`diag`](#torch.diag)  and [`triu`](#torch.triu);
+  * [Element-wise](#torch.elementwise.dok) mathematical operations like [`abs`](#torch.abs) and [`pow`](#torch.pow);
+  * [BLAS](#torch.basicoperations.dok) operations;
+  * [Column or row-wise operations](#torch.columnwise.dok) like [`sum`](#torch.sum) and [`max`](#torch.max);
+  * [Matrix-wide operations](#torch.matrixwide.dok) like [`trace`](#torch.trace) and [`norm`](#torch.norm);
+  * [Convolution and cross-correlation](#torch.conv.dok) operations like [`conv2`](#torch.conv2);
+  * [Basic linear algebra operations](#torch.linalg.dok) like [`eig`](#torch.eig);
+  * [Logical operations](#torch.logical.dok) on `Tensor`s.
+
+By default, all operations allocate a new `Tensor` to return the result.
+However, all functions also support passing the target `Tensor`(s) as the first argument(s), in which case the target `Tensor`(s) will be resized accordingly and filled with result.
+This property is especially useful when one wants to have tight control over when memory is allocated.
+
+The *Torch* package adopts the same concept, so that calling a function directly on the `Tensor` itself using an object-oriented syntax is equivalent to passing the `Tensor` as the optional resulting `Tensor`.
+The following two calls are equivalent.
+
+```lua
+torch.log(x, x)
+x:log()
+```
+
+Similarly, `torch.conv2` function can be used in the following manner.
+
+```lua
+> x = torch.rand(100, 100)
+> k = torch.rand(10, 10)
+> res1 = torch.conv2(x, k)   -- case 1
+
+> res2 = torch.Tensor()
+> torch.conv2(res2, x, k)     -- case 2
+
+> res2:dist(res1)
+0
+```
+
+The advantage of second case is, same `res2` `Tensor` can be used successively in a loop without any new allocation.
+
+```lua
+-- no new memory allocations...
+> for i = 1, 100 do
+     torch.conv2(res2, x, k)
+  end
+
+> res2:dist(res1)
+0
+```
+
+<a name="torch.construction.dok"></a>
+## Construction or extraction functions ##
+
+<a name="torch.cat"></a>
+### [res] torch.cat( [res,] x_1, x_2, [dimension] ) ###
+### [res] torch.cat( [res,] {x_1, x_2, ...}, [dimension] ) ###
+<a name="torch.cat"></a>
+`x = torch.cat(x_1, x_2, [dimension])` returns a `Tensor` `x` which is the concatenation of `Tensor`s `x_1` and `x_2` along dimension `dimension`.
+
+If `dimension` is not specified it is the last dimension.
+
+The other dimensions of `x_1` and `x_2` have to be equal.
+
+Also supports arrays with arbitrary numbers of `Tensor`s as inputs.
+
+Examples:
+```lua
+> torch.cat(torch.ones(3), torch.zeros(2))
+ 1
+ 1
+ 1
+ 0
+ 0
+[torch.DoubleTensor of size 5]
+
+> torch.cat(torch.ones(3, 2), torch.zeros(2, 2), 1)
+ 1  1
+ 1  1
+ 1  1
+ 0  0
+ 0  0
+[torch.DoubleTensor of size 5x2]
+
+> torch.cat(torch.ones(2, 2), torch.zeros(2, 2), 1)
+ 1  1
+ 1  1
+ 0  0
+ 0  0
+[torch.DoubleTensor of size 4x2]
+
+> torch.cat(torch.ones(2, 2), torch.zeros(2, 2), 2)
+ 1  1  0  0
+ 1  1  0  0
+[torch.DoubleTensor of size 2x4]
+
+> torch.cat(torch.cat(torch.ones(2, 2), torch.zeros(2, 2), 1), torch.rand(3, 2), 1)
+ 1.0000  1.0000
+ 1.0000  1.0000
+ 0.0000  0.0000
+ 0.0000  0.0000
+ 0.3227  0.0493
+ 0.9161  0.1086
+ 0.2206  0.7449
+[torch.DoubleTensor of size 7x2]
+
+> torch.cat({torch.ones(2, 2), torch.zeros(2, 2), torch.rand(3, 2)}, 1)
+ 1.0000  1.0000
+ 1.0000  1.0000
+ 0.0000  0.0000
+ 0.0000  0.0000
+ 0.3227  0.0493
+ 0.9161  0.1086
+ 0.2206  0.7449
+[torch.DoubleTensor of size 7x2]
+
+```
+
+
+<a name="torch.diag"></a>
+### [res] torch.diag([res,] x [,k]) ###
+<a name="torch.diag"></a>
+
+`y = torch.diag(x)` when `x` is of dimension 1 returns a diagonal matrix with diagonal elements constructed from `x`.
+
+`y = torch.diag(x)` when `x` is of dimension 2 returns a `Tensor` of dimension 1 with elements constructed from the diagonal of `x`.
+
+`y = torch.diag(x, k)` returns the k-th diagonal of `x`, where `k = 0` is the main diagonal, `k > 0` is above the main diagonal and `k < 0` is below the main diagonal.
+
+<a name="torch.eye"></a>
+### [res] torch.eye([res,] n [,m]) ###
+<a name="torch.eye"></a>
+
+`y = torch.eye(n)` returns the `n × n` identity matrix.
+
+`y = torch.eye(n, m)` returns an `n × m` identity matrix with ones on the diagonal and zeros elsewhere.
+
+
+<a name="torch.histc"></a>
+### [res] torch.histc([res,] x [,nbins, min_value, max_value]) ###
+<a name="torch.histc"></a>
+
+`y = torch.histc(x)` returns the histogram of the elements in `x`.
+By default the elements are sorted into 100 equally spaced bins between the minimum and maximum values of `x`.
+
+`y = torch.histc(x, n)` same as above with `n` bins.
+
+`y = torch.histc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range.
+
+
+<a name="torch.linspace"></a>
+### [res] torch.linspace([res,] x1, x2, [,n]) ###
+<a name="torch.linspace"></a>
+
+`y = torch.linspace(x1, x2)` returns a one-dimensional `Tensor` of size 100 equally spaced points between `x1` and `x2`.
+
+`y = torch.linspace(x1, x2, n)` returns a one-dimensional `Tensor` of `n` equally spaced points between `x1` and `x2`.
+
+
+<a name="torch.logspace"></a>
+### [res] torch.logspace([res,] x1, x2, [,n]) ###
+<a name="torch.logspace"></a>
+
+`y = torch.logspace(x1, x2)` returns a one-dimensional `Tensor` of `100` logarithmically equally spaced points between `10^x1` and `10^x2`.
+
+`y = torch.logspace(x1, x2, n)` returns a one-dimensional `Tensor` of `n` logarithmically equally spaced points between `10^x1` and `10^x2`.
+
+<a name="torch.multinomial"></a>
+### [res] torch.multinomial([res,], p, n, [,replacement]) ###
+<a name="torch.multinomial"></a>
+
+`y = torch.multinomial(p, n)` returns a `Tensor` `y` where each row contains `n` indices sampled from the [multinomial probability distribution](http://en.wikipedia.org/wiki/Multinomial_distribution) located in the corresponding row of `Tensor` `p`.
+
+The rows of `p` do not need to sum to one (in which case we use the values as weights), but must be non-negative and have a non-zero sum.
+Indices are ordered from left to right according to when each was sampled (first samples are placed in first column).
+
+If `p` is a vector, `y` is a vector size `n`.
+
+If `p` is an `m`-row matrix, `y` is an `m × n` matrix.
+
+If `replacement` is `true`, samples are drawn **with replacement**.
+If not, they are drawn **without replacement**, which means that when a sample index is drawn for a row, it cannot be drawn again for that row.
+This implies the constraint that `n` must be lower than `p` length (or number of columns of `p` if it is a matrix).
+
+The default value for `replacement` is `false`.
+
+
+```lua
+p = torch.Tensor{1, 1, 0.5, 0}
+a = torch.multinomial(p, 10000, true)
+
+> a
+...
+[torch.LongTensor of dimension 10000]
+
+> for i = 1, 4 do print(a:eq(i):sum()) end
+3967
+4016
+2017
+0
+```
+
+Note: If you use the function with a given result `Tensor`, i.e. of the function prototype: `torch.multinomial(res, p, n [, replacement])` then you will have to call it slightly differently as:
+
+```lua
+p.multinomial(res, p, n, replacement) -- p.multinomial instead of torch.multinomial
+```
+
+This is due to the fact that the result here is of a `LongTensor` type, and we do not define a `torch.multinomial` over long `Tensor`s.
+
+<a name="torch.ones"></a>
+### [res] torch.ones([res,] m [,n...]) ###
+<a name="torch.ones"></a>
+
+`y = torch.ones(n)` returns a one-dimensional `Tensor` of size `n` filled with ones.
+
+`y = torch.ones(m, n)` returns a `m × n` `Tensor` filled with ones.
+
+For more than `4` dimensions, you can use a storage as argument: `y = torch.ones(torch.LongStorage{m, n, k, l, o})`.
+
+
+<a name="torch.rand"></a>
+### [res] torch.rand([res,] [gen,] m [,n...]) ###
+<a name="torch.rand"></a>
+
+`y = torch.rand(n)` returns a one-dimensional `Tensor` of size `n` filled with random numbers from a uniform distribution on the interval `[0, 1)`.
+
+`y = torch.rand(m, n)` returns a `m × n` `Tensor` of random numbers from a uniform distribution on the interval `[0, 1)`.
+
+For more than 4 dimensions, you can use a storage as argument: `y = torch.rand(torch.LongStorage{m, n, k, l, o})`.
+
+`y = torch.rand(gen, m, n)` returns a `m × n` `Tensor` of random numbers from a uniform distribution on the interval `[0, 1)`, using a non-global random number generator `gen` created by [torch.Generator()](random.md#torch.Generator).
+
+<a name="torch.randn"></a>
+### [res] torch.randn([res,] [gen,] m [,n...]) ###
+<a name="torch.randn"></a>
+
+`y = torch.randn(n)` returns a one-dimensional `Tensor` of size `n` filled with random numbers from a normal distribution with mean zero and variance one.
+
+`y = torch.randn(m, n)` returns a `m × n` `Tensor` of random numbers from a normal distribution with mean zero and variance one.
+
+For more than 4 dimensions, you can use a storage as argument: `y = torch.randn(torch.LongStorage{m, n, k, l, o})`.
+
+`y = torch.randn(gen, m, n)` returns a `m × n` `Tensor` of random numbers from a normal distribution with mean zero and variance one, using a non-global random number generator `gen` created by [torch.Generator()](random.md#torch.Generator).
+
+<a name="torch.range"></a>
+### [res] torch.range([res,] x, y [,step]) ###
+<a name="torch.range"></a>
+
+`y = torch.range(x, y)` returns a `Tensor` of size `floor((y - x) / step) + 1` with values from `x` to `y` with step `step` (default to 1).
+
+```lua
+> torch.range(2, 5)
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of size 4]
+
+> torch.range(2, 5, 1.2)
+ 2.0000
+ 3.2000
+ 4.4000
+[torch.DoubleTensor of size 3]
+```
+
+
+<a name="torch.randperm"></a>
+### [res] torch.randperm([res,] [gen,] n) ###
+<a name="torch.randperm"></a>
+
+`y = torch.randperm(n)` returns a random permutation of integers from 1 to `n`.
+
+`y = torch.randperm(gen, n)` returns a random permutation of integers from 1 to `n`, using a non-global random number generator `gen` created by [torch.Generator()](random.md#torch.Generator).
+
+<a name="torch.reshape"></a>
+### [res] torch.reshape([res,] x, m [,n...]) ###
+<a name="torch.reshape"></a>
+
+`y = torch.reshape(x, m, n)` returns a new `m × n` `Tensor` y whose elements are taken rowwise from `x`, which must have `m * n` elements. The elements are copied into the new `Tensor`.
+
+For more than 4 dimensions, you can use a storage: `y = torch.reshape(x, torch.LongStorage{m, n, k, l, o})`.
+
+
+<a name="torch.tril"></a>
+### [res] torch.tril([res,] x [,k]) ###
+<a name="torch.tril"></a>
+
+`y = torch.tril(x)` returns the lower triangular part of `x`, the other elements of `y` are set to 0.
+
+`torch.tril(x, k)` returns the elements on and below the k-th diagonal of `x` as non-zero.
+`k = 0` is the main diagonal, `k > 0` is above the main diagonal and `k < 0` is below the main diagonal.
+
+
+<a name="torch.triu"></a>
+### [res] torch.triu([res,] x, [,k]) ###
+<a name="torch.triu"></a>
+
+`y = torch.triu(x)` returns the upper triangular part of `x`, the other elements of `y` are set to 0.
+
+`torch.triu(x, k)` returns the elements on and above the k-th diagonal of `x` as non-zero.
+`k = 0` is the main diagonal, `k > 0` is above the main diagonal and `k < 0` is below the main diagonal.
+
+
+<a name="torch.zeros"></a>
+### [res] torch.zeros([res,] m [,n...]) ###
+<a name="torch.zeros"></a>
+
+`y = torch.zeros(n)` returns a one-dimensional `Tensor` of size n filled with zeros.
+
+`y = torch.zeros(m, n)` returns a `m × n` `Tensor` filled with zeros.
+
+For more than 4 dimensions, you can use a storage: `y = torch.zeros(torch.LongStorage{m, n, k, l, o})`.
+
+
+<a name="torch.elementwise.dok"></a>
+## Element-wise Mathematical Operations ##
+
+<a name="torch.abs"></a>
+### [res] torch.abs([res,] x) ###
+<a name="torch.abs"></a>
+
+`y = torch.abs(x)` returns a new `Tensor` with the absolute values of the elements of `x`.
+
+`x:abs()` replaces all elements in-place with the absolute values of the elements of `x`.
+
+
+<a name="torch.sign"></a>
+### [res] torch.sign([res,] x) ###
+<a name="torch.sign"></a>
+
+`y = torch.sign(x)` returns a new `Tensor` with the sign (`+/- 1`) of the elements of `x`.
+
+`x:sign()` replaces all elements in-place with the sign of the elements of `x`.
+
+
+<a name="torch.acos"></a>
+### [res] torch.acos([res,] x) ###
+<a name="torch.acos"></a>
+
+`y = torch.acos(x)` returns a new `Tensor` with the arccosine of the elements of `x`.
+
+`x:acos()` replaces all elements in-place with the arccosine of the elements of `x`.
+
+
+<a name="torch.asin"></a>
+### [res] torch.asin([res,] x) ###
+<a name="torch.asin"></a>
+
+`y = torch.asin(x)` returns a new `Tensor` with the arcsine  of the elements of `x`.
+
+`x:asin()` replaces all elements in-place with the arcsine  of the elements of `x`.
+
+
+<a name="torch.atan"></a>
+### [res] torch.atan([res,] x) ###
+<a name="torch.atan"></a>
+
+`y = torch.atan(x)` returns a new `Tensor` with the arctangent of the elements of `x`.
+
+`x:atan()` replaces all elements in-place with the arctangent of the elements of `x`.
+
+
+<a name="torch.ceil"></a>
+### [res] torch.ceil([res,] x) ###
+<a name="torch.ceil"></a>
+
+`y = torch.ceil(x)` returns a new `Tensor` with the values of the elements of `x` rounded up to the nearest integers.
+
+`x:ceil()` replaces all elements in-place with the values of the elements of `x` rounded up to the nearest integers.
+
+
+<a name="torch.cos"></a>
+### [res] torch.cos([res,] x) ###
+<a name="torch.cos"></a>
+
+`y = torch.cos(x)` returns a new `Tensor` with the cosine of the elements of `x`.
+
+`x:cos()` replaces all elements in-place with the cosine of the elements of `x`.
+
+
+<a name="torch.cosh"></a>
+### [res] torch.cosh([res,] x) ###
+<a name="torch.cosh"></a>
+
+`y = torch.cosh(x)` returns a new `Tensor` with the hyperbolic cosine of the elements of `x`.
+
+`x:cosh()` replaces all elements in-place with the hyperbolic cosine of the elements of `x`.
+
+
+<a name="torch.exp"></a>
+### [res] torch.exp([res,] x) ###
+<a name="torch.exp"></a>
+
+`y = torch.exp(x)` returns, for each element in `x`,  *e* (*Neper number*, the base of natural logarithms) raised to the power of the element in `x`.
+
+`x:exp()` returns, for each element in `x`,  *e* raised to the power of the element in `x`.
+
+
+<a name="torch.floor"></a>
+### [res] torch.floor([res,] x) ###
+<a name="torch.floor"></a>
+
+`y = torch.floor(x)` returns a new `Tensor` with the values of the elements of `x` rounded down to the nearest integers.
+
+`x:floor()` replaces all elements in-place with the values of the elements of `x` rounded down to the nearest integers.
+
+
+<a name="torch.log"></a>
+### [res] torch.log([res,] x) ###
+<a name="torch.log"></a>
+
+`y = torch.log(x)` returns a new `Tensor` with the natural logarithm of the elements of `x`.
+
+`x:log()` replaces all elements in-place with the natural logarithm of the elements of `x`.
+
+
+<a name="torch.log1p"></a>
+### [res] torch.log1p([res,] x) ###
+<a name="torch.log1p"></a>
+
+`y = torch.log1p(x)` returns a new `Tensor` with the natural logarithm of the elements of `x + 1`.
+
+`x:log1p()` replaces all elements in-place with the natural logarithm of the elements of `x + 1`.
+This function is more accurate than [`log`](#torch.log) for small values of `x`.
+
+
+<a name="x:neg"></a>
+### x:neg() ###
+<a name="x:neg"></a>
+
+`x:neg()` replaces all elements in-place with the sign-reversed values of the elements of `x`.
+
+
+<a name="x:cinv"></a>
+### x:cinv() ###
+<a name="x:cinv"></a>
+
+`x:cinv()` replaces all elements in-place with `1.0 / x`.
+
+
+<a name="torch.pow"></a>
+### [res] torch.pow([res,] x, n) ###
+<a name="torch.pow"></a>
+
+Let `x` be a `Tensor` and `n` a number.
+
+`y = torch.pow(x, n)` returns a new `Tensor` with the elements of `x` to the power of `n`.
+
+`y = torch.pow(n, x)` returns a new `Tensor` with `n` to the power of the elements of `x`.
+
+`x:pow(n)` replaces all elements in-place with the elements of `x` to the power of `n`.
+
+`torch.pow(x, n, x)` replaces all elements in-place with `n` to the power of the elements of `x`.
+
+<a name="torch.round"></a>
+### [res] torch.round([res,] x) ###
+<a name="torch.round"></a>
+
+`y = torch.round(x)` returns a new `Tensor` with the values of the elements of `x` rounded to the nearest integers.
+
+`x:round()` replaces all elements in-place with the values of the elements of `x` rounded to the nearest integers.
+
+
+<a name="torch.sin"></a>
+### [res] torch.sin([res,] x) ###
+<a name="torch.sin"></a>
+
+`y = torch.sin(x)` returns a new `Tensor` with the sine of the elements of `x`.
+
+`x:sin()` replaces all elements in-place with the sine of the elements of `x`.
+
+
+<a name="torch.sinh"></a>
+### [res] torch.sinh([res,] x) ###
+<a name="torch.sinh"></a>
+
+`y = torch.sinh(x)` returns a new `Tensor` with the hyperbolic sine of the elements of `x`.
+
+`x:sinh()` replaces all elements in-place with the hyperbolic sine of the elements of `x`.
+
+
+<a name="torch.sqrt"></a>
+### [res] torch.sqrt([res,] x) ###
+<a name="torch.sqrt"></a>
+
+`y = torch.sqrt(x)` returns a new `Tensor` with the square root of the elements of `x`.
+
+`x:sqrt()` replaces all elements in-place with the square root of the elements of `x`.
+
+
+<a name="torch.rsqrt"></a>
+### [res] torch.rsqrt([res,] x) ###
+<a name="torch.rsqrt"></a>
+
+`y = torch.rsqrt(x)` returns a new `Tensor` with the reciprocal of the square root of the elements of `x`.
+
+`x:rsqrt()` replaces all elements in-place with the reciprocal of the square root of the elements of `x`.
+
+
+<a name="torch.tan"></a>
+### [res] torch.tan([res,] x) ###
+<a name="torch.tan"></a>
+
+`y = torch.tan(x)` returns a new `Tensor` with the tangent of the elements of `x`.
+
+`x:tan()` replaces all elements in-place with the tangent of the elements of `x`.
+
+
+<a name="torch.tanh"></a>
+### [res] torch.tanh([res,] x) ###
+<a name="torch.tanh"></a>
+
+`y = torch.tanh(x)` returns a new `Tensor` with the hyperbolic tangent of the elements of `x`.
+
+`x:tanh()` replaces all elements in-place with the hyperbolic tangent of the elements of `x`.
+
+
+<a name="torch.sigmoid"></a>
+### [res] torch.sigmoid([res,] x) ###
+<a name="torch.sigmoid"></a>
+
+`y = torch.sigmoid(x)` returns a new `Tensor` with the sigmoid of the elements of `x`.
+
+`x:sigmoid()` replaces all elements in-place with the sigmoid of the elements of `x`.
+
+
+<a name="torch.trunc"></a>
+### [res] torch.trunc([res,] x) ###
+<a name="torch.trunc"></a>
+
+`y = torch.trunc(x)` returns a new `Tensor` with the truncated integer values of the elements of `x`.
+
+`x:trunc()` replaces all elements in-place with the truncated integer values of the elements of `x`.
+
+
+<a name="torch.frac"></a>
+### [res] torch.frac([res,] x) ###
+<a name="torch.frac"></a>
+
+`y = torch.frac(x)` returns a new `Tensor` with the fractional portion of the elements of `x`.
+
+`x:frac()` replaces all elements in-place with the fractional portion of the elements of `x`.
+
+
+<a name="torch.basicoperations.dok"></a>
+## Basic operations ##
+
+In this section, we explain basic mathematical operations for `Tensor`s.
+
+<a name="torch.equal"></a>
+### [boolean] equal([tensor1,] tensor2) ###
+<a name="torch.equal"></a>
+
+Returns `true` iff the dimensions and values of `tensor1` and `tensor2` are exactly the same.
+
+```lua
+x = torch.Tensor{1,2,3}
+y = torch.Tensor{1,2,3}
+> x:equal(y)
+true
+
+y = torch.Tensor{1,2,4}
+> x:equal(y)
+false
+```
+
+Note that `a:equal(b)` is more efficient than `a:eq(b):all()` as it avoids allocation of a temporary tensor and can short-circuit.
+
+<a name="torch.add"></a>
+### [res] torch.add([res,] tensor, value) ###
+<a name="torch.add"></a>
+
+Add the given value to all elements in the `Tensor`.
+
+`y = torch.add(x, value)` returns a new `Tensor`.
+
+`x:add(value)` add `value` to all elements in place.
+
+
+<a name="torch.add"></a>
+### [res] torch.add([res,] tensor1, tensor2) ###
+<a name="torch.add"></a>
+
+Add `tensor1` to `tensor2` and put result into `res`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> x:add(y)
+> x
+ 5  5
+ 5  5
+[torch.DoubleTensor of size 2x2]
+```
+
+`y = torch.add(a, b)` returns a new `Tensor`.
+
+`torch.add(y, a, b)` puts `a + b` in `y`.
+
+`a:add(b)` accumulates all elements of `b` into `a`.
+
+`y:add(a, b)` puts `a + b` in `y`.
+
+
+<a name="torch.add"></a>
+### [res] torch.add([res,] tensor1, value, tensor2) ###
+<a name="torch.add"></a>
+
+Multiply elements of `tensor2` by the scalar `value` and add it to `tensor1`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> x:add(2, y)
+> x
+ 8  8
+ 8  8
+[torch.DoubleTensor of size 2x2]
+```
+
+`x:add(value, y)` multiply-accumulates values of `y` into `x`.
+
+`z:add(x, value, y)` puts the result of `x + value * y` in `z`.
+
+`torch.add(x, value, y)` returns a new `Tensor` `x + value * y`.
+
+`torch.add(z, x, value, y)` puts the result of `x + value * y` in `z`.
+
+
+<a name="x:csub"></a>
+### tensor:csub(value) ###
+<a name="x:csub"></a>
+
+Subtracts the given value from all elements in the `Tensor`, in place.
+
+
+<a name="x:csub"></a>
+### tensor1:csub(tensor2) ###
+<a name="x:csub"></a>
+
+Subtracts `tensor2` from `tensor1`, in place.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(8)
+> y = torch.Tensor(4):fill(3)
+> x:csub(y)
+> x
+ 5  5
+ 5  5
+[torch.DoubleTensor of size 2x2]
+```
+
+`a:csub(b)` put `a - b` into `a`.
+
+
+<a name="torch.mul"></a>
+### [res] torch.mul([res,] tensor1, value) ###
+<a name="torch.mul"></a>
+
+Multiply all elements in the `Tensor` by the given `value`.
+
+`z = torch.mul(x, 2)` will return a new `Tensor` with the result of `x * 2`.
+
+`torch.mul(z, x, 2)` will put the result of `x * 2` in `z`.
+
+`x:mul(2)` will multiply all elements of `x` with `2` in-place.
+
+`z:mul(x, 2)` will put the result of `x * 2` in `z`.
+
+
+<a name="torch.clamp"></a>
+### [res] torch.clamp([res,] tensor, min_value, max_value) ###
+<a name="torch.clamp"></a>
+
+Clamp all elements in the `Tensor` into the range `[min_value, max_value]`.  ie:
+
+```
+      ⎧ min_value, if x_i < min_value
+y_i = ⎨ x_i,       if min_value ≤ x_i ≤ max_value
+      ⎩ max_value, if x_i > max_value
+```
+
+`z = torch.clamp(x, 0, 1)` will return a new `Tensor` with the result of `x` bounded between `0` and `1`.
+
+`torch.clamp(z, x, 0, 1)` will put the result in `z`.
+
+`x:clamp(0, 1)` will perform the clamp operation in place (putting the result in `x`).
+
+`z:clamp(x, 0, 1)` will put the result in `z`.
+
+
+<a name="torch.cmul"></a>
+### [res] torch.cmul([res,] tensor1, tensor2) ###
+<a name="torch.cmul"></a>
+
+Element-wise multiplication of `tensor1` by `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> x:cmul(y)
+> = x
+ 6  6
+ 6  6
+[torch.DoubleTensor of size 2x2]
+```
+
+`z = torch.cmul(x, y)` returns a new `Tensor`.
+
+`torch.cmul(z, x, y)` puts the result in `z`.
+
+`y:cmul(x)` multiplies all elements of `y` with corresponding elements of `x`.
+
+`z:cmul(x, y)` puts the result in `z`.
+
+
+<a name="torch.cpow"></a>
+### [res] torch.cpow([res,] tensor1, tensor2) ###
+<a name="torch.cpow"></a>
+
+Element-wise power operation, taking the elements of `tensor1` to the powers given by elements of `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> x:cpow(y)
+> x
+ 8  8
+ 8  8
+[torch.DoubleTensor of size 2x2]
+```
+
+`z = torch.cpow(x, y)` returns a new `Tensor`.
+
+`torch.cpow(z, x, y)` puts the result in `z`.
+
+`y:cpow(x)` takes all elements of `y` to the powers given by the corresponding elements of `x`.
+
+`z:cpow(x, y)` puts the result in `z`.
+
+
+<a name="torch.addcmul"></a>
+### [res] torch.addcmul([res,] x [,value], tensor1, tensor2) ###
+<a name="torch.addcmul"></a>
+
+Performs the element-wise multiplication of `tensor1` by `tensor2`, multiply the result by the scalar `value` (1 if not present) and add it to `x`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> z = torch.Tensor(2, 2):fill(5)
+> x:addcmul(2, y, z)
+> x
+ 32  32
+ 32  32
+[torch.DoubleTensor of size 2x2]
+```
+
+`z:addcmul(value, x, y)` accumulates the result in `z`.
+
+`torch.addcmul(z, value, x, y)` returns a new `Tensor` with the result.
+
+`torch.addcmul(z, z, value, x, y)` puts the result in `z`.
+
+
+<a name="torch.div"></a>
+### [res] torch.div([res,] tensor, value) ###
+<a name="torch.div"></a>
+
+Divide all elements in the `Tensor` by the given `value`.
+
+`z = torch.div(x, 2)` will return a new `Tensor` with the result of `x / 2`.
+
+`torch.div(z, x, 2)` will put the result of `x / 2` in `z`.
+
+`x:div(2)` will divide all elements of `x` with `2` in-place.
+
+`z:div(x, 2)` puts the result of `x / 2` in `z`.
+
+
+<a name="torch.cdiv"></a>
+### [res] torch.cdiv([res,] tensor1, tensor2) ###
+<a name="torch.cdiv"></a>
+
+Performs the element-wise division of `tensor1` by `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(1)
+> y = torch.range(1, 4)
+> x:cdiv(y)
+> x
+ 1.0000  0.5000
+ 0.3333  0.2500
+[torch.DoubleTensor of size 2x2]
+```
+
+`z = torch.cdiv(x, y)` returns a new `Tensor`.
+
+`torch.cdiv(z, x, y)` puts the result in `z`.
+
+`y:cdiv(x)` divides all elements of `y` with corresponding elements of `x`.
+
+`z:cdiv(x, y)` puts the result in `z`.
+
+
+<a name="torch.addcdiv"></a>
+### [res] torch.addcdiv([res,] x [,value], tensor1, tensor2) ###
+<a name="torch.addcdiv"></a>
+
+Performs the element-wise division of `tensor1` by `tensor2`, multiply the result by the scalar `value` and add it to `x`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor(2, 2):fill(1)
+> y = torch.range(1, 4)
+> z = torch.Tensor(2, 2):fill(5)
+> x:addcdiv(2, y, z)
+> x
+ 1.4000  1.8000
+ 2.2000  2.6000
+[torch.DoubleTensor of size 2x2]
+```
+
+`z:addcdiv(value, x, y)` accumulates the result in `z`.
+
+`torch.addcdiv(z, value, x, y)` returns a new `Tensor` with the result.
+
+`torch.addcdiv(z, z, value, x, y)` puts the result in `z`.
+
+
+<a name="torch.fmod"></a>
+### [res] torch.fmod([res,] tensor, value) ###
+<a name="torch.fmod"></a>
+
+Computes remainder of division (rounded towards zero) of all elements in the `Tensor` by `value`.
+This works both for integer and floating point numbers. It behaves the same as the Lua built-in function `math.fmod()` and a little bit differently from `torch.remainder()` and the `%` operator. For example:
+
+```lua
+> x = torch.Tensor({-3, 3})
+> torch.fmod(x, 2)
+-1
+ 1
+[torch.DoubleTensor of size 2]
+
+> torch.fmod(x, -2)
+-1
+ 1
+[torch.DoubleTensor of size 2]
+
+> torch.remainder(x, 2)
+ 1
+ 1
+[torch.DoubleTensor of size 2]
+
+> torch.remainder(x, -2)
+-1
+-1
+[torch.DoubleTensor of size 2]
+```
+
+`z = torch.fmod(x, 2)` will return a new `Tensor` with the result of `math.fmod(x, 2)`.
+
+`torch.fmod(z, x, 2)` will put the result of `math.fmod(x, 2)` in `z`.
+
+`x:fmod(2)` will replace all elements of `x` with the result of `math.fmod(x, 2)` in-place.
+
+`z:fmod(x, 2)` puts the result of `math.fmod(x, 2)` in `z`.
+
+
+<a name="torch.remainder"></a>
+### [res] torch.remainder([res,] tensor, value) ###
+<a name="torch.remainder"></a>
+
+Computes remainder of division (rounded to nearest) of all elements in the `Tensor` by `value`.
+This works both for integer and floating point numbers. It behaves the same as `%` operator and can be expressed as `a % b = a - b * floor(a/b)`. See `torch.fmod()` for comparison.
+
+`z = torch.remainder(x, 2)` will return a new `Tensor` with the result of `x % 2`.
+
+`torch.remainder(z, x, 2)` will put the result of `x % 2` in `z`.
+
+`x:remainder(2)` will replace all elements of `x` with the result of `x % 2` in-place.
+
+`z:remainder(x, 2)` puts the result of `x % 2` in `z`.
+
+
+<a name="torch.mod"></a>
+### [res] torch.mod([res,] tensor, value) ###
+<a name="torch.mod"></a>
+
+This function is deprecated and exists only for compatibility with previous versions. Please use `torch.fmod()` or `torch.remainder()` instead.
+
+
+<a name="torch.cfmod"></a>
+### [res] torch.cfmod([res,] tensor1, tensor2) ###
+<a name="torch.cfmod"></a>
+
+Computes the element-wise remainder of the division (rounded towards zero) of `tensor1` by `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor({{3, 3}, {-3, -3}})
+> y = torch.Tensor({{2, -2}, {2, -2}})
+> x:cfmod(y)
+ 1  1
+-1 -1
+[torch.DoubleTensor of size 2x2]
+```
+
+`z = torch.cfmod(x, y)` returns a new `Tensor`.
+
+`torch.cfmod(z, x, y)` puts the result in `z`.
+
+`y:cfmod(x)` replaces all elements of `y` by their remainders of division (rounded towards zero) by
+corresponding elements of `x`.
+
+`z:cfmod(x, y)` puts the result in `z`.
+
+
+<a name="torch.cremainder"></a>
+### [res] torch.cremainder([res,] tensor1, tensor2) ###
+<a name="torch.cremainder"></a>
+
+Computes element-wise remainder of the division (rounded to nearest) of `tensor1` by `tensor2`.
+The number of elements must match, but sizes do not matter.
+
+```lua
+> x = torch.Tensor({{3, 3}, {-3, -3}})
+> y = torch.Tensor({{2, -2}, {2, -2}})
+> x:cremainder(y)
+ 1 -1
+ 1 -1
+[torch.DoubleTensor of size 2x2]
+```
+
+`z = torch.cremainder(x, y)` returns a new `Tensor`.
+
+`torch.cremainder(z, x, y)` puts the result in `z`.
+
+`y:cremainder(x)` replaces all elements of `y` by their remainders of division (rounded towards minus infinity) by
+corresponding elements of `x`.
+
+`z:cremainder(x, y)` puts the result in `z`.
+
+
+<a name="torch.cmod"></a>
+### [res] torch.cmod([res,] tensor1, tensor2) ###
+<a name="torch.cmod"></a>
+
+This function is deprecated and exists only for compatibility with previous versions. Please use `torch.cfmod()` or `torch.cremainder()` instead.
+
+
+<a name="torch.dot"></a>
+### [number] torch.dot(tensor1, tensor2) ###
+<a name="torch.dot"></a>
+
+Performs the dot product between `tensor1` and `tensor2`.
+The number of elements must match: both `Tensor`s are seen as a 1D vector.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> x:dot(y)
+24
+```
+
+`torch.dot(x, y)` returns dot product of `x` and `y`.
+`x:dot(y)` returns dot product of `x` and `y`.
+
+
+<a name="torch.addmv"></a>
+### [res] torch.addmv([res,] [beta,] [v1,] vec1, [v2,] mat, vec2) ###
+<a name="torch.addmv"></a>
+
+Performs a matrix-vector multiplication between `mat` (2D `Tensor`) and `vec2` (1D `Tensor`) and add it to `vec1`.
+
+Optional values `v1` and `v2` are scalars that multiply `vec1` and `vec2` respectively.
+
+Optional value `beta` is  a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
+Defaults to `1.0`.
+
+In other words,
+
+```
+res = (beta * res) + (v1 * vec1) + (v2 * (mat * vec2))
+```
+
+Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` matrix, `vec2` must be vector of size `m` and `vec1` must be a vector of size `n`.
+
+```lua
+> x = torch.Tensor(3):fill(0)
+> M = torch.Tensor(3, 2):fill(3)
+> y = torch.Tensor(2):fill(2)
+> x:addmv(M, y)
+> x
+ 12
+ 12
+ 12
+[torch.DoubleTensor of size 3]
+```
+
+`torch.addmv(x, y, z)` returns a new `Tensor` with the result.
+
+`torch.addmv(r, x, y, z)` puts the result in `r`.
+
+`x:addmv(y, z)` accumulates `y * z` into `x`.
+
+`r:addmv(x, y, z)` puts the result of `x + y * z` into `r`.
+
+
+<a name="torch.addr"></a>
+### [res] torch.addr([res,] [v1,] mat, [v2,] vec1, vec2) ###
+<a name="torch.addr"></a>
+
+Performs the outer-product between `vec1` (1D `Tensor`) and `vec2` (1D `Tensor`).
+
+Optional values `v1` and `v2` are scalars that multiply `mat` and the outer product `vec1 ⊗ vec2` respectively.
+
+In other words,
+
+```
+res_ij = (v1 * mat_ij) + (v2 * vec1_i * vec2_j)
+```
+
+If `vec1` is a vector of size `n` and `vec2` is a vector of size `m`, then `mat` must be a matrix of size `n × m`.
+
+```lua
+> x = torch.range(1, 3)
+> y = torch.range(1, 2)
+> M = torch.Tensor(3, 2):zero()
+> M:addr(x, y)
+ 1  2         --     |0 0|     |1 2|
+ 2  4         -- = 1*|0 0| + 1*|2 4|
+ 3  6         --     |0 0|     |3 6|
+[torch.DoubleTensor of size 3x2]
+-- default values of v1 and v2 are 1.
+
+> M:addr(2, 1, x, y)
+  3   6        --     |1 2|     |1 2|
+  6  12        -- = 2*|2 4| + 1*|2 4|
+  9  18        --     |3 6|     |3 6|
+[torch.DoubleTensor of size 3x2]
+
+> A = torch.range(1, 6):resize(3, 2)
+> A
+ 1  2
+ 3  4
+ 5  6
+[torch.DoubleTensor of size 3x2]
+> M:addr(2, A, 1, x, y)
+  3   6        --   |1 2|     |1 2|
+  8  12        -- 2*|3 4| + 1*|2 4|
+ 13  18        --   |5 6|     |3 6|
+[torch.DoubleTensor of size 3x2]
+```
+
+`torch.addr(M, x, y)` returns the result in a new `Tensor`.
+
+`torch.addr(r, M, x, y)` puts the result in `r`.
+
+`M:addr(x, y)` puts the result in `M`.
+
+`r:addr(M, x, y)` puts the result in `r`.
+
+
+<a name="torch.addmm"></a>
+### [res] torch.addmm([res,] [beta,] [v1,] M, [v2,] mat1, mat2) ###
+<a name="torch.addmm"></a>
+
+Performs a matrix-matrix multiplication between `mat1` (2D `Tensor`) and `mat2` (2D `Tensor`).
+
+Optional values `v1` and `v2` are scalars that multiply `M` and `mat1 * mat2` respectively.
+
+Optional value `beta` is  a scalar that scales the result `Tensor`, before accumulating the result into the `Tensor`.
+Defaults to `1.0`.
+
+In other words,
+
+```
+res = (res * beta) + (v1 * M) + (v2 * mat1 * mat2)
+```
+
+If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `M` must be a `n × p` matrix.
+
+`torch.addmm(M, mat1, mat2)` returns the result in a new `Tensor`.
+
+`torch.addmm(r, M, mat1, mat2)` puts the result in `r`.
+
+`M:addmm(mat1, mat2)` puts the result in `M`.
+
+`r:addmm(M, mat1, mat2)` puts the result in `r`.
+
+
+<a name="torch.addbmm"></a>
+### [res] torch.addbmm([res,] [v1,] M, [v2,] batch1, batch2) ###
+<a name="torch.addbmm"></a>
+
+Batch matrix matrix product of matrices stored in `batch1` and `batch2`, with a reduced add step (all matrix multiplications get accumulated in a single place).
+
+`batch1` and `batch2` must be 3D `Tensor`s each containing the same number of matrices.
+If `batch1` is a `b × n × m` `Tensor`, `batch2` a `b × m × p` `Tensor`, res will be a `n × p` `Tensor`.
+
+In other words,
+
+```
+res = (v1 * M) + (v2 * sum(batch1_i * batch2_i, i = 1, b))
+```
+
+`torch.addbmm(M, x, y)` puts the result in a new `Tensor`.
+
+`M:addbmm(x, y)` puts the result in `M`, resizing `M` if necessary.
+
+`M:addbmm(beta, M2, alpha, x, y)` puts the result in `M`, resizing `M` if necessary.
+
+
+<a name="torch.baddbmm"></a>
+### [res] torch.baddbmm([res,] [v1,] M, [v2,] batch1, batch2) ###
+<a name="torch.baddbmm"></a>
+
+Batch matrix matrix product of matrices stored in `batch1` and `batch2`, with batch add.
+
+`batch1` and `batch2` must be 3D `Tensor`s each containing the same number of matrices.
+If `batch1` is a `b × n × m` `Tensor`, `batch2` a `b × m × p` `Tensor`, res will be a `b × n × p` `Tensor`.
+
+In other words,
+
+```
+res_i = (v1 * M_i) + (v2 * batch1_i * batch2_i)
+```
+
+`torch.baddbmm(M, x, y)` puts the result in a new `Tensor`.
+
+`M:baddbmm(x, y)` puts the result in `M`, resizing `M` if necessary.
+
+`M:baddbmm(beta, M2, alpha, x, y)` puts the result in `M`, resizing `M` if necessary.
+
+
+<a name="torch.mv"></a>
+### [res] torch.mv([res,] mat, vec) ###
+<a name="torch.mv"></a>
+
+Matrix vector product of `mat` and `vec`.
+Sizes must respect the matrix-multiplication operation: if `mat` is a `n × m` matrix, `vec` must be vector of size `m` and `res` must be a vector of size `n`.
+
+`torch.mv(x, y)` puts the result in a new `Tensor`.
+
+`torch.mv(M, x, y)` puts the result in `M`.
+
+`M:mv(x, y)` puts the result in `M`.
+
+
+<a name="torch.mm"></a>
+### [res] torch.mm([res,] mat1, mat2) ###
+<a name="torch.mm"></a>
+
+Matrix matrix product of `mat1` and `mat2`.
+If `mat1` is a `n × m` matrix, `mat2` a `m × p` matrix, `res` must be a `n × p` matrix.
+
+`torch.mm(x, y)` puts the result in a new `Tensor`.
+
+`torch.mm(M, x, y)` puts the result in `M`.
+
+`M:mm(x, y)` puts the result in `M`.
+
+
+<a name="torch.bmm"></a>
+### [res] torch.bmm([res,] batch1, batch2) ###
+<a name="torch.bmm"></a>
+
+Batch matrix matrix product of matrices stored in `batch1` and `batch2`.
+`batch1` and `batch2` must be 3D `Tensor`s each containing the same number of matrices.
+If `batch1` is a `b × n × m` `Tensor`, `batch2` a `b × m × p` `Tensor`, `res` will be a `b × n × p` `Tensor`.
+
+`torch.bmm(x, y)` puts the result in a new `Tensor`.
+
+`torch.bmm(M, x, y)` puts the result in `M`, resizing `M` if necessary.
+
+`M:bmm(x, y)` puts the result in `M`, resizing `M` if necessary.
+
+
+<a name="torch.ger"></a>
+### [res] torch.ger([res,] vec1, vec2) ###
+<a name="torch.ger"></a>
+
+Outer product of `vec1` and `vec2`.
+If `vec1` is a vector of size `n` and `vec2` is a vector of size `m`, then `res` must be a matrix of size `n × m`.
+
+`torch.ger(x, y)` puts the result in a new `Tensor`.
+
+`torch.ger(M, x, y)` puts the result in `M`.
+
+`M:ger(x, y)` puts the result in `M`.
+
+
+<a name="torch.lerp"></a>
+### [res] torch.lerp([res,] a, b, weight) ###
+<a name="torch.lerp"></a>
+
+Linear interpolation of two scalars or tensors based on a weight: `res = a + weight * (b - a)`
+
+`torch.lerp(a, b, weight)` puts the result in a new `Tensor` if `a` and `b` are tensors. If `a` and `b` are scalars the functions returns a number.
+
+`torch.lerp(M, a, b, weight)` puts the result in `M`.
+
+`M:lerp(a, b, weight)` puts the result in `M`.
+
+
+## Overloaded operators ##
+
+It is possible to use basic mathematical operators like `+`, `-`, `/`, `*` and `%` with `Tensor`s.
+These operators are provided as a convenience.
+While they might be handy, they create and return a new `Tensor` containing the results.
+They are thus not as fast as the operations available in the [previous section](#torch.BasicOperations.dok).
+
+Another important point to note is that these operators are only overloaded when the first operand is a `Tensor`.
+For example, this will NOT work:
+
+```lua
+> x = 5 + torch.rand(3)
+```
+
+
+### Addition and subtraction ###
+
+You can add a `Tensor` to another one with the `+` operator.
+Subtraction is done with `-`.
+The number of elements in the `Tensor`s must match, but the sizes do not matter.
+The size of the returned `Tensor` will be the size of the first `Tensor`.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> y = torch.Tensor(4):fill(3)
+> = x + y
+ 5  5
+ 5  5
+[torch.DoubleTensor of size 2x2]
+
+> = y - x
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of size 4]
+```
+
+A scalar might also be added or subtracted to a `Tensor`.
+The scalar needs to be on the right of the operator.
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> = x + 3
+ 5  5
+ 5  5
+[torch.DoubleTensor of size 2x2]
+```
+
+
+### Negation ###
+
+A `Tensor` can be negated with the `-` operator placed in front:
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> = -x
+-2 -2
+-2 -2
+[torch.DoubleTensor of size 2x2]
+```
+
+
+### Multiplication ###
+
+Multiplication between two `Tensor`s is supported with the `*` operators.
+The result of the multiplication depends on the sizes of the `Tensor`s.
+
+ - 1D and 1D: Returns the dot product between the two `Tensor`s (scalar).
+ - 2D and 1D: Returns the matrix-vector operation between the two `Tensor`s (1D `Tensor`).
+ - 2D and 2D: Returns the matrix-matrix operation between the two `Tensor`s (2D `Tensor`).
+
+Sizes must be conformant for the corresponding operation.
+
+A `Tensor` might also be multiplied by a scalar.
+The scalar might be on the right or left of the operator.
+
+Examples:
+
+```lua
+> M = torch.Tensor(2, 2):fill(2)
+> N = torch.Tensor(2, 4):fill(3)
+> x = torch.Tensor(2):fill(4)
+> y = torch.Tensor(2):fill(5)
+> = x * y -- dot product
+40
+
+> = M * x --- matrix-vector
+ 16
+ 16
+[torch.DoubleTensor of size 2]
+
+> = M * N -- matrix-matrix
+ 12  12  12  12
+ 12  12  12  12
+[torch.DoubleTensor of size 2x4]
+```
+
+
+### Division and Modulo (remainder) ###
+
+Only the division of a `Tensor` by a scalar is supported with the operator `/`.
+
+Example:
+
+```lua
+> x = torch.Tensor(2, 2):fill(2)
+> = x/3
+ 0.6667  0.6667
+ 0.6667  0.6667
+[torch.DoubleTensor of size 2x2]
+```
+
+Similarly, the remainder of the division of a `Tensor`s elements by a scalar
+can be obtained with the operator `%`.
+
+Example:
+
+```lua
+> x = torch.Tensor{{1,2},{3,4}}
+> = x % 3
+ 1  2
+ 0  1
+[torch.DoubleTensor of size 2x2]
+```
+
+
+<a name="torch.columnwise.dok"></a>
+## Column or row-wise operations  (dimension-wise operations) ##
+
+
+<a name="torch.cross"></a>
+### [res] torch.cross([res,] a, b [,n]) ###
+
+`y = torch.cross(a, b)` returns the cross product of `a` and `b` along the first dimension of length 3.
+
+`y = torch.cross(a, b, n)`  returns the cross product of vectors in dimension `n` of `a` and `b`.
+
+`a` and `b` must have the same size, and both `a:size(n)` and `b:size(n)` must be 3.
+
+
+<a name="torch.cumprod"></a>
+### [res] torch.cumprod([res,] x [,dim]) ###
+
+`y = torch.cumprod(x)` returns the cumulative product of the elements of `x`, performing the operation over the first dimension.
+
+`y = torch.cumprod(x, n)` returns the cumulative product of the elements of `x`, performing the operation over dimension `n`.
+
+```lua
+-- 1. cumulative product for a vector
+> A = torch.range(1, 5)
+> A
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of size 5]
+
+> B = torch.cumprod(A)
+> B
+   1     -- B(1) = A(1) = 1
+   2     -- B(2) = A(1)*A(2) = 1*2 = 2
+   6     -- B(3) = A(1)*A(2)*A(3) = 1*2*3 = 6
+  24     -- B(4) = A(1)*A(2)*A(3)*A(4) = 1*2*3*4 = 24
+ 120     -- B(5) = A(1)*A(2)*A(3)*A(4)*A(5) =1*2*3*4*5 = 120
+[torch.DoubleTensor of size 5]
+
+-- 2. cumulative product for a matrix
+> A = torch.LongTensor{{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}
+> A
+ 1  4  7
+ 2  5  8
+ 3  6  9
+[torch.LongTensor of size 3x3]
+
+> B = torch.cumprod(A)
+> B
+   1    4    7
+   2   20   56
+   6  120  504
+[torch.LongTensor of size 3x3]
+
+-- Why?
+-- B(1, 1) = A(1, 1) = 1
+-- B(2, 1) = A(1, 1)*A(2, 1) = 1*2 = 2
+-- B(3, 1) = A(1, 1)*A(2, 1)*A(3, 1) = 1*2*3 = 6
+-- B(1, 2) = A(1, 2) = 4
+-- B(2, 2) = A(1, 2)*A(2, 2) = 4*5 = 20
+-- B(3, 2) = A(1, 2)*A(2, 2)*A(3, 2) = 4*5*6 = 120
+-- B(1, 3) = A(1, 3) = 7
+-- B(2, 3) = A(1, 3)*A(2, 3) = 7*8 = 56
+-- B(3, 3) = A(1, 3)*A(2, 3)*A(3, 3) = 7*8*9 = 504
+
+-- 3. cumulative product along 2-dim
+> B = torch.cumprod(A, 2)
+> B
+   1    4   28
+   2   10   80
+   3   18  162
+[torch.LongTensor of size 3x3]
+
+-- Why?
+-- B(1, 1) = A(1, 1) = 1
+-- B(1, 2) = A(1, 1)*A(1, 2) = 1*4 = 4
+-- B(1, 3) = A(1, 1)*A(1, 2)*A(1, 3) = 1*4*7 = 28
+-- B(2, 1) = A(2, 1) = 2
+-- B(2, 2) = A(2, 1)*A(2, 2) = 2*5 = 10
+-- B(2, 3) = A(2, 1)*A(2, 2)*A(2, 3) = 2*5*8 = 80
+-- B(3, 1) = A(3, 1) = 3
+-- B(3, 2) = A(3, 1)*A(3, 2) = 3*6 = 18
+-- B(3, 3) = A(3, 1)*A(3, 2)*A(3, 3) = 3*6*9 = 162
+```
+
+
+<a name="torch.cumsum"></a>
+### [res] torch.cumsum([res,] x [,dim]) ###
+
+`y = torch.cumsum(x)` returns the cumulative sum of the elements of `x`, performing the operation over the first dimension.
+
+`y = torch.cumsum(x, n)` returns the cumulative sum of the elements of `x`, performing the operation over dimension `n`.
+
+
+<a name="torch.max"></a>
+### torch.max([resval, resind,] x [,dim]) ###
+
+`y = torch.max(x)` returns the single largest element of `x`.
+
+`y, i = torch.max(x, 1)` returns the largest element in each column (across rows) of `x`, and a `Tensor` `i` of their corresponding indices in `x`.
+
+`y, i = torch.max(x, 2)` performs the max operation for each row.
+
+`y, i = torch.max(x, n)` performs the max operation over the dimension `n`.
+
+```lua
+> x = torch.randn(3, 3)
+> x
+ 1.1994 -0.6290  0.6888
+-0.0038 -0.0908 -0.2075
+ 0.3437 -0.9948  0.1216
+[torch.DoubleTensor of size 3x3]
+
+> torch.max(x)
+1.1993977428735
+
+> torch.max(x, 1)
+ 1.1994 -0.0908  0.6888
+[torch.DoubleTensor of size 1x3]
+
+ 1  2  1
+[torch.LongTensor of size 1x3]
+
+> torch.max(x, 2)
+ 1.1994
+-0.0038
+ 0.3437
+[torch.DoubleTensor of size 3x1]
+
+ 1
+ 1
+ 1
+[torch.LongTensor of size 3x1]
+```
+
+
+<a name="torch.mean"></a>
+### [res] torch.mean([res,] x [,dim]) ###
+
+`y = torch.mean(x)` returns the mean of all elements of `x`.
+
+`y = torch.mean(x, 1)` returns a `Tensor` `y` of the mean of the elements in each column of `x`.
+
+`y = torch.mean(x, 2)` performs the mean operation for each row.
+
+`y = torch.mean(x, n)` performs the mean operation over the dimension `n`.
+
+
+<a name="torch.min"></a>
+### torch.min([resval, resind,] x [,dim]) ###
+
+`y = torch.min(x)` returns the single smallest element of `x`.
+
+`y, i = torch.min(x, 1)` returns the smallest element in each column (across rows) of `x`, and a `Tensor` `i` of their corresponding indices in `x`.
+
+`y, i = torch.min(x, 2)` performs the min operation for each row.
+
+`y, i = torch.min(x, n)` performs the min operation over the dimension `n`.
+
+
+<a name="torch.cmax"></a>
+### [res] torch.cmax([res,] tensor1, tensor2) ###
+
+Compute the maximum of each pair of values in `tensor1` and `tensor2`.
+
+`c = torch.cmax(a, b)` returns a new `Tensor` containing the element-wise maximum of `a` and `b`.
+
+`a:cmax(b)` stores the element-wise maximum of `a` and `b` in `a`.
+
+`c:cmax(a, b)` stores the element-wise maximum of `a` and `b` in `c`.
+
+```lua
+> a = torch.Tensor{1, 2, 3}
+> b = torch.Tensor{3, 2, 1}
+> torch.cmax(a, b)
+ 3
+ 2
+ 3
+[torch.DoubleTensor of size 3]
+```
+
+
+<a name="torch.cmax"></a>
+### [res] torch.cmax([res,] tensor, value) ###
+
+Compute the maximum between each value in `tensor` and `value`.
+
+`c = torch.cmax(a, v)` returns a new `Tensor` containing the maxima of each element in `a` and `v`.
+
+`a:cmax(v)` stores the maxima of each element in `a` and `v` in `a`.
+
+`c:cmax(a, v)` stores the maxima of each element in `a` and `v` in `c`.
+
+```lua
+> a = torch.Tensor{1, 2, 3}
+> torch.cmax(a, 2)
+ 2
+ 2
+ 3
+[torch.DoubleTensor of size 3]
+```
+
+
+<a name="torch.cmin"></a>
+### [res] torch.cmin([res,] tensor1, tensor2) ###
+
+Compute the minimum of each pair of values in `tensor1` and `tensor2`.
+
+`c = torch.cmin(a, b)` returns a new `Tensor` containing the element-wise minimum of `a` and `b`.
+
+`a:cmin(b)` stores the element-wise minimum of `a` and `b` in `a`.
+
+`c:cmin(a, b)` stores the element-wise minimum of `a` and `b` in `c`.
+
+```lua
+> a = torch.Tensor{1, 2, 3}
+> b = torch.Tensor{3, 2, 1}
+> torch.cmin(a, b)
+ 1
+ 2
+ 1
+[torch.DoubleTensor of size 3]
+```
+
+
+<a name="torch.cmin"></a>
+### [res] torch.cmin([res,] tensor, value) ###
+
+Compute the minimum between each value in `tensor` and `value`.
+
+`c = torch.cmin(a, v)` returns a new `Tensor` containing the minima of each element in `a` and `v`.
+
+`a:cmin(v)` stores the minima of each element in `a` and `v` in `a`.
+
+`c:cmin(a, v)` stores the minima of each element in `a` and `v` in `c`.
+
+```lua
+> a = torch.Tensor{1, 2, 3}
+> torch.cmin(a, 2)
+ 1
+ 2
+ 2
+[torch.DoubleTensor of size 3]
+```
+
+
+<a name="torch.median"></a>
+### torch.median([resval, resind,] x [,dim]) ###
+
+`y = torch.median(x)` performs the median operation over the last dimension of `x` (one-before-middle in the case of an even number of elements).
+
+`y, i = torch.median(x, 1)` returns the median element in each column (across rows) of `x`, and a `Tensor` `i` of their corresponding indices in `x`.
+
+`y, i = torch.median(x, 2)` performs the median operation for each row.
+
+`y, i = torch.median(x, n)` performs the median operation over the dimension `n`.
+
+```lua
+> x = torch.randn(3, 3)
+> x
+ 0.7860  0.7687 -0.9362
+ 0.0411  0.5407 -0.3616
+-0.0129 -0.2499 -0.5786
+[torch.DoubleTensor of size 3x3]
+
+> y, i = torch.median(x)
+> y
+ 0.7687
+ 0.0411
+-0.2499
+[torch.DoubleTensor of size 3x1]
+
+> i
+ 2
+ 1
+ 2
+[torch.LongTensor of size 3x1]
+
+> y, i = torch.median(x, 1)
+> y
+ 0.0411  0.5407 -0.5786
+[torch.DoubleTensor of size 1x3]
+
+> i
+ 2  2  3
+[torch.LongTensor of size 1x3]
+
+> y, i = torch.median(x, 2)
+> y
+ 0.7687
+ 0.0411
+-0.2499
+[torch.DoubleTensor of size 3x1]
+
+> i
+ 2
+ 1
+ 2
+[torch.LongTensor of size 3x1]
+```
+
+
+<a name="torch.mode"></a>
+### torch.mode([resval, resind,] x [,dim]) ###
+
+`y = torch.mode(x)` returns the most frequent element of `x` over its last dimension.
+
+`y, i = torch.mode(x, 1)` returns the mode element in each column (across rows) of `x`, and a `Tensor` `i` of their corresponding indices in `x`.
+
+`y, i = torch.mode(x, 2)` performs the mode operation for each row.
+
+`y, i = torch.mode(x, n)` performs the mode operation over the dimension `n`.
+
+
+<a name="torch.kthvalue"></a>
+### torch.kthvalue([resval, resind,] x, k [,dim]) ###
+
+`y = torch.kthvalue(x, k)` returns the `k`-th smallest element of `x` over its last dimension.
+
+`y, i = torch.kthvalue(x, k, 1)` returns the `k`-th smallest element in each column (across rows) of `x`, and a `Tensor` `i` of their corresponding indices in `x`.
+
+`y, i = torch.kthvalue(x, k, 2)` performs the `k`-th value operation for each row.
+
+`y, i = torch.kthvalue(x, k, n)` performs the `k`-th value operation over the dimension `n`.
+
+
+<a name="torch.prod"></a>
+### [res] torch.prod([res,] x [,n]) ###
+
+`y = torch.prod(x)` returns the product of all elements in `x`.
+
+`y = torch.prod(x, n)` returns a `Tensor` `y` whose size in dimension `n` is 1 and where elements are the product of elements of `x` with respect to dimension `n`.
+
+```lua
+> a = torch.Tensor{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}
+> a
+(1,.,.) =
+  1  2
+  3  4
+
+(2,.,.) =
+  5  6
+  7  8
+[torch.DoubleTensor of dimension 2x2x2]
+
+> torch.prod(a, 1)
+(1,.,.) =
+   5  12
+  21  32
+[torch.DoubleTensor of dimension 1x2x2]
+
+> torch.prod(a, 2)
+(1,.,.) =
+   3   8
+
+(2,.,.) =
+  35  48
+[torch.DoubleTensor of size 2x1x2]
+
+> torch.prod(a, 3)
+(1,.,.) =
+   2
+  12
+
+(2,.,.) =
+  30
+  56
+[torch.DoubleTensor of size 2x2x1]
+```
+
+
+<a name="torch.sort"></a>
+### torch.sort([resval, resind,] x [,d] [,flag]) ###
+
+`y, i = torch.sort(x)` returns a `Tensor` `y` where all entries are sorted along the last dimension, in **ascending** order.
+It also returns a `Tensor` `i` that provides the corresponding indices from `x`.
+
+`y, i = torch.sort(x, d)` performs the sort operation along a specific dimension `d`.
+
+`y, i = torch.sort(x)` is therefore equivalent to `y, i = torch.sort(x, x:dim())`
+
+`y, i = torch.sort(x, d, true)` performs the sort operation along a specific dimension `d`, in **descending** order.
+
+```lua
+> x = torch.randn(3, 3)
+> x
+-1.2470 -0.4288 -0.5337
+ 0.8836 -0.1622  0.9604
+ 0.6297  0.2397  0.0746
+[torch.DoubleTensor of size 3x3]
+
+> torch.sort(x)
+-1.2470 -0.5337 -0.4288
+-0.1622  0.8836  0.9604
+ 0.0746  0.2397  0.6297
+[torch.DoubleTensor of size 3x3]
+
+ 1  3  2
+ 2  1  3
+ 3  2  1
+[torch.LongTensor of size 3x3]
+```
+
+<a name="torch.topk"></a>
+### torch.topk([resval, resind,] x, k [,dim] [,dir] [,sort]) ###
+
+`y, i = torch.topk(x, k)` returns all `k` smallest elements in `x` over its last dimension including their indices, in unsorted order.
+
+`y, i = torch.topk(x, k, dim)` performs the same operation except over dimension `dim`.
+
+`y, i = torch.topk(x, k, dim, dir)` adds a sorting direction that has the same sense as `torch.sort`; `false` returns the `k` smallest elements in the slice, `true` returns the `k` largest elements in the slice.
+
+`y, i = torch.topk(x, k, dim, dir, true)` specifies that the results in `y` should be sorted with respect to `dir`; by default, the results are potentially unsorted since the computation may be faster, but if sorting is desired, the sort flag may be passed, in which case the results are returned from smallest to `k`-th smallest (`dir == false`) or highest to `k`-th highest (`dir == true`).
+
+The implementation provides no guarantee of the order of selection (indices) among equivalent elements (e.g., topk `k == 2` selection of a vector `{1, 2, 1, 1}`; the values returned could be any pair of `1` entries in the vector).
+
+<a name="torch.std"></a>
+### [res] torch.std([res,] x [,dim] [,flag]) ###
+
+`y = torch.std(x)` returns the standard deviation of the elements of `x`.
+
+`y = torch.std(x, dim)` performs the `std` operation over the dimension `dim`.
+
+`y = torch.std(x, dim, false)` performs the `std` operation normalizing by `n-1` (this is the default).
+
+`y = torch.std(x, dim, true)` performs the `std` operation normalizing by `n` instead of `n-1`.
+
+
+<a name="torch.sum"></a>
+### [res] torch.sum([res,] x [,dim]) ###
+
+`y = torch.sum(x)` returns the sum of the elements of `x`.
+
+`y = torch.sum(x, 2)` performs the sum operation for each row.
+
+`y = torch.sum(x, n)` performs the sum operation over the dimension `n`.
+
+
+<a name="torch.var"></a>
+### [res] torch.var([res,] x [,dim] [,flag]) ###
+
+`y = torch.var(x)` returns the variance of the elements of `x`.
+
+`y = torch.var(x, dim)` performs the `var` operation over the dimension `dim`.
+
+`y = torch.var(x, dim, false)` performs the `var` operation normalizing by `n-1` (this is the default).
+
+`y = torch.var(x, dim, true)` performs the `var` operation normalizing by `n` instead of `n-1`.
+
+
+<a name="torch.matrixwide.dok"></a>
+## Matrix-wide operations  (`Tensor`-wide operations) ##
+
+Note that many of the operations in [dimension-wise operations](#torch.columnwise.dok) can also be used as matrix-wide operations, by just omitting the `dim` parameter.
+
+
+<a name="torch.norm"></a>
+### torch.norm(x [,p] [,dim]) ###
+
+`y = torch.norm(x)` returns the `2`-norm of the `Tensor` `x`.
+
+`y = torch.norm(x, p)` returns the `p`-norm of the `Tensor` `x`.
+
+`y = torch.norm(x, p, dim)` returns the `p`-norms of the `Tensor` `x` computed over the dimension `dim`.
+
+
+<a name="torch.renorm"></a>
+### torch.renorm([res], x, p, dim, maxnorm) ###
+
+Renormalizes the sub-`Tensor`s along dimension `dim` such that they do not exceed norm `maxnorm`.
+
+`y = torch.renorm(x, p, dim, maxnorm)` returns a version of `x` with `p`-norms lower than `maxnorm` over non-`dim` dimensions.
+The `dim` argument is not to be confused with the argument of the same name in function [`norm`](#torch.norm).
+In this case, the `p`-norm is measured for each `i`-th sub-`Tensor` `x:select(dim, i)`.
+This function is equivalent to (but faster than) the following:
+
+```lua
+function renorm(matrix, value, dim, maxnorm)
+   local m1 = matrix:transpose(dim, 1):contiguous()
+   -- collapse non-dim dimensions:
+   local m2 = m1:reshape(m1:size(1), m1:nElement()/m1:size(1))
+   local norms = m2:norm(value, 2)
+   -- clip
+   local new_norms = norms:clone()
+   new_norms[torch.gt(norms, maxnorm)] = maxnorm
+   new_norms:cdiv(norms:add(1e-7))
+   -- renormalize
+   m1:cmul(new_norms:expandAs(m1))
+   return m1:transpose(dim, 1)
+end
+```
+
+`x:renorm(p, dim, maxnorm)` returns the equivalent of `x:copy(torch.renorm(x, p, dim, maxnorm))`.
+
+Note: this function is particularly useful as a regularizer for constraining the norm of parameter `Tensor`s.
+See [Hinton et al. 2012, p. 2](http://arxiv.org/pdf/1207.0580.pdf).
+
+
+<a name="torch.dist"></a>
+### torch.dist(x, y) ###
+
+`y = torch.dist(x, y)` returns the `2`-norm of `x - y`.
+
+`y = torch.dist(x, y, p)` returns the `p`-norm of `x - y`.
+
+
+<a name="torch.numel"></a>
+### torch.numel(x) ###
+
+`y = torch.numel(x)` returns the count of the number of elements in the matrix `x`.
+
+
+<a name="torch.trace"></a>
+### torch.trace(x) ###
+
+`y = torch.trace(x)` returns the trace (sum of the diagonal elements) of a matrix `x`.
+This is equal to the sum of the eigenvalues of `x`.
+The returned value `y` is a number, not a `Tensor`.
+
+
+<a name="torch.conv.dok"></a>
+## Convolution Operations ##
+
+These functions implement convolution or cross-correlation of an input image (or set of input images) with a kernel (or set of kernels).
+The convolution function in Torch can handle different types of input/kernel dimensions and produces corresponding outputs.
+The general form of operations always remain the same.
+
+
+<a name="torch.conv2"></a>
+### [res] torch.conv2([res,] x, k [, 'F' or 'V']) ###
+<a name="torch.conv2"></a>
+
+This function computes 2 dimensional convolutions between `x` and `k`.
+These operations are similar to BLAS operations when number of dimensions of input and kernel are reduced by `2`.
+
+  * `x`  and `k` are 2D: convolution of a single image with a single kernel (2D output). This operation is similar to multiplication of two scalars.
+  * `x` (`p × m × n`)  and `k` (`p × ki × kj`) are 3D: convolution of each input slice with corresponding kernel (3D output).
+  * `x` (`p × m × n`) 3D, `k` (`q × p × ki × kj`) 4D: convolution of all input slices with the corresponding slice of kernel. Output is 3D (`q × m × n`). This operation is similar to matrix vector product of matrix `k` and vector `x`.
+
+The last argument controls if the convolution is a full (`'F'`) or valid (`'V'`) convolution.
+The default is **valid** convolution.
+
+```lua
+x = torch.rand(100, 100)
+k = torch.rand(10, 10)
+c = torch.conv2(x, k)
+> c:size()
+ 91
+ 91
+[torch.LongStorage of size 2]
+
+c = torch.conv2(x, k, 'F')
+> c:size()
+ 109
+ 109
+[torch.LongStorage of size 2]
+```
+
+
+<a name="torch.xcorr2"></a>
+### [res] torch.xcorr2([res,] x, k [, 'F' or 'V']) ###
+<a name="torch.xcorr2"></a>
+
+This function operates with same options and input/output configurations as [`torch.conv2`](#torch.conv2), but performs cross-correlation of the input with the kernel `k`.
+
+
+<a name="torch.conv3"></a>
+### [res] torch.conv3([res,] x, k [, 'F' or 'V']) ###
+<a name="torch.conv3"></a>
+
+This function computes 3 dimensional convolutions between `x` and `k`.
+These operations are similar to BLAS operations when number of dimensions of input and kernel are reduced by `3`.
+
+  * `x`  and `k` are 3D: convolution of a single image with a single kernel (3D output). This operation is similar to multiplication of two scalars.
+  * `x` (`p × m × n × o`)  and `k` (`p × ki × kj × kk`) are 4D: convolution of each input slice with corresponding kernel (4D output).
+  * `x` (`p × m × n × o`) 4D, `k` (`q × p × ki × kj × kk`) 5D: convolution of all input slices with the corresponding slice of kernel. Output is 4D `q × m × n × o`. This operation is similar to matrix vector product of matrix `k` and vector `x`.
+
+The last argument controls if the convolution is a full (`'F'`) or valid (`'V'`) convolution.
+The default is **valid** convolution.
+
+```lua
+x = torch.rand(100, 100, 100)
+k = torch.rand(10, 10, 10)
+c = torch.conv3(x, k)
+> c:size()
+ 91
+ 91
+ 91
+[torch.LongStorage of size 3]
+
+c = torch.conv3(x, k, 'F')
+> c:size()
+ 109
+ 109
+ 109
+[torch.LongStorage of size 3]
+
+```
+
+
+<a name="torch.xcorr3"></a>
+### [res] torch.xcorr3([res,] x, k [, 'F' or 'V']) ###
+<a name="torch.xcorr3"></a>
+
+This function operates with same options and input/output configurations as [`torch.conv3`](#torch.conv3), but performs cross-correlation of the input with the kernel `k`.
+
+
+<a name="torch.linalg.dok"></a>
+## Eigenvalues, SVD, Linear System Solution ##
+
+Functions in this section are implemented with an interface to [LAPACK](https://en.wikipedia.org/wiki/LAPACK) libraries.
+If LAPACK libraries are not found during compilation step, then these functions will not be available.
+
+
+<a name="torch.gesv"></a>
+### [x, lu] torch.gesv([resb, resa,] B, A) ###
+
+`X, LU = torch.gesv(B, A)` returns the solution of `AX = B` and `LU` contains `L` and `U` factors for `LU` factorization of `A`.
+
+`A` has to be a square and non-singular matrix (2D `Tensor`).
+`A` and `LU` are `m × m`, `X` is `m × k` and `B` is `m × k`.
+
+If `resb` and `resa` are given, then they will be used for temporary storage and returning the result.
+
+  * `resa` will contain `L` and `U` factors for `LU` factorization of `A`.
+  * `resb` will contain the solution `X`.
+
+Note: Irrespective of the original strides, the returned matrices `resb` and `resa` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                  {-6.05, -3.30,  5.36, -4.44,  1.08},
+                  {-0.45,  2.58, -2.70,  0.27,  9.04},
+                  {8.32,  2.71,  4.35,  -7.17,  2.14},
+                  {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+
+> b = torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                  {-1.56,  4.00, -8.67,  1.75,  2.86},
+                  {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+
+> b
+ 4.0200 -1.5600  9.8100
+ 6.1900  4.0000 -4.0900
+-8.2200 -8.6700 -4.5700
+-7.5700  1.7500 -8.6100
+-3.0300  2.8600  8.9900
+[torch.DoubleTensor of dimension 5x3]
+
+> a
+ 6.8000 -6.0500 -0.4500  8.3200 -9.6700
+-2.1100 -3.3000  2.5800  2.7100 -5.1400
+ 5.6600  5.3600 -2.7000  4.3500 -7.2600
+ 5.9700 -4.4400  0.2700 -7.1700  6.0800
+ 8.2300  1.0800  9.0400  2.1400 -6.8700
+[torch.DoubleTensor of dimension 5x5]
+
+
+> x = torch.gesv(b, a)
+> x
+-0.8007 -0.3896  0.9555
+-0.6952 -0.5544  0.2207
+ 0.5939  0.8422  1.9006
+ 1.3217 -0.1038  5.3577
+ 0.5658  0.1057  4.0406
+[torch.DoubleTensor of dimension 5x3]
+
+> b:dist(a * x)
+1.1682163181673e-14
+```
+
+
+<a name="torch.trtrs"></a>
+### [x] torch.trtrs([resb, resa,] b, a [, 'U' or 'L'] [, 'N' or 'T'] [, 'N' or 'U']) ###
+
+`X = torch.trtrs(B, A)` returns the solution of `AX = B` where `A` is upper-triangular.
+
+`A` has to be a square, triangular, non-singular matrix (2D `Tensor`).
+`A` and `resa` are `m × m`, `X` and `B` are `m × k`.
+(To be very precise: `A` does not have to be triangular and non-singular, rather only its upper or lower triangle will be taken into account and that part has to be non-singular.)
+
+The function has several options:
+
+* `uplo` (`'U'` or `'L'`) specifies whether `A` is upper or lower triangular; the default value is `'U'`.
+* `trans` (`'N'` or `'T'`) specifies the system of equations: `'N'` for `A * X = B` (no transpose), or `'T'` for `A^T * X = B` (transpose); the default value is `'N'`.
+* `diag` (`'N'` or `'U'`) `'U'` specifies that `A` is unit triangular, i.e., it has ones on its diagonal; `'N'` specifies that `A` is not (necessarily) unit triangular; the default value is `'N'`.
+
+If `resb` and `resa` are given, then they will be used for temporary storage and returning the result.
+`resb` will contain the solution `X`.
+
+Note: Irrespective of the original strides, the returned matrices `resb` and `resa` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                  {0, -3.30,  5.36, -4.44,  1.08},
+                  {0,  0, -2.70,  0.27,  9.04},
+                  {0,  0,  0,  -7.17,  2.14},
+                  {0,  0,  0,  0, -6.87}})
+
+> b = torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                  {-1.56,  4.00, -8.67,  1.75,  2.86},
+                  {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+
+> b
+ 4.0200 -1.5600  9.8100
+ 6.1900  4.0000 -4.0900
+-8.2200 -8.6700 -4.5700
+-7.5700  1.7500 -8.6100
+-3.0300  2.8600  8.9900
+[torch.DoubleTensor of dimension 5x3]
+
+> a
+ 6.8000 -2.1100  5.6600  5.9700  8.2300
+ 0.0000 -3.3000  5.3600 -4.4400  1.0800
+ 0.0000  0.0000 -2.7000  0.2700  9.0400
+ 0.0000  0.0000  0.0000 -7.1700  2.1400
+ 0.0000  0.0000  0.0000  0.0000 -6.8700
+[torch.DoubleTensor of dimension 5x5]
+
+> x = torch.trtrs(b, a)
+> x
+-3.5416 -0.2514  3.0847
+ 4.2072  2.0391 -4.5146
+ 4.6399  1.7804 -2.6077
+ 1.1874 -0.3683  0.8103
+ 0.4410 -0.4163 -1.3086
+[torch.DoubleTensor of size 5x3]
+
+> b:dist(a*x)
+4.1895292266754e-15
+```
+
+
+<a name="torch.potrf"></a>
+### torch.potrf([res,] A [, 'U' or 'L'] ) ###
+
+Cholesky Decomposition of 2D `Tensor` `A`.
+The matrix `A` has to be a positive-definite and either symmetric or complex Hermitian.
+
+The factorization has the form
+
+     A = U**T * U,   if UPLO = 'U', or
+     A = L  * L**T,  if UPLO = 'L',
+
+where `U` is an upper triangular matrix and `L` is lower triangular.
+
+The optional character `uplo` = {'U', 'L'} specifies whether the upper or lower triangular decomposition should be returned. By default, `uplo` = 'U'.
+
+`U = torch.potrf(A, 'U')` returns the upper triangular Cholesky decomposition of `A`.
+
+`L = torch.potrf(A, 'L')` returns the lower triangular Cholesky decomposition of `A`.
+
+If `Tensor` `res` is provided, the resulting decomposition will be stored therein.
+
+```lua
+> A = torch.Tensor({
+    {1.2705,  0.9971,  0.4948,  0.1389,  0.2381},
+    {0.9971,  0.9966,  0.6752,  0.0686,  0.1196},
+    {0.4948,  0.6752,  1.1434,  0.0314,  0.0582},
+    {0.1389,  0.0686,  0.0314,  0.0270,  0.0526},
+    {0.2381,  0.1196,  0.0582,  0.0526,  0.3957}})
+
+> chol = torch.potrf(A)
+> chol
+ 1.1272  0.8846  0.4390  0.1232  0.2112
+ 0.0000  0.4626  0.6200 -0.0874 -0.1453
+ 0.0000  0.0000  0.7525  0.0419  0.0738
+ 0.0000  0.0000  0.0000  0.0491  0.2199
+ 0.0000  0.0000  0.0000  0.0000  0.5255
+[torch.DoubleTensor of size 5x5]
+
+> torch.potrf(chol, A, 'L')
+> chol
+ 1.1272  0.0000  0.0000  0.0000  0.0000
+ 0.8846  0.4626  0.0000  0.0000  0.0000
+ 0.4390  0.6200  0.7525  0.0000  0.0000
+ 0.1232 -0.0874  0.0419  0.0491  0.0000
+ 0.2112 -0.1453  0.0738  0.2199  0.5255
+[torch.DoubleTensor of size 5x5]
+```
+
+<a name="torch.pstrf"></a>
+### torch.pstrf([res, piv, ] A [, 'U' or 'L'] ) ###
+
+Cholesky factorization with complete pivoting of a real symmetric positive semidefinite 2D `Tensor` `A`.
+The matrix `A` has to be a positive semi-definite and symmetric. The factorization has the form
+
+    P**T * A * P = U**T * U ,  if UPLO = 'U',
+    P**T * A * P = L  * L**T,  if UPLO = 'L',
+
+where `U` is an upper triangular matrix and `L` is lower triangular, and
+`P` is stored as the vector `piv`. More specifically, `piv` is such that the nonzero entries are `P[piv[k], k] = 1`.
+
+The optional character argument `uplo` = {'U', 'L'} specifies whether the upper or lower triangular decomposition should be returned. By default, `uplo` = 'U'.
+
+`U, piv = torch.pstrf(A, 'U')` returns the upper triangular Cholesky decomposition of `A`.
+
+`L, piv = torch.pstrf(A, 'L')` returns the lower triangular Cholesky decomposition of `A`.
+
+If tensors `res` and `piv` (an `IntTensor`) are provided, the resulting decomposition will be stored therein.
+
+```lua
+> A = torch.Tensor({
+    {1.2705,  0.9971,  0.4948,  0.1389,  0.2381},
+    {0.9971,  0.9966,  0.6752,  0.0686,  0.1196},
+    {0.4948,  0.6752,  1.1434,  0.0314,  0.0582},
+    {0.1389,  0.0686,  0.0314,  0.0270,  0.0526},
+    {0.2381,  0.1196,  0.0582,  0.0526,  0.3957}})
+
+> U, piv = torch.pstrf(A)
+> U
+ 1.1272  0.4390  0.2112  0.8846  0.1232
+ 0.0000  0.9750 -0.0354  0.2942 -0.0233
+ 0.0000  0.0000  0.5915 -0.0961  0.0435
+ 0.0000  0.0000  0.0000  0.3439 -0.0854
+ 0.0000  0.0000  0.0000  0.0000  0.0456
+[torch.DoubleTensor of size 5x5]
+
+> piv
+ 1
+ 3
+ 5
+ 2
+ 4
+[torch.IntTensor of size 5]
+
+> Ap = U:t() * U
+> Ap
+ 1.2705  0.4948  0.2381  0.9971  0.1389
+ 0.4948  1.1434  0.0582  0.6752  0.0314
+ 0.2381  0.0582  0.3957  0.1196  0.0526
+ 0.9971  0.6752  0.1196  0.9966  0.0686
+ 0.1389  0.0314  0.0526  0.0686  0.0270
+[torch.DoubleTensor of size 5x5]
+
+> -- Permute rows and columns
+> Ap:indexCopy(1, piv:long(), Ap:clone())
+> Ap:indexCopy(2, piv:long(), Ap:clone())
+> (Ap - A):norm()
+1.5731560566382e-16
+```
+
+<a name="torch.potrs"></a>
+### torch.potrs([res,] B, chol [, 'U' or 'L'] ) ###
+
+Returns the solution to linear system `AX = B` using the Cholesky decomposition `chol` of 2D `Tensor` `A`.
+
+Square matrix `chol` should be triangular; and, righthand side matrix `B` should be of full rank.
+
+Optional character `uplo` = {'U', 'L'} specifies matrix `chol` as either upper or lower triangular; and, by default, equals 'U'.
+
+If `Tensor` `res` is provided, the resulting decomposition will be stored therein.
+
+```lua
+> A = torch.Tensor({
+    {1.2705,  0.9971,  0.4948,  0.1389,  0.2381},
+    {0.9971,  0.9966,  0.6752,  0.0686,  0.1196},
+    {0.4948,  0.6752,  1.1434,  0.0314,  0.0582},
+    {0.1389,  0.0686,  0.0314,  0.0270,  0.0526},
+    {0.2381,  0.1196,  0.0582,  0.0526,  0.3957}})
+
+> B = torch.Tensor({
+    {0.6219,  0.3439,  0.0431},
+    {0.5642,  0.1756,  0.0153},
+    {0.2334,  0.8594,  0.4103},
+    {0.7556,  0.1966,  0.9637},
+    {0.1420,  0.7185,  0.7476}})
+
+> chol = torch.potrf(A)
+> chol
+ 1.1272  0.8846  0.4390  0.1232  0.2112
+ 0.0000  0.4626  0.6200 -0.0874 -0.1453
+ 0.0000  0.0000  0.7525  0.0419  0.0738
+ 0.0000  0.0000  0.0000  0.0491  0.2199
+ 0.0000  0.0000  0.0000  0.0000  0.5255
+[torch.DoubleTensor of size 5x5]
+
+> solve = torch.potrs(B, chol)
+> solve
+  12.1945   61.8622   92.6882
+ -11.1782  -97.0303 -138.4874
+ -15.3442  -76.6562 -116.8218
+   6.1930   13.5238   25.2056
+  29.9678  251.7346  360.2301
+[torch.DoubleTensor of size 5x3]
+
+> A*solve
+ 0.6219  0.3439  0.0431
+ 0.5642  0.1756  0.0153
+ 0.2334  0.8594  0.4103
+ 0.7556  0.1966  0.9637
+ 0.1420  0.7185  0.7476
+[torch.DoubleTensor of size 5x3]
+
+> B:dist(A*solve)
+4.6783066076306e-14
+```
+
+
+<a name="torch.potri"></a>
+### torch.potri([res,] chol [, 'U' or 'L'] ) ###
+
+Returns the inverse of 2D `Tensor` `A` given its Cholesky decomposition `chol`.
+
+Square matrix `chol` should be triangular.
+
+Optional character `uplo` = {'U', 'L'} specifies matrix `chol` as either upper or lower triangular; and, by default, equals 'U'.
+
+If `Tensor` `res` is provided, the resulting inverse will be stored therein.
+
+```lua
+> A = torch.Tensor({
+    {1.2705,  0.9971,  0.4948,  0.1389,  0.2381},
+    {0.9971,  0.9966,  0.6752,  0.0686,  0.1196},
+    {0.4948,  0.6752,  1.1434,  0.0314,  0.0582},
+    {0.1389,  0.0686,  0.0314,  0.0270,  0.0526},
+    {0.2381,  0.1196,  0.0582,  0.0526,  0.3957}})
+
+> chol = torch.potrf(A)
+> chol
+ 1.1272  0.8846  0.4390  0.1232  0.2112
+ 0.0000  0.4626  0.6200 -0.0874 -0.1453
+ 0.0000  0.0000  0.7525  0.0419  0.0738
+ 0.0000  0.0000  0.0000  0.0491  0.2199
+ 0.0000  0.0000  0.0000  0.0000  0.5255
+[torch.DoubleTensor of size 5x5]
+
+> inv = torch.potri(chol)
+> inv
+  42.2781  -39.0824    8.3019 -133.4998    2.8980
+ -39.0824   38.1222   -8.7468  119.4247   -2.5944
+   8.3019   -8.7468    3.1104  -25.1405    0.5327
+-133.4998  119.4247  -25.1405  480.7511  -15.9747
+   2.8980   -2.5944    0.5327  -15.9747    3.6127
+[torch.DoubleTensor of size 5x5]
+
+> inv:dist(torch.inverse(A))
+2.8525852877633e-12
+```
+
+
+<a name="torch.gels"></a>
+### torch.gels([resb, resa,] b, a) ###
+
+Solution of least squares and least norm problems for a full rank `m × n` matrix `A`.
+
+  * If `n ≤ m`, then solve `||AX-B||_F`.
+  * If `n > m` , then solve `min ||X||_F` s.t. `AX = B`.
+
+On return, the first `n` rows of the matrix `x` contain the solution and the rest contains residual information.
+Square root of sum squares of elements of each column of `x` starting at row `n + 1` is the residual for corresponding column.
+
+Note: Irrespective of the original strides, the returned matrices `resb` and `resa` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor({{ 1.44, -9.96, -7.55,  8.34,  7.08, -5.45},
+                  {-7.84, -0.28,  3.24,  8.09,  2.52, -5.70},
+                  {-4.39, -3.24,  6.27,  5.28,  0.74, -1.19},
+                  {4.53,  3.83, -6.64,  2.06, -2.47,  4.70}}):t()
+
+> b = torch.Tensor({{8.58,  8.26,  8.48, -5.28,  5.72,  8.93},
+                  {9.35, -4.43, -0.70, -0.26, -7.36, -2.52}}):t()
+
+> a
+ 1.4400 -7.8400 -4.3900  4.5300
+-9.9600 -0.2800 -3.2400  3.8300
+-7.5500  3.2400  6.2700 -6.6400
+ 8.3400  8.0900  5.2800  2.0600
+ 7.0800  2.5200  0.7400 -2.4700
+-5.4500 -5.7000 -1.1900  4.7000
+[torch.DoubleTensor of dimension 6x4]
+
+> b
+ 8.5800  9.3500
+ 8.2600 -4.4300
+ 8.4800 -0.7000
+-5.2800 -0.2600
+ 5.7200 -7.3600
+ 8.9300 -2.5200
+[torch.DoubleTensor of dimension 6x2]
+
+> x = torch.gels(b, a)
+> x
+ -0.4506   0.2497
+ -0.8492  -0.9020
+  0.7066   0.6323
+  0.1289   0.1351
+ 13.1193  -7.4922
+ -4.8214  -7.1361
+[torch.DoubleTensor of dimension 6x2]
+
+> b:dist(a*x:narrow(1, 1, 4))
+17.390200628863
+
+> math.sqrt(x:narrow(1, 5, 2):pow(2):sumall())
+17.390200628863
+```
+
+
+<a name="torch.symeig"></a>
+### torch.symeig([rese, resv,] a [, 'N' or 'V'] [, 'U' or 'L']) ###
+
+`e, V = torch.symeig(A)` returns eigenvalues and eigenvectors of a symmetric real matrix `A`.
+
+`A` and `V` are `m × m` matrices and `e` is a `m` dimensional vector.
+
+This function calculates all eigenvalues (and vectors) of `A` such that `A = V diag(e) V'`.
+
+Third argument defines computation of eigenvectors or eigenvalues only.
+If it is `'N'`, only eigenvalues are computed.
+If it is `'V'`, both eigenvalues and eigenvectors are computed.
+
+Since the input matrix `A` is supposed to be symmetric, only upper triangular portion is used by default.
+If the 4th argument is `'L'`, then lower triangular portion is used.
+
+Note: Irrespective of the original strides, the returned matrix `V` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor({{ 1.96,  0.00,  0.00,  0.00,  0.00},
+                  {-6.49,  3.80,  0.00,  0.00,  0.00},
+                  {-0.47, -6.39,  4.17,  0.00,  0.00},
+                  {-7.20,  1.50, -1.51,  5.70,  0.00},
+                  {-0.65, -6.34,  2.67,  1.80, -7.10}}):t()
+
+> a
+ 1.9600 -6.4900 -0.4700 -7.2000 -0.6500
+ 0.0000  3.8000 -6.3900  1.5000 -6.3400
+ 0.0000  0.0000  4.1700 -1.5100  2.6700
+ 0.0000  0.0000  0.0000  5.7000  1.8000
+ 0.0000  0.0000  0.0000  0.0000 -7.1000
+[torch.DoubleTensor of dimension 5x5]
+
+> e = torch.symeig(a)
+> e
+-11.0656
+ -6.2287
+  0.8640
+  8.8655
+ 16.0948
+[torch.DoubleTensor of dimension 5]
+
+> e, v = torch.symeig(a, 'V')
+> e
+-11.0656
+ -6.2287
+  0.8640
+  8.8655
+ 16.0948
+[torch.DoubleTensor of dimension 5]
+
+> v
+-0.2981 -0.6075  0.4026 -0.3745  0.4896
+-0.5078 -0.2880 -0.4066 -0.3572 -0.6053
+-0.0816 -0.3843 -0.6600  0.5008  0.3991
+-0.0036 -0.4467  0.4553  0.6204 -0.4564
+-0.8041  0.4480  0.1725  0.3108  0.1622
+[torch.DoubleTensor of dimension 5x5]
+
+> v*torch.diag(e)*v:t()
+ 1.9600 -6.4900 -0.4700 -7.2000 -0.6500
+-6.4900  3.8000 -6.3900  1.5000 -6.3400
+-0.4700 -6.3900  4.1700 -1.5100  2.6700
+-7.2000  1.5000 -1.5100  5.7000  1.8000
+-0.6500 -6.3400  2.6700  1.8000 -7.1000
+[torch.DoubleTensor of dimension 5x5]
+
+> a:dist(torch.triu(v*torch.diag(e)*v:t()))
+1.0219480822443e-14
+```
+
+
+<a name="torch.eig"></a>
+### torch.eig([rese, resv,] a [, 'N' or 'V']) ###
+
+`e, V = torch.eig(A)` returns eigenvalues and eigenvectors of a general real square matrix `A`.
+
+`A` and `V` are `m × m` matrices and `e` is a `m` dimensional vector.
+
+This function calculates all right eigenvalues (and vectors) of `A` such that `A = V diag(e) V'`.
+
+Third argument defines computation of eigenvectors or eigenvalues only.
+If it is `'N'`, only eigenvalues are computed.
+If it is `'V'`, both eigenvalues and eigenvectors are computed.
+
+The eigenvalues returned follow [LAPACK convention](https://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-16EB5901-5644-4DA6-A332-A052309010C4.htm) and are returned as complex (real/imaginary) pairs of numbers (`2 * m` dimensional `Tensor`).
+
+Note: Irrespective of the original strides, the returned matrix `V` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor({{ 1.96,  0.00,  0.00,  0.00,  0.00},
+                  {-6.49,  3.80,  0.00,  0.00,  0.00},
+                  {-0.47, -6.39,  4.17,  0.00,  0.00},
+                  {-7.20,  1.50, -1.51,  5.70,  0.00},
+                  {-0.65, -6.34,  2.67,  1.80, -7.10}}):t()
+
+> a
+ 1.9600 -6.4900 -0.4700 -7.2000 -0.6500
+ 0.0000  3.8000 -6.3900  1.5000 -6.3400
+ 0.0000  0.0000  4.1700 -1.5100  2.6700
+ 0.0000  0.0000  0.0000  5.7000  1.8000
+ 0.0000  0.0000  0.0000  0.0000 -7.1000
+[torch.DoubleTensor of dimension 5x5]
+
+> b = a + torch.triu(a, 1):t()
+> b
+
+  1.9600 -6.4900 -0.4700 -7.2000 -0.6500
+ -6.4900  3.8000 -6.3900  1.5000 -6.3400
+ -0.4700 -6.3900  4.1700 -1.5100  2.6700
+ -7.2000  1.5000 -1.5100  5.7000  1.8000
+ -0.6500 -6.3400  2.6700  1.8000 -7.1000
+[torch.DoubleTensor of dimension 5x5]
+
+> e = torch.eig(b)
+> e
+ 16.0948   0.0000
+-11.0656   0.0000
+ -6.2287   0.0000
+  0.8640   0.0000
+  8.8655   0.0000
+[torch.DoubleTensor of dimension 5x2]
+
+> e, v = torch.eig(b, 'V')
+> e
+ 16.0948   0.0000
+-11.0656   0.0000
+ -6.2287   0.0000
+  0.8640   0.0000
+  8.8655   0.0000
+[torch.DoubleTensor of dimension 5x2]
+
+> v
+-0.4896  0.2981 -0.6075 -0.4026 -0.3745
+ 0.6053  0.5078 -0.2880  0.4066 -0.3572
+-0.3991  0.0816 -0.3843  0.6600  0.5008
+ 0.4564  0.0036 -0.4467 -0.4553  0.6204
+-0.1622  0.8041  0.4480 -0.1725  0.3108
+[torch.DoubleTensor of dimension 5x5]
+
+> v * torch.diag(e:select(2, 1))*v:t()
+ 1.9600 -6.4900 -0.4700 -7.2000 -0.6500
+-6.4900  3.8000 -6.3900  1.5000 -6.3400
+-0.4700 -6.3900  4.1700 -1.5100  2.6700
+-7.2000  1.5000 -1.5100  5.7000  1.8000
+-0.6500 -6.3400  2.6700  1.8000 -7.1000
+[torch.DoubleTensor of dimension 5x5]
+
+> b:dist(v * torch.diag(e:select(2, 1)) * v:t())
+3.5423944346685e-14
+```
+
+
+<a name="torch.svd"></a>
+### torch.svd([resu, ress, resv,] a [, 'S' or 'A']) ###
+
+`U, S, V = torch.svd(A)` returns the singular value decomposition of a real matrix `A` of size `n × m` such that `A = USV'`.
+
+`U` is `n × n`, `S` is `n × m` and `V` is `m × m`.
+
+The last argument, if it is string, represents the number of singular values to be computed.
+`'S'` stands for *some* and `'A'` stands for *all*.
+
+Note: Irrespective of the original strides, the returned matrix `U` will be transposed, i.e. with strides `1, n` instead of `n, 1`.
+
+```lua
+> a = torch.Tensor({{8.79,  6.11, -9.15,  9.57, -3.49,  9.84},
+                  {9.93,  6.91, -7.93,  1.64,  4.02,  0.15},
+                  {9.83,  5.04,  4.86,  8.83,  9.80, -8.99},
+                  {5.45, -0.27,  4.85,  0.74, 10.00, -6.02},
+                  {3.16,  7.98,  3.01,  5.80,  4.27, -5.31}}):t()
+
+> a
+  8.7900   9.9300   9.8300   5.4500   3.1600
+  6.1100   6.9100   5.0400  -0.2700   7.9800
+ -9.1500  -7.9300   4.8600   4.8500   3.0100
+  9.5700   1.6400   8.8300   0.7400   5.8000
+ -3.4900   4.0200   9.8000  10.0000   4.2700
+  9.8400   0.1500  -8.9900  -6.0200  -5.3100
+
+> u, s, v = torch.svd(a)
+> u
+-0.5911  0.2632  0.3554  0.3143  0.2299
+-0.3976  0.2438 -0.2224 -0.7535 -0.3636
+-0.0335 -0.6003 -0.4508  0.2334 -0.3055
+-0.4297  0.2362 -0.6859  0.3319  0.1649
+-0.4697 -0.3509  0.3874  0.1587 -0.5183
+ 0.2934  0.5763 -0.0209  0.3791 -0.6526
+[torch.DoubleTensor of dimension 6x5]
+
+> s
+ 27.4687
+ 22.6432
+  8.5584
+  5.9857
+  2.0149
+[torch.DoubleTensor of dimension 5]
+
+> v
+-0.2514  0.8148 -0.2606  0.3967 -0.2180
+-0.3968  0.3587  0.7008 -0.4507  0.1402
+-0.6922 -0.2489 -0.2208  0.2513  0.5891
+-0.3662 -0.3686  0.3859  0.4342 -0.6265
+-0.4076 -0.0980 -0.4933 -0.6227 -0.4396
+[torch.DoubleTensor of dimension 5x5]
+
+> u * torch.diag(s) * v:t()
+  8.7900   9.9300   9.8300   5.4500   3.1600
+  6.1100   6.9100   5.0400  -0.2700   7.9800
+ -9.1500  -7.9300   4.8600   4.8500   3.0100
+  9.5700   1.6400   8.8300   0.7400   5.8000
+ -3.4900   4.0200   9.8000  10.0000   4.2700
+  9.8400   0.1500  -8.9900  -6.0200  -5.3100
+[torch.DoubleTensor of dimension 6x5]
+
+> a:dist(u * torch.diag(s) * v:t())
+2.8923773593204e-14
+```
+
+
+<a name="torch.inverse"></a>
+### torch.inverse([res,] x) ###
+
+Computes the inverse of square matrix `x`.
+
+`torch.inverse(x)` returns the result as a new matrix.
+
+`torch.inverse(y, x)` puts the result in `y`.
+
+Note: Irrespective of the original strides, the returned matrix `y` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> x = torch.rand(10, 10)
+> y = torch.inverse(x)
+> z = x * y
+> z
+ 1.0000 -0.0000  0.0000 -0.0000  0.0000  0.0000  0.0000 -0.0000  0.0000  0.0000
+ 0.0000  1.0000 -0.0000 -0.0000  0.0000  0.0000 -0.0000 -0.0000 -0.0000  0.0000
+ 0.0000 -0.0000  1.0000 -0.0000  0.0000  0.0000 -0.0000 -0.0000  0.0000  0.0000
+ 0.0000 -0.0000 -0.0000  1.0000 -0.0000  0.0000  0.0000 -0.0000 -0.0000  0.0000
+ 0.0000 -0.0000  0.0000 -0.0000  1.0000  0.0000  0.0000 -0.0000 -0.0000  0.0000
+ 0.0000 -0.0000  0.0000 -0.0000  0.0000  1.0000  0.0000 -0.0000 -0.0000  0.0000
+ 0.0000 -0.0000  0.0000 -0.0000  0.0000  0.0000  1.0000 -0.0000  0.0000  0.0000
+ 0.0000 -0.0000 -0.0000 -0.0000  0.0000  0.0000  0.0000  1.0000  0.0000  0.0000
+ 0.0000 -0.0000 -0.0000 -0.0000  0.0000  0.0000 -0.0000 -0.0000  1.0000  0.0000
+ 0.0000 -0.0000  0.0000 -0.0000  0.0000  0.0000  0.0000 -0.0000  0.0000  1.0000
+[torch.DoubleTensor of dimension 10x10]
+
+> torch.max(torch.abs(z - torch.eye(10))) -- Max nonzero
+2.3092638912203e-14
+```
+
+
+<a name="torch.qr"></a>
+### torch.qr([q, r], x) ###
+
+Compute a QR decomposition of the matrix `x`: matrices `q` and `r` such that `x = q * r`, with `q` orthogonal and `r` upper triangular.
+This returns the thin (reduced) QR factorization.
+
+`torch.qr(x)` returns the Q and R components as new matrices.
+
+`torch.qr(q, r, x)` stores them in existing `Tensor`s `q` and `r`.
+
+Note that precision may be lost if the magnitudes of the elements of `x` are large.
+
+Note also that, while it should always give you a valid decomposition, it may not give you the same one across platforms - it will depend on your LAPACK implementation.
+
+Note: Irrespective of the original strides, the returned matrix `q` will be transposed, i.e. with strides `1, m` instead of `m, 1`.
+
+```lua
+> a = torch.Tensor{{12, -51, 4}, {6, 167, -68}, {-4, 24, -41}}
+> a
+  12  -51    4
+   6  167  -68
+  -4   24  -41
+[torch.DoubleTensor of dimension 3x3]
+
+> q, r = torch.qr(a)
+> q
+-0.8571  0.3943  0.3314
+-0.4286 -0.9029 -0.0343
+ 0.2857 -0.1714  0.9429
+[torch.DoubleTensor of dimension 3x3]
+
+> r
+ -14.0000  -21.0000   14.0000
+   0.0000 -175.0000   70.0000
+   0.0000    0.0000  -35.0000
+[torch.DoubleTensor of dimension 3x3]
+
+> (q * r):round()
+  12  -51    4
+   6  167  -68
+  -4   24  -41
+[torch.DoubleTensor of dimension 3x3]
+
+> (q:t() * q):round()
+ 1  0  0
+ 0  1  0
+ 0  0  1
+[torch.DoubleTensor of dimension 3x3]
+```
+
+
+<a name="torch.geqrf"></a>
+### torch.geqrf([m, tau], a) ###
+
+This is a low-level function for calling LAPACK directly.
+You'll generally want to use `torch.qr()` instead.
+
+Computes a QR decomposition of `a`, but without constructing Q and R as explicit separate matrices.
+Rather, this directly calls the underlying LAPACK function `?geqrf` which produces a sequence of 'elementary reflectors'.
+See [LAPACK documentation](https://software.intel.com/en-us/node/521004) for further details.
+
+
+<a name="torch.orgqr"></a>
+### torch.orgqr([q], m, tau) ###
+
+This is a low-level function for calling LAPACK directly.
+You'll generally want to use `torch.qr()` instead.
+
+Constructs a Q matrix from a sequence of elementary reflectors, such as that given by `torch.geqrf`.
+See [LAPACK documentation](https://software.intel.com/en-us/node/521010) for further details.
+
+
+<a name="torch.ormqr"></a>
+### torch.ormqr([res], m, tau, mat [, 'L' or 'R'] [, 'N' or 'T']) ###
+
+Multiply a matrix with `Q` as defined by the elementary reflectors and scalar factors returned by `geqrf`.
+This is a low-level function for calling LAPACK directly.
+You'll generally want to use `torch.qr()` instead.
+
+* `side` (`'L'` or `'R'`) specifies whether `mat` should be left-multiplied, `mat * Q`, or right-multiplied, `Q * mat`.
+* `trans` (`'N'` or `'T'`) specifies whether `Q` should be transposed before being multiplied.
+
+See [LAPACK documentation](https://software.intel.com/en-us/node/521011) for further details.
+
+
+<a name="torch.logical.dok"></a>
+## Logical Operations on `Tensor`s ##
+
+These functions implement logical comparison operators that take a `Tensor` as input and another `Tensor` or a number as the comparison target.
+They return a `ByteTensor` in which each element is `0` or `1` indicating if the comparison for the corresponding element was `false` or `true` respectively.
+
+
+<a name="torch.lt"></a>
+### torch.lt(a, b) ###
+
+Implements `<` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+<a name="torch.lt"></a>
+### torch.le(a, b) ###
+
+Implements `<=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+<a name="torch.lt"></a>
+### torch.gt(a, b) ###
+
+Implements `>` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+<a name="torch.lt"></a>
+### torch.ge(a, b) ###
+
+Implements `>=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+<a name="torch.lt"></a>
+### torch.eq(a, b) ###
+
+Implements `==` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+<a name="torch.lt"></a>
+### torch.ne(a, b) ###
+
+Implements `~=` operator comparing each element in `a` with `b` (if `b` is a number) or each element in `a` with corresponding element in `b`.
+
+
+### torch.all(a) ###
+### torch.any(a) ###
+
+Additionally, `any` and `all` logically sum a `ByteTensor` returning `true` if any or all elements are logically true respectively.
+Note that logically true here is meant in the C sense (zero is `false`, non-zero is `true`) such as the output of the `Tensor` element-wise logical operations.
+
+```lua
+> a = torch.rand(10)
+> b = torch.rand(10)
+> a
+ 0.5694
+ 0.5264
+ 0.3041
+ 0.4159
+ 0.1677
+ 0.7964
+ 0.0257
+ 0.2093
+ 0.6564
+ 0.0740
+[torch.DoubleTensor of dimension 10]
+
+> b
+ 0.2950
+ 0.4867
+ 0.9133
+ 0.1291
+ 0.1811
+ 0.3921
+ 0.7750
+ 0.3259
+ 0.2263
+ 0.1737
+[torch.DoubleTensor of dimension 10]
+
+> torch.lt(a, b)
+ 0
+ 0
+ 1
+ 0
+ 1
+ 0
+ 1
+ 1
+ 0
+ 1
+[torch.ByteTensor of dimension 10]
+
+> torch.eq(a, b)
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+[torch.ByteTensor of dimension 10]
+
+> torch.ne(a, b)
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+[torch.ByteTensor of dimension 10]
+
+> torch.gt(a, b)
+ 1
+ 1
+ 0
+ 1
+ 0
+ 1
+ 0
+ 0
+ 1
+ 0
+[torch.ByteTensor of dimension 10]
+
+> a[torch.gt(a, b)] = 10
+> a
+ 10.0000
+ 10.0000
+  0.3041
+ 10.0000
+  0.1677
+ 10.0000
+  0.0257
+  0.2093
+ 10.0000
+  0.0740
+[torch.DoubleTensor of dimension 10]
+
+> a[torch.gt(a, 1)] = -1
+> a
+-1.0000
+-1.0000
+ 0.3041
+-1.0000
+ 0.1677
+-1.0000
+ 0.0257
+ 0.2093
+-1.0000
+ 0.0740
+[torch.DoubleTensor of dimension 10]
+
+> a = torch.ones(3):byte()
+> torch.all(a)
+true
+
+> a[2] = 0
+> torch.all(a)
+false
+
+> torch.any(a)
+true
+
+> a:zero()
+> torch.any(a)
+false
+```
diff --git a/doc/memoryfile.md b/doc/memoryfile.md
new file mode 100644
index 0000000..acd4426
--- /dev/null
+++ b/doc/memoryfile.md
@@ -0,0 +1,42 @@
+<a name="torch.MemoryFile.dok"></a>
+# MemoryFile #
+
+Parent classes: [File](file.md)
+
+A `MemoryFile` is a particular `File` which is able to perform basic
+read/write operations on a buffer in `RAM`. It implements all methods
+described in [File](file.md).
+
+The data of the `File` is contained into a `NULL` terminated
+[CharStorage](storage.md).
+
+<a name="torch.MemoryFile"></a>
+### torch.MemoryFile([mode]) ###
+
+_Constructor_ which returns a new `MemoryFile` object using `mode`. Valid
+`mode` are `"r"` (read), `"w"` (write) or `"rw"` (read-write). Default is `"rw"`.
+
+
+<a name="torch.MemoryFile"></a>
+### torch.MemoryFile(storage, mode) ###
+
+_Constructor_ which returns a new `MemoryFile` object, using the given
+[storage](storage.md) (which must be a `CharStorage`) and `mode`. Valid
+`mode` are `"r"` (read), `"w"` (write) or `"rw"` (read-write). The last character
+in this storage _must_ be `NULL` or an error will be generated. This allows
+to read existing memory. If used for writing, note that the `storage` might
+be resized by this class if needed.
+
+<a name="torch.MemoryFile.storage"></a>
+### [CharStorage] storage() ###
+
+Returns the [storage](storage.md) which contains all the data of the
+`File` (note: this is _not_ a copy, but a _reference_ on this storage). The
+size of the storage is the size of the data in the `File`, plus one, the
+last character being `NULL`.
+
+<a name="torch.MemoryFile.longSize"></a>
+### longSize([size]) ###
+
+Longs will be written and read from the file as `size` bytes long, which
+can be 0, 4 or 8. 0 means system default.
diff --git a/doc/pipefile.md b/doc/pipefile.md
new file mode 100644
index 0000000..fdba14c
--- /dev/null
+++ b/doc/pipefile.md
@@ -0,0 +1,22 @@
+<a name="torch.PipeFile.dok"></a>
+# PipeFile #
+
+Parent classes: [DiskFile](diskfile.md)
+
+A `PipeFile` is a particular `File` which is able to perform basic read/write operations
+on a command pipe. It implements all methods described in [DiskFile](diskfile.md) and [File](file.md).
+
+The file might be open in read or write mode, depending on the parameter
+`mode` (which can take the value `"r"` or `"w"`) 
+given to the [torch.PipeFile(fileName, mode)](#torch.PipeFile). Read-write mode is not allowed.
+
+<a name="torch.PipeFile"></a>
+### torch.PipeFile(command, [mode], [quiet]) ###
+
+_Constructor_ which executes `command` by opening a pipe in read or write
+`mode`. Valid values for `mode` are `"r"` (read) or `"w"` (write). Default is read
+mode.
+
+If (and only if) `quiet` is `true`, no error will be raised in case of
+problem opening the file: instead `nil` will be returned.
+
diff --git a/doc/random.md b/doc/random.md
new file mode 100644
index 0000000..7097edb
--- /dev/null
+++ b/doc/random.md
@@ -0,0 +1,173 @@
+<a name="torch.random.dok"></a>
+# Random Numbers #
+
+Torch provides accurate mathematical random generation, based on
+[Mersenne Twister](http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html)
+random number generator.
+
+<a name=":torch.gen.dok"></a>
+## Generator handling ##
+
+All of the below functions, as well as [randn()](maths.md#torch.randn),
+[rand()](maths.md#torch.rand) and [randperm()](maths.md#torch.randperm),
+take as optional first argument a random number generator.
+If this argument is not provided, the default global RNG is used.
+
+A non-global RNG can be obtained with [Generator()](#torch.Generator).
+Each RNG has its own state, independent from all other RNG's states.
+
+```
+-- Seed the global RNG
+> torch.manualSeed(0)
+> torch.random()
+2357136044
+-- Creates and seed a non-global RNG
+> gen = torch.Generator()
+> torch.manualSeed(gen, 0)
+> torch.random(gen)
+2357136044
+> torch.random(gen)
+2546248239
+> torch.random()
+2546248239
+```
+
+<a name=":torch.seed.dok"></a>
+## Seed Handling ##
+
+The random number generator is provided with a random seed via
+[seed()](#torch.seed) when torch is being initialized. It can be
+reinitialized using [seed()](#torch.seed) or [manualSeed()](#torch.manualSeed).
+
+Initial seed can be obtained using [initialSeed()](#torch.initialSeed).
+
+Setting a particular seed allows the user to (re)-generate a particular sequence
+of random numbers. Example:
+
+```
+> torch.manualSeed(123)
+> = torch.uniform()
+0.69646918727085
+> return  torch.uniform()
+0.71295532141812
+> return  torch.uniform()
+0.28613933874294
+> torch.manualSeed(123)
+> return  torch.uniform()
+0.69646918727085
+> return  torch.uniform()
+0.71295532141812
+> return  torch.uniform()
+0.28613933874294
+> torch.manualSeed(torch.initialSeed())
+> return  torch.uniform()
+0.69646918727085
+> return  torch.uniform()
+0.71295532141812
+> return  torch.uniform()
+0.28613933874294
+```
+
+To regenerate a sequence of random numbers starting from a specific point
+in the sequence, one can save the state of the random number generator
+using [getRNGState()](#torch.getRNGState) and then reset the random number
+generator to that state using [setRNGState()](#torch.setRNGState). Example:
+
+```
+> torch.manualSeed(123)
+> = torch.uniform()
+0.69646918727085
+> s = torch.getRNGState()
+> return  torch.uniform()
+0.71295532141812
+> return  torch.uniform()
+0.28613933874294
+> torch.setRNGState(s)
+> return  torch.uniform()
+0.71295532141812
+> return  torch.uniform()
+0.28613933874294
+```
+
+<a name="torch.Generator"></a>
+### [Generator] Generator() ###
+
+Creates a non-global random generator that carries its own state and can be
+passed as the first argument to any function that generates a random number.
+
+<a name="torch.seed"></a>
+### [number] seed([gen,]) ###
+
+Set the seed of the random number generator using `/dev/urandom`
+(on Windows the time of the computer with granularity of seconds is used).
+Returns the seed obtained.
+
+<a name="torch.manualSeed"></a>
+### manualSeed([gen,] number) ###
+
+Set the seed of the random number generator to the given `number`.
+
+<a name="torch.initialSeed"></a>
+### initialSeed([gen]) ###
+
+Returns the initial seed used to initialize the random generator.
+
+<a name="torch.getRNGState"></a>
+### [Tensor] getRNGState([gen]) ###
+Returns the current state of the random number generator as a torch.ByteTensor.
+This can then be used to set the state of the RNG so that the same sequence of
+random numbers is produced.
+
+<a name="torch.setRNGState"></a>
+### [Tensor] setRNGState([gen,] state) ###
+Set the state of the random number generator. If `state` was obtained earlier
+using `getRNGState` then the random number generator should now generate the
+same numbers as it did from the point where `state` was obtained. This function
+returns its argument, `state`.
+
+<a name="torch.random"></a>
+### [number] random([gen,] [a], [b]) ###
+
+Returns an unsigned 32 bit integer random number from [a,b]. By default `a` is 1 and `b` is 2^32.
+
+<a name="torch.uniform"></a>
+### [number] uniform([gen,] [a],[b]) ###
+
+Returns a random real number according to uniform distribution on [a,b). By default `a` is 0 and `b` is 1.
+
+<a name="torch.normal"></a>
+### [number] normal([gen,] [mean],[stdv]) ###
+
+Returns a random real number according to a normal distribution with the given `mean` and standard deviation `stdv`.
+`stdv` must be positive.
+
+<a name="torch.exponential"></a>
+### [number] exponential([gen,] lambda) ###
+
+Returns a random real number according to the exponential distribution
+''p(x) = lambda * exp(-lambda * x)''
+
+<a name="torch.cauchy"></a>
+### [number] cauchy([gen,] median, sigma) ###
+
+Returns a random real number according to the Cauchy distribution
+''p(x) = sigma/(pi*(sigma^2 + (x-median)^2))''
+
+<a name="torch.logNormal"></a>
+### [number] logNormal([gen,] mean, stdv) ###
+
+Returns a random real number according to the log-normal distribution, with
+the given `mean` and standard deviation `stdv`.
+`stdv` must be positive.
+
+<a name="torch.geometric"></a>
+### [number] geometric([gen,] p) ###
+
+Returns a random integer number according to a geometric distribution
+''p(i) = (1-p) * p^(i-1)''. `p` must satisfy `0 < p < 1`.
+
+<a name="torch.bernoulli"></a>
+### [number] bernoulli([gen,] [p]) ###
+
+Returns `1` with probability `p` and `0` with probability `1-p`. `p` must satisfy `0 <= p <= 1`.
+By default `p` is equal to `0.5`.
diff --git a/doc/serialization.md b/doc/serialization.md
new file mode 100644
index 0000000..bcc6bff
--- /dev/null
+++ b/doc/serialization.md
@@ -0,0 +1,112 @@
+
+<a name="torch.serialization.dok"></a>
+# Serialization #
+
+Torch provides 4 high-level methods to serialize/deserialize arbitrary Lua/Torch objects.
+These functions are just abstractions over the [File](file.md) object, and were created
+for convenience (these are very common routines).
+
+The first two functions are useful to serialize/deserialize data to/from files:
+
+  - `torch.save(filename, object [, format, referenced])`
+  - `[object] torch.load(filename [, format, referenced])`
+
+The next two functions are useful to serialize/deserialize data to/from strings:
+
+  - `[str] torch.serialize(object)`
+  - `[object] torch.deserialize(str)`
+
+Serializing to files is useful to save arbitrary data structures, or share them with other people.
+Serializing to strings is useful to store arbitrary data structures in databases, or 3rd party
+software.
+
+<a name="torch.save"></a>
+### torch.save(filename, object [, format, referenced]) ###
+
+Writes `object` into a file named `filename`. The `format` can be set to
+`ascii` or `binary` (default is binary). Binary format is platform
+dependent, but typically more compact and faster to read/write. The ASCII
+format is platform-independent, and should be used to share data structures
+across platforms. The option `referenced` specifies if
+[object references](file.md#torch.File.referenced) should be tracked or not
+(`true` by default).
+
+```
+-- arbitrary object:
+obj = {
+   mat = torch.randn(10,10),
+   name = '10',
+   test = {
+      entry = 1
+   }
+}
+
+-- save to disk:
+torch.save('test.dat', obj)
+```
+
+<a name="torch.load"></a>
+### [object] torch.load(filename [, format, referenced]) ###
+
+Reads `object` from a file named `filename`.
+The `format` can be set to `ascii`, `binary`, `b32` or `b64` (default is binary).
+Binary format is platform dependent, but typically more compact and faster to read/write.
+Use `b32`/`b64`, instead of `binary`, for loading files saved on a 32/64 bit OS.
+The ASCII format is platform-independent, and may be used to share data structures across platforms.
+The option `referenced` specifies if [object references](file.md#torch.File.referenced) should be tracked or not (`true` by default).
+Note that files written with `referenced` at `true` cannot be loaded with `referenced` at `false`.
+
+```
+-- given serialized object from section above, reload:
+obj = torch.load('test.dat')
+
+print(obj)
+-- will print:
+-- {[mat]  = DoubleTensor - size: 10x10
+--  [name] = string : "10"
+--  [test] = table - size: 0}
+```
+
+<a name="torch.serialize"></a>
+### [str] torch.serialize(object [, format]) ###
+
+Serializes `object` into a string. The `format` can be set
+to `ascii` or `binary` (default is binary). Binary format is platform
+dependent, but typically more compact and faster to read/write. The ASCII
+format is platform-independent, and should be used to share data structures
+across platforms.
+
+```
+-- arbitrary object:
+obj = {
+   mat = torch.randn(10,10),
+   name = '10',
+   test = {
+      entry = 1
+   }
+}
+
+-- serialize:
+str = torch.serialize(obj)
+```
+
+<a name="torch.deserialize"></a>
+### [object] torch.deserialize(str [, format]) ###
+
+Deserializes `object` from a string. The `format` can be set
+to `ascii` or `binary` (default is binary). Binary format is platform
+dependent, but typically more compact and faster to read/write. The ASCII
+format is platform-independent, and should be used to share data structures
+across platforms.
+
+```
+-- given serialized object from section above, deserialize:
+obj = torch.deserialize(str)
+
+print(obj)
+-- will print:
+-- {[mat]  = DoubleTensor - size: 10x10
+--  [name] = string : "10"
+--  [test] = table - size: 0}
+```
+
diff --git a/doc/storage.md b/doc/storage.md
new file mode 100644
index 0000000..6052042
--- /dev/null
+++ b/doc/storage.md
@@ -0,0 +1,300 @@
+<a name="torch.Storage.dok"></a>
+# Storage #
+<a name="torch.CharStorage.dok"></a>
+<a name="torch.ByteStorage.dok"></a>
+<a name="torch.IntStorage.dok"></a>
+<a name="torch.ShortStorage.dok"></a>
+<a name="torch.FloatStorage.dok"></a>
+<a name="torch.LongStorage.dok"></a>
+<a name="torch.DoubleStorage.dok"></a>
+
+_Storages_ are basically a way for `Lua` to access memory of a `C` pointer
+or array. _Storages_ can also [map the contents of a file to memory](#__torch.StorageMap).
+A `Storage` is an array of _basic_ `C` types. For arrays of `Torch` objects,
+use the `Lua` tables.
+
+Several `Storage` classes for all the basic `C` types exist and have the
+following self-explanatory names: `ByteStorage`, `CharStorage`, `ShortStorage`,
+`IntStorage`, `LongStorage`, `FloatStorage`, `DoubleStorage`.
+
+Note that `ByteStorage` and `CharStorage` represent both arrays of bytes. `ByteStorage` represents an array of
+_unsigned_ chars, while `CharStorage` represents an array of _signed_ chars.
+
+Conversions between two `Storage` types might be done using `copy`:
+```lua
+x = torch.IntStorage(10):fill(1)
+y = torch.DoubleStorage(10):copy(x)
+```
+
+[Classical storages](#torch.Storage) are [serializable](file.md#torch.File.serialization).
+[Storages mapping a file](#__torch.StorageMap) are also [serializable](file.md#torch.File.serialization),
+but _will be saved as a normal storage_. High-level serialization commands are described in the
+[serialization](serialization.md) section.
+
+An alias `torch.Storage()` is made over your preferred Storage type,
+controlled by the
+[torch.setdefaulttensortype](utility.md#torch.setdefaulttensortype)
+function. By default, this "points" on `torch.DoubleStorage`.
+
+## Constructors and Access Methods ##
+
+<a name="torch.Storage"></a>
+### torch.TYPEStorage([size [, ptr]]) ###
+
+Returns a new `Storage` of type `TYPE`. Valid `TYPE` are `Byte`, `Char`, `Short`,
+`Int`, `Long`, `Float`, and `Double`. If `size` is given, resize the
+`Storage` accordingly, else create an empty `Storage`.
+
+Example:
+```lua
+-- Creates a Storage of 10 double:
+x = torch.DoubleStorage(10)
+```
+
+The data in the `Storage` is _uninitialized_.
+
+The optional second argument `ptr` is a number whose value is a
+pointer to a memory chunk of size `size*sizeof(TYPE)` (for example coming from the
+[`torch.data()`](tensor.md#result-datatensor-asnumber)
+method). The caller remains responsible for the memory chunk and must ensure it remains stable as the storage only keeps a pointer to it (the memory is _not_ copied and will _not_ be freed at storage deletion).
+
+<a name="torch.Storage"></a>
+### torch.TYPEStorage(table) ###
+
+`table` is assumed to be a Lua array of numbers. The constructor returns a new storage of the specified `TYPE`,
+of the size of the table, containing all the table elements converted to `TYPE`.
+
+Example:
+```lua
+> = torch.IntStorage({1,2,3,4})
+
+ 1
+ 2
+ 3
+ 4
+[torch.IntStorage of size 4]
+```
+
+<a name="torch.Storage"></a>
+### torch.TYPEStorage(storage [, offset [, size]]) ###
+
+Returns a new `Storage` of type `TYPE`, which is a view on the first argument. The first argument must be of the same type `TYPE`. An optional `offset` can be provided (defaults to 1). An optional `size` can also be provided to restrict the size of the new storage (defaults to `storage:size()-(offset-1)`).
+
+Example:
+```lua
+-- Creates a Storage of 10 double:
+> x = torch.DoubleStorage(10)
+
+-- Creates a view on this Storage, starting at offset 3, with a size of 5:
+> y = torch.DoubleStorage(x, 3, 5)
+
+-- Modifying elements of y will modify x:
+> x:fill(0)
+> y:fill(1)
+> print(x)
+ 0
+ 0
+ 1
+ 1
+ 1
+ 1
+ 1
+ 0
+ 0
+ 0
+[torch.DoubleStorage of size 10]
+```
+
+<a name="torch.Storage"></a>
+### torch.TYPEStorage(filename [, shared [, size [, sharedMem]]]) ###
+<a name="__torch.StorageMap"></a>
+
+Returns a new kind of `Storage` which maps the contents of the given
+`filename` to memory. Valid `TYPE` are `Byte`, `Char`, `Short`, `Int`, `Long`,
+`Float`, and `Double`. If the optional boolean argument `shared` is `true`,
+the mapped memory is shared amongst all processes on the computer.
+
+When `shared` is `true`, the file must be accessible in read-write mode. Any
+changes on the storage will be written in the file. The changes might be written
+only after destruction of the storage.
+
+When `shared` is `false` (or not provided), the file must be at least
+readable. Any changes on the storage will not affect the file. Note:
+changes made on the file after creation of the storage have an unspecified
+effect on the storage contents.
+
+If `size` is specified, it is the [size](#torch.Storage.size) of the returned
+`Storage` (in elements). In this case, if `shared` is `false` then the file must
+already contain at least
+```lua
+size*(size of TYPE)
+```
+bytes. If `shared` is `true` then the file will be created if necessary, and
+extended if necessary to that many bytes in length.
+
+If `size` is not specified then the [size](#torch.Storage.size) of the returned
+`Storage`  will be
+```lua
+(size of file in byte)/(size of TYPE)
+```
+elements, provided a non-empty file already exists.
+
+If `sharedMem` is `true`, the file will be created (or mapped) in the shared
+memory area using [`shm_open()`](http://linux.die.net/man/3/shm_open). On Linux systems
+this is implemented via the `/dev/shm` partition, which resides in RAM, for interprocess communication.
+
+
+Example:
+```lua
+$ echo "Hello World" > hello.txt
+$ lua
+Lua 5.1.3  Copyright (C) 1994-2008 Lua.org, PUC-Rio
+> require 'torch'
+> x = torch.CharStorage('hello.txt')
+> = x
+  72
+ 101
+ 108
+ 108
+ 111
+  32
+  87
+ 111
+ 114
+ 108
+ 100
+  10
+[torch.CharStorage of size 12]
+
+> = x:string()
+Hello World
+
+> = x:fill(42):string()
+************
+>
+$ cat hello.txt
+Hello World
+$ lua
+Lua 5.1.3  Copyright (C) 1994-2008 Lua.org, PUC-Rio
+> require 'torch'
+> x = torch.CharStorage('hello.txt', true)
+> = x:string()
+Hello World
+
+> x:fill(42)
+>
+$ cat hello.txt
+************
+```
+
+<a name="__torch.StorageSharp"></a>
+### [number] #self ###
+
+Returns the number of elements in the storage. Equivalent to [size()](#torch.Storage.size).
+
+<a name="torch.Storage.__index__"></a>
+### [number] self[index] ###
+
+Returns or set the element at position `index` in the storage. Valid range
+of `index` is 1 to [size()](#torch.Storage.size).
+
+Example:
+```lua
+x = torch.DoubleStorage(10)
+print(x[5])
+```
+
+<a name="torch.Storage.copy"></a>
+### [self] copy(storage) ###
+
+Copy another `storage`. The types of the two storages might be different: in that case
+a conversion of types occurs (which might result, of course, in loss of precision or rounding).
+This method returns self, allowing things like:
+```lua
+x = torch.IntStorage(10):fill(1)
+y = torch.DoubleStorage(10):copy(x) -- y won't be nil!
+```
+
+<a name="torch.Storage.fill"></a>
+### [self] fill(value) ###
+
+Fill the `Storage` with the given value. This method returns self, allowing things like:
+```lua
+x = torch.IntStorage(10):fill(0) -- x won't be nil!
+```
+
+<a name="torch.Storage.resize"></a>
+### [self] resize(size) ###
+
+Resize the storage to the provided `size`. _The new contents are undetermined_.
+
+This function returns self, allowing things like:
+```lua
+x = torch.DoubleStorage(10):fill(1)
+y = torch.DoubleStorage():resize(x:size()):copy(x) -- y won't be nil!
+```
+
+<a name="torch.Storage.size"></a>
+### [number] size() ###
+
+Returns the number of elements in the storage. Equivalent to [#](#__torch.StorageSharp).
+
+<a name="torch.Storage.string"></a>
+### [self] string(str) ###
+
+This function is available only on `ByteStorage` and `CharStorage`.
+
+This method resizes the storage to the length of the provided
+string `str`, and copies the contents of `str` into the storage. The `NULL` terminating character is not copied,
+but `str` might contain `NULL` characters. The method returns the `Storage`.
+```lua
+> x = torch.CharStorage():string("blah blah")
+> print(x)
+  98
+ 108
+  97
+ 104
+  32
+  98
+ 108
+  97
+ 104
+[torch.CharStorage of size 9]
+```
+
+<a name="torch.Storage.string"></a>
+### [string] string() ###
+
+This function is available only on `ByteStorage` and `CharStorage`.
+
+The contents of the storage viewed as a string are returned. The string might contain
+`NULL` characters.
+```lua
+> x = torch.CharStorage():string("blah blah")
+> print(x:string())
+blah blah
+```
+
+## Reference counting methods ##
+
+Storages are reference-counted. It means that each time an object (C or the
+Lua state) needs to keep a reference to a storage, the corresponding
+storage reference counter will be [increased](#torch.Storage.retain). The
+reference counter is [decreased](#torch.Storage.free) when the object
+does not need the storage anymore.
+
+These methods should be used with extreme care. In general, they should
+never be called, except if you know what you are doing, as the handling of
+references is done automatically. They can be useful in threaded
+environments. Note that these methods are atomic operations.
+
+<a name="torch.Storage.retain"></a>
+### retain() ###
+
+Increment the reference counter of the storage.
+
+<a name="torch.Storage.free"></a>
+### free() ###
+
+Decrement the reference counter of the storage. Free the storage if the
+counter is at 0.
diff --git a/doc/tensor.md b/doc/tensor.md
new file mode 100644
index 0000000..931d0a6
--- /dev/null
+++ b/doc/tensor.md
@@ -0,0 +1,2415 @@
+<a name="torch.Tensor.dok"></a>
+# Tensor #
+
+The `Tensor` class is probably the most important class in
+`Torch`. Almost every package depends on this class. It is *__the__*
+class for handling numeric data. As with pretty much anything in
+[Torch7](./../index.md), tensors are
+[serializable](file.md#torch.File.serialization).
+
+__Multi-dimensional matrix__
+
+A `Tensor` is a potentially multi-dimensional matrix. The number of
+dimensions is unlimited; a tensor with many dimensions can be created
+using a [LongStorage](storage.md), as shown below.
+
+Example:
+```lua
+ --- creation of a 4D-tensor 4x5x6x2
+ z = torch.Tensor(4,5,6,2)
+ --- for more dimensions, (here a 6D tensor) one can do:
+ s = torch.LongStorage(6)
+ s[1] = 4; s[2] = 5; s[3] = 6; s[4] = 2; s[5] = 7; s[6] = 3;
+ x = torch.Tensor(s)
+```
+
+The number of dimensions of a `Tensor` can be queried by
+[nDimension()](#torch.nDimension) or
+[dim()](#torch.Tensor.dim). Size of the `i-th` dimension is
+returned by [size(i)](#torch.Tensor.size). A [LongStorage](storage.md)
+containing all the dimensions can be returned by
+[size()](#torch.Tensor.size).
+
+```lua
+> x:nDimension()
+6
+> x:size()
+ 4
+ 5
+ 6
+ 2
+ 7
+ 3
+[torch.LongStorage of size 6]
+```
+
+__Internal data representation__
+
+The actual data of a `Tensor` is contained into a
+[Storage](storage.md). It can be accessed using
+[`storage()`](#torch.storage). While the memory of a
+`Tensor` has to be contained in this unique `Storage`, it might
+not be contiguous: the first position used in the `Storage` is given
+by [`storageOffset()`](#torch.storageOffset) (starting at
+`1`). And the _jump_ needed to go from one element to another
+element in the `i-th` dimension is given by
+[`stride(i)`](#torch.Tensor.stride). In other words, given a 3D
+tensor
+
+```lua
+x = torch.Tensor(7,7,7)
+```
+accessing the element `(3,4,5)` can be done by
+```lua
+> x[3][4][5]
+```
+or equivalently (but slowly!)
+```lua
+> x:storage()[x:storageOffset()
+              +(3-1)*x:stride(1)+(4-1)*x:stride(2)+(5-1)*x:stride(3)]
+```
+One could say that a `Tensor` is a particular way of _viewing_ a
+`Storage`: a `Storage` only represents a chunk of memory, while the
+`Tensor` interprets this chunk of memory as having dimensions:
+```lua
+x = torch.Tensor(4,5)
+s = x:storage()
+for i=1,s:size() do -- fill up the Storage
+  s[i] = i
+end
+> x -- s is interpreted by x as a 2D matrix
+  1   2   3   4   5
+  6   7   8   9  10
+ 11  12  13  14  15
+ 16  17  18  19  20
+[torch.DoubleTensor of dimension 4x5]
+```
+
+Note also that in Torch7 ___elements in the same row___ [elements along the __last__ dimension]
+are contiguous in memory for a matrix [tensor]:
+```lua
+x = torch.Tensor(4,5)
+i = 0
+
+x:apply(function()
+  i = i + 1
+  return i
+end)
+
+> x
+  1   2   3   4   5
+  6   7   8   9  10
+ 11  12  13  14  15
+ 16  17  18  19  20
+[torch.DoubleTensor of dimension 4x5]
+
+> x:stride()
+ 5
+ 1  -- element in the last dimension are contiguous!
+[torch.LongStorage of size 2]
+```
+This is exactly like in C (and not `Fortran`).
+
+__Tensors of different types__
+
+Actually, several types of `Tensor` exists:
+```lua
+ByteTensor -- contains unsigned chars
+CharTensor -- contains signed chars
+ShortTensor -- contains shorts
+IntTensor -- contains ints
+LongTensor -- contains longs
+FloatTensor -- contains floats
+DoubleTensor -- contains doubles
+```
+
+Most numeric operations are implemented _only_ for `FloatTensor` and `DoubleTensor`.
+Other Tensor types are useful if you want to save memory space.
+
+__Default Tensor type__
+
+For convenience, _an alias_ `torch.Tensor` is provided, which allows the user to write
+type-independent scripts, which can then be run after choosing the desired Tensor type with
+a call like
+```lua
+torch.setdefaulttensortype('torch.FloatTensor')
+```
+See [torch.setdefaulttensortype](utility.md#torch.setdefaulttensortype) for more details.
+By default, the alias "points" on `torch.DoubleTensor`.
+
+__Efficient memory management__
+
+_All_ tensor operations in this class do _not_ make any memory copy. All
+these methods transform the existing tensor, or return a new tensor
+referencing _the same storage_. This magical behavior is internally
+obtained by good usage of the [stride()](#torch.Tensor.stride) and
+[storageOffset()](#torch.storageOffset). Example:
+```lua
+x = torch.Tensor(5):zero()
+> x
+0
+0
+0
+0
+0
+[torch.DoubleTensor of dimension 5]
+> x:narrow(1, 2, 3):fill(1) -- narrow() returns a Tensor
+                            -- referencing the same Storage as x
+> x
+ 0
+ 1
+ 1
+ 1
+ 0
+[torch.DoubleTensor of dimension 5]
+```
+
+If you really need to copy a `Tensor`, you can use the [copy()](#torch.Tensor.copy) method:
+```lua
+y = torch.Tensor(x:size()):copy(x)
+```
+Or the convenience method
+```lua
+y = x:clone()
+```
+
+We now describe all the methods for `Tensor`. If you want to specify the Tensor type,
+just replace `Tensor` by the name of the Tensor variant (like `CharTensor`).
+
+<a name="torch.Tensor"></a>
+## Tensor constructors ##
+
+Tensor constructors create new Tensor objects, optionally allocating
+new memory. By default the elements of newly allocated memory are
+not initialized and therefore might contain arbitrary numbers. Here are
+several ways to construct a new `Tensor`.
+
+<a name="torch.Tensor"></a>
+### torch.Tensor() ###
+
+Returns an empty tensor.
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(tensor) ###
+
+Returns a new tensor which references the same
+[Storage](#torch.storage) as the given `tensor`. The
+[size](#torch.Tensor.size), [stride](#torch.Tensor.stride), and
+[storage offset](#torch.storageOffset) are the same as those of the
+given tensor.
+
+The new `Tensor` is now going to "view" the same [storage](storage.md)
+as the given `tensor`. As a result, any modification in the elements
+of the `Tensor` will have an impact on the elements of the given
+`tensor`, and vice-versa. No memory copy!
+
+```lua
+x = torch.Tensor(2,5):fill(3.14)
+> x
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+[torch.DoubleTensor of dimension 2x5]
+
+y = torch.Tensor(x)
+> y
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+[torch.DoubleTensor of dimension 2x5]
+
+y:zero()
+> x -- elements of x are the same as y!
+0 0 0 0 0
+0 0 0 0 0
+[torch.DoubleTensor of dimension 2x5]
+```
+
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(sz1 [, sz2 [, sz3 [, sz4]]]) ###
+
+Create a tensor up to 4 dimensions. The tensor size will be `sz1 x sz2 x sz3 x sz4`.
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(sizes, [strides]) ###
+
+Create a tensor of any number of dimensions. The
+[LongStorage](storage.md) `sizes` gives the size in each dimension of
+the tensor. The optional [LongStorage](storage.md) `strides` gives the
+jump necessary to go from one element to the next one in each
+dimension. Of course, `sizes` and `strides` must have the same
+number of elements. If not given, or if some elements of `strides`
+are _negative_, the [stride()](#torch.Tensor.stride) will be
+computed such that the tensor is as contiguous as possible in memory.
+
+Example, create a 4D 4x4x3x2 tensor:
+```lua
+x = torch.Tensor(torch.LongStorage({4,4,3,2}))
+```
+
+Playing with the strides can give some interesting things:
+```lua
+x = torch.Tensor(torch.LongStorage({4}), torch.LongStorage({0})):zero() -- zeroes the tensor
+x[1] = 1 -- all elements point to the same address!
+> x
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of dimension 4]
+```
+
+Note that _negative strides are not allowed_, and, if given as
+argument when constructing the Tensor, will be interpreted as _choose
+the right stride such that the Tensor is contiguous in memory_.
+
+Note _this method cannot be used to create `torch.LongTensor`s_.
+The constructor [from a storage](tensor.md#torchtensorstorage-storageoffset-sizes-strides) will be used:
+```lua
+a = torch.LongStorage({1,2}) -- We have a torch.LongStorage containing the values 1 and 2
+-- General case for TYPE ~= Long, e.g. for TYPE = Float:
+b = torch.FloatTensor(a)
+-- Creates a new torch.FloatTensor with 2 dimensions, the first of size 1 and the second of size 2
+> b:size()
+ 1
+ 2
+[torch.LongStorage of size 2]
+
+-- Special case of torch.LongTensor
+c = torch.LongTensor(a)
+-- Creates a new torch.LongTensor that uses a as storage and thus contains the values 1 and 2
+> c
+ 1
+ 2
+[torch.LongTensor of size 2]
+```
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(storage, [storageOffset, sizes, [strides]]) ###
+
+Returns a tensor which uses the existing [Storage](storage.md)
+`storage`, starting at position `storageOffset` (>=1).  The size
+of each dimension of the tensor is given by the
+[LongStorage](storage.md) `sizes`.
+
+If only `storage` is provided, it will create a 1D Tensor viewing
+the whole Storage.
+
+The jump necessary to go from one element to the next one in each
+dimension is given by the optional argument [LongStorage](storage.md)
+`strides`. If not given, or if some elements of `strides` are
+negative, the [stride()](#torch.Tensor.stride) will be computed such
+that the tensor is as contiguous as possible in memory.
+
+Any modification in the elements of the `Storage` will have an
+impact on the elements of the new `Tensor`, and vice-versa. There is
+no memory copy!
+
+```lua
+-- creates a storage with 10 elements
+s = torch.Storage(10):fill(1)
+
+-- we want to see it as a 2x5 tensor
+x = torch.Tensor(s, 1, torch.LongStorage{2,5})
+> x
+ 1  1  1  1  1
+ 1  1  1  1  1
+[torch.DoubleTensor of dimension 2x5]
+
+x:zero()
+> s -- the storage contents have been modified
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+[torch.DoubleStorage of size 10]
+```
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(storage, [storageOffset, sz1 [, st1 ... [, sz4 [, st4]]]]) ###
+
+Convenience constructor (for the previous constructor) assuming a
+number of dimensions less than or equal to 4. `szi` is the size in
+the `i-th` dimension, and `sti` is the stride in the `i-th`
+dimension.
+
+<a name="torch.Tensor"></a>
+### torch.Tensor(table) ###
+
+The argument is assumed to be a Lua array of numbers. The constructor
+returns a new Tensor of the size of the table, containing all the table
+elements. The table might be multi-dimensional.
+
+Example:
+```lua
+> torch.Tensor({{1,2,3,4}, {5,6,7,8}})
+ 1  2  3  4
+ 5  6  7  8
+[torch.DoubleTensor of dimension 2x4]
+```
+
+## A note on function calls ##
+
+The rest of this guide will present many functions that can be used to manipulate tensors. Most functions have been
+defined so that they can be called flexibly, either in an object-oriented "method call" style i.e. `src:function(...)`
+or a more "functional" style `torch.function(src, ...)`, where `src` is a tensor. Note that these different invocations
+may differ in whether they modify the tensor in-place, or create a new tensor. Additionally, some functions can be
+called in the form `dst:function(src, ...)` which usually suggests that the result of the operation on the `src` tensor
+will be stored in the tensor `dst`.  Further details are given in the individual function definitions, below, but it
+should be noted that the documentation is currently incomplete in this regard, and readers are encouraged to experiment
+in an interactive session.
+
+## Cloning ##
+
+<a name="torch.Tensor.clone"></a>
+### [Tensor] clone() ###
+
+Returns a clone of a tensor. The memory is copied.
+
+```lua
+i = 0
+x = torch.Tensor(5):apply(function(x)
+  i = i + 1
+  return i
+end)
+> x
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of dimension 5]
+
+-- create a clone of x
+y = x:clone()
+> y
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of dimension 5]
+
+-- fill up y with 1
+y:fill(1)
+> y
+ 1
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of dimension 5]
+
+-- the contents of x were not changed:
+> x
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of dimension 5]
+```
+
+<a name="torch.Tensor.contiguous"></a>
+### [Tensor] contiguous ###
+
+  * If the given Tensor contents are contiguous in memory, returns the exact same Tensor (no memory copy).
+  * Otherwise (_not contiguous in memory_), returns a [clone](#torch.Tensor.clone) (memory _copy_).
+
+```lua
+x = torch.Tensor(2,3):fill(1)
+> x
+ 1  1  1
+ 1  1  1
+[torch.DoubleTensor of dimension 2x3]
+
+-- x is contiguous, so y points to the same thing
+y = x:contiguous():fill(2)
+> y
+ 2  2  2
+ 2  2  2
+[torch.DoubleTensor of dimension 2x3]
+
+-- contents of x have been changed
+> x
+ 2  2  2
+ 2  2  2
+[torch.DoubleTensor of dimension 2x3]
+
+-- x:t() is not contiguous, so z is a clone
+z = x:t():contiguous():fill(3.14)
+> z
+ 3.1400  3.1400
+ 3.1400  3.1400
+ 3.1400  3.1400
+[torch.DoubleTensor of dimension 3x2]
+
+-- contents of x have not been changed
+> x
+ 2  2  2
+ 2  2  2
+[torch.DoubleTensor of dimension 2x3]
+```
+
+<a name="torch.type"></a>
+### [Tensor or string] type(type) ###
+
+__If `type` is `nil`__, returns a string containing the type name of
+  the given tensor.
+
+```lua
+= torch.Tensor():type()
+torch.DoubleTensor
+```
+
+__If `type` is a string__ describing a Tensor type, and is equal to
+the given tensor typename, returns the exact same tensor (_no memory
+copy_).
+
+```lua
+x = torch.Tensor(3):fill(3.14)
+> x
+ 3.1400
+ 3.1400
+ 3.1400
+[torch.DoubleTensor of dimension 3]
+
+y = x:type('torch.DoubleTensor')
+> y
+ 3.1400
+ 3.1400
+ 3.1400
+[torch.DoubleTensor of dimension 3]
+
+-- zero y contents
+y:zero()
+
+-- contents of x have been changed
+> x
+ 0
+ 0
+ 0
+[torch.DoubleTensor of dimension 3]
+
+```
+
+__If `type` is a string__ describing a Tensor type, different from
+the type name of the given Tensor, returns a new Tensor of the
+specified type, whose contents correspond to the contents of the
+original Tensor, cast to the given type (_memory copy occurs, with
+possible loss of precision_).
+
+```lua
+x = torch.Tensor(3):fill(3.14)
+> x
+ 3.1400
+ 3.1400
+ 3.1400
+[torch.DoubleTensor of dimension 3]
+
+y = x:type('torch.IntTensor')
+> y
+ 3
+ 3
+ 3
+[torch.IntTensor of dimension 3]
+
+```
+
+<a name="torch.Tensor.typeAs"></a>
+### [Tensor] typeAs(tensor) ###
+
+Convenience method for the [type](#torch.type) method. Equivalent to
+```lua
+type(tensor:type())
+```
+
+<a name="torch.isTensor"></a>
+### [boolean] isTensor(object) ###
+
+Returns `true` iff the provided `object` is one of the `torch.*Tensor` types.
+
+```lua
+> torch.isTensor(torch.randn(3,4))
+true
+
+> torch.isTensor(torch.randn(3,4)[1])
+true
+
+> torch.isTensor(torch.randn(3,4)[1][2])
+false
+```
+
+<a name="torch.byte"></a>
+### [Tensor] byte(), char(), short(), int(), long(), float(), double() ###
+<a name="torch.Tensor.short"></a>
+<a name="torch.Tensor.char"></a>
+<a name="torch.Tensor.long"></a>
+<a name="torch.Tensor.int"></a>
+<a name="torch.Tensor.double"></a>
+<a name="torch.Tensor.float"></a>
+
+Convenience methods for the [type](#torch.type) method. For e.g.,
+```lua
+x = torch.Tensor(3):fill(3.14)
+> x
+ 3.1400
+ 3.1400
+ 3.1400
+[torch.DoubleTensor of dimension 3]
+
+-- calling type('torch.IntTensor')
+> x:type('torch.IntTensor')
+ 3
+ 3
+ 3
+[torch.IntTensor of dimension 3]
+
+
+-- is equivalent to calling int()
+> x:int()
+ 3
+ 3
+ 3
+[torch.IntTensor of dimension 3]
+```
+
+## Querying the size and structure ##
+
+<a name="torch.nDimension"></a>
+### [number] nDimension() ###
+
+Returns the number of dimensions in a `Tensor`.
+```lua
+x = torch.Tensor(4,5) -- a matrix
+> x:nDimension()
+2
+```
+
+<a name="torch.Tensor.dim"></a>
+### [number] dim() ###
+
+Same as [nDimension()](#torch.nDimension).
+
+<a name="torch.Tensor.size"></a>
+### [number] size(dim) ###
+
+Returns the size of the specified dimension `dim`. Example:
+```lua
+x = torch.Tensor(4,5):zero()
+> x
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+[torch.DoubleTensor of dimension 4x5]
+
+> x:size(2) -- gets the number of columns
+5
+```
+
+<a name="torch.Tensor.size"></a>
+### [LongStorage] size() ###
+
+Returns a [LongStorage](storage.md) containing the size of each dimension
+of the tensor.
+```lua
+x = torch.Tensor(4,5):zero()
+> x
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+[torch.DoubleTensor of dimension 4x5]
+
+> x:size()
+ 4
+ 5
+[torch.LongStorage of size 2]
+```
+
+<a name="torch.Tensor.size"></a>
+### [LongStorage] #self ###
+
+Same as [size()](#torch.Tensor.size) method.
+
+<a name="torch.Tensor.stride"></a>
+### [number] stride(dim) ###
+
+Returns the jump necessary to go from one element to the next one in the
+specified dimension `dim`. Example:
+```lua
+x = torch.Tensor(4,5):zero()
+> x
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+[torch.DoubleTensor of dimension 4x5]
+
+-- elements in a row are contiguous in memory
+> x:stride(2)
+1
+
+-- to go from one element to the next one in a column
+-- we need here to jump the size of the row
+> x:stride(1)
+5
+```
+
+Note also that in `Torch` _elements in the same row_ [elements along the __last__ dimension]
+are contiguous in memory for a matrix [tensor].
+
+<a name="torch.Tensor.stride"></a>
+### [LongStorage] stride() ###
+
+Returns the jump necessary to go from one element to the next one in each dimension. Example:
+```lua
+x = torch.Tensor(4,5):zero()
+> x
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+ 0 0 0 0 0
+[torch.DoubleTensor of dimension 4x5]
+
+> x:stride()
+ 5
+ 1 -- elements are contiguous in a row [last dimension]
+[torch.LongStorage of size 2]
+```
+
+Note also that in `Torch` _elements in the same row_ [elements along the __last__ dimension]
+are contiguous in memory for a matrix [tensor].
+
+<a name="torch.storage"></a>
+### [Storage] storage() ###
+
+Returns the [Storage](storage.md) used to store all the elements of the `Tensor`.
+Basically, a `Tensor` is a particular way of _viewing_ a `Storage`.
+```lua
+x = torch.Tensor(4,5)
+s = x:storage()
+for i=1,s:size() do -- fill up the Storage
+  s[i] = i
+end
+
+> x -- s is interpreted by x as a 2D matrix
+  1   2   3   4   5
+  6   7   8   9  10
+ 11  12  13  14  15
+ 16  17  18  19  20
+[torch.DoubleTensor of dimension 4x5]
+```
+
+<a name="torch.Tensor.isContiguous"></a>
+### [boolean] isContiguous() ###
+
+Returns `true` iff the elements of the `Tensor` are contiguous in memory.
+```lua
+-- normal tensors are contiguous in memory
+x = torch.randn(4,5)
+> x:isContiguous()
+true
+
+-- y now "views" the 3rd column of x
+-- the storage of y is the same than x
+-- so the memory cannot be contiguous
+y = x:select(2, 3)
+> y:isContiguous()
+false
+
+-- indeed, to jump from one element to
+-- the next one, the stride is 5
+> y:stride()
+ 5
+[torch.LongStorage of size 1]
+```
+
+<a name="torch.Tensor.isSize"></a>
+### [boolean] isSize(storage) ###
+
+Returns `true` iff the dimensions of the `Tensor` match the elements of the `storage`.
+```lua
+x = torch.Tensor(4,5)
+y = torch.LongStorage({4,5})
+z = torch.LongStorage({5,4,1})
+> x:isSize(y)
+true
+
+> x:isSize(z)
+false
+
+> x:isSize(x:size())
+true
+```
+
+<a name="torch.Tensor.isSameSizeAs"></a>
+### [boolean] isSameSizeAs(tensor) ###
+
+Returns `true` iff the dimensions of the `Tensor` and the argument `Tensor` are exactly the same.
+```lua
+x = torch.Tensor(4,5)
+y = torch.Tensor(4,5)
+> x:isSameSizeAs(y)
+true
+
+y = torch.Tensor(4,6)
+> x:isSameSizeAs(y)
+false
+```
+
+<a name="torch.Tensor.nElement"></a>
+### [number] nElement() ###
+
+Returns the number of elements of a tensor.
+```lua
+x = torch.Tensor(4,5)
+> x:nElement() -- 4x5 = 20!
+20
+```
+
+<a name="torch.storageOffset"></a>
+### [number] storageOffset() ###
+
+Return the first index (starting at 1) used in the tensor's [storage](#torch.storage).
+
+<a name="torch.__index__"></a>
+## Querying elements ##
+
+Elements of a tensor can be retrieved with the `[index]` operator.
+
+If `index` is a number, `[index]` operator is equivalent to a
+[`select(1, index)`](#torch.Tensor.select). If the tensor has more
+than one dimension, this operation returns a slice of the tensor that
+shares the same underlying storage. If the tensor is a 1D tensor, it
+returns the value at `index` in this tensor.
+
+If `index` is a table, the table must contain _n_ numbers, where
+_n_ is the [number of dimensions](#torch.nDimension) of the
+Tensor. It will return the element at the given position.
+
+In the same spirit, `index` might be a [LongStorage](storage.md),
+specifying the position (in the Tensor) of the element to be
+retrieved.
+
+If `index` is a `ByteTensor` in which each element is 0 or 1 then it acts as a
+selection mask used to extract a subset of the original tensor. This is
+particularly useful with [logical operators](maths.md#logical-operations-on-tensors)
+like [`torch.le`](maths.md#torchlea-b).
+
+Example:
+```lua
+x = torch.Tensor(3,3)
+i = 0; x:apply(function() i = i + 1; return i end)
+> x
+ 1  2  3
+ 4  5  6
+ 7  8  9
+[torch.DoubleTensor of dimension 3x3]
+
+> x[2] -- returns row 2
+ 4
+ 5
+ 6
+[torch.DoubleTensor of dimension 3]
+
+> x[2][3] -- returns row 2, column 3
+6
+
+> x[{2,3}] -- another way to return row 2, column 3
+6
+
+> x[torch.LongStorage{2,3}] -- yet another way to return row 2, column 3
+6
+
+> x[torch.le(x,3)] -- torch.le returns a ByteTensor that acts as a mask
+ 1
+ 2
+ 3
+[torch.DoubleTensor of dimension 3]
+```
+
+<a name="torch.Tensor.set"></a>
+## Referencing a tensor to an existing tensor or chunk of memory ##
+
+A `Tensor` being a way of _viewing_ a [Storage](storage.md), it is
+possible to "set" a `Tensor` such that it views an existing [Storage](storage.md).
+
+Note that if you want to perform a set on an empty `Tensor` like
+```lua
+y = torch.Storage(10)
+x = torch.Tensor()
+x:set(y, 1, 10)
+```
+you might want in that case to use one of the [equivalent constructor](#torch.Tensor).
+```lua
+y = torch.Storage(10)
+x = torch.Tensor(y, 1, 10)
+```
+
+<a name="torch.Tensor.set"></a>
+### [self] set(tensor) ###
+
+The `Tensor` is now going to "view" the same [storage](#torch.storage)
+as the given `tensor`. As a result, any modification in the elements of
+the `Tensor` will have an impact on the elements of the given `tensor`, and
+vice-versa. This is an efficient method, as there is no memory copy!
+
+```lua
+x = torch.Tensor(2,5):fill(3.14)
+> x
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+[torch.DoubleTensor of dimension 2x5]
+
+y = torch.Tensor():set(x)
+> y
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+ 3.1400  3.1400  3.1400  3.1400  3.1400
+[torch.DoubleTensor of dimension 2x5]
+
+y:zero()
+> x -- elements of x are the same than y!
+ 0 0 0 0 0
+ 0 0 0 0 0
+[torch.DoubleTensor of dimension 2x5]
+```
+
+<a name="torch.Tensor.isSetTo"></a>
+### [boolean] isSetTo(tensor) ###
+
+Returns true iff the `Tensor` is set to the argument `Tensor`. Note: this is
+only true if the tensors are the same size, have the same strides and share the
+same storage and offset.
+
+```lua
+x = torch.Tensor(2,5)
+y = torch.Tensor()
+> y:isSetTo(x)
+ false
+> y:set(x)
+> y:isSetTo(x)
+  true
+> y:t():isSetTo(x)
+  false -- x and y have different strides
+```
+
+<a name="torch.Tensor.set"></a>
+### [self] set(storage, [storageOffset, sizes, [strides]]) ###
+
+The `Tensor` is now going to "view" the given
+[`storage`](storage.md), starting at position `storageOffset` (>=1)
+with the given [dimension `sizes`](#torch.Tensor.size) and the optional given
+[`strides`](#torch.Tensor.stride). As the result, any modification in the
+elements of the `Storage` will have a impact on the elements of the
+`Tensor`, and vice-versa. This is an efficient method, as there is no
+memory copy!
+
+If only `storage` is provided, the whole storage will be viewed as a 1D Tensor.
+
+```lua
+-- creates a storage with 10 elements
+s = torch.Storage(10):fill(1)
+
+-- we want to see it as a 2x5 tensor
+sz = torch.LongStorage({2,5})
+x = torch.Tensor()
+x:set(s, 1, sz)
+> x
+ 1  1  1  1  1
+ 1  1  1  1  1
+[torch.DoubleTensor of dimension 2x5]
+
+x:zero()
+> s -- the storage contents have been modified
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+ 0
+[torch.DoubleStorage of size 10]
+```
+
+<a name="torch.Tensor.set"></a>
+### [self] set(storage, [storageOffset, sz1 [, st1 ... [, sz4 [, st4]]]]) ###
+
+This is a "shortcut" for previous method.
+It works up to 4 dimensions. `szi` is the size of the `i`-th dimension of the tensor.
+`sti` is the stride in the `i`-th dimension.
+
+## Copying and initializing ##
+
+<a name="torch.Tensor.copy"></a>
+### [self] copy(tensor) ###
+
+Replace the elements of the `Tensor` by copying the elements of the given `tensor`. The
+[number of elements](#torch.Tensor.nElement) must match, but the
+sizes might be different.
+
+```lua
+x = torch.Tensor(4):fill(1)
+y = torch.Tensor(2,2):copy(x)
+> x
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of dimension 4]
+
+> y
+ 1  1
+ 1  1
+[torch.DoubleTensor of dimension 2x2]
+```
+
+If a different type of `tensor` is given, then a type conversion occurs,
+which, of course, might result in loss of precision.
+
+<a name="torch.fill"></a>
+### [self] fill(value) ###
+
+Fill the tensor with the given `value`.
+```lua
+> torch.DoubleTensor(4):fill(3.14)
+ 3.1400
+ 3.1400
+ 3.1400
+ 3.1400
+[torch.DoubleTensor of dimension 4]
+```
+
+<a name="torch.zero"></a>
+### [self] zero() ###
+
+Fill the tensor with zeros.
+```lua
+> torch.Tensor(4):zero()
+ 0
+ 0
+ 0
+ 0
+[torch.DoubleTensor of dimension 4]
+```
+
+<a name="torch.resize.dok"></a>
+## Resizing ##
+
+__When resizing to a larger size__, the underlying [Storage](storage.md) is resized to fit
+all the elements of the `Tensor`.
+
+__When resizing to a smaller size__, the underlying [Storage](#Storage) is not resized.
+
+__Important note:__ the content of a `Tensor` after resizing is _undetermined_ as [strides](#torch.Tensor.stride)
+might have been completely changed. In particular, _the elements of the resized tensor are contiguous in memory_.
+
+<a name="torch.Tensor.resizeAs"></a>
+### [self] resizeAs(tensor) ###
+
+Resize the `tensor` as the given `tensor` (of the same type).
+
+<a name="torch.resize"></a>
+### [self] resize(sizes) ###
+
+Resize the `tensor` according to the given [LongStorage](storage.md) `sizes`.
+
+<a name="torch.resize"></a>
+### [self] resize(sz1 [,sz2 [,sz3 [,sz4]]]]) ###
+
+Convenience method of the previous method, working for a number of dimensions up to 4.
+
+## Extracting sub-tensors ##
+
+Each of these methods returns a `Tensor` which is a sub-tensor of the given
+tensor. 
+
+For methods `narrow`, `select` and `sub` the returned tensor _shares the same `Storage`_ as the original. Hence, any modification in the memory of the sub-tensor will have an impact on the primary tensor, and vice-versa. These methods are very fast, as they do not involve any memory copy.
+
+For all other methods in this section such as `index`, `indexCopy` etc., since you cannot extract a shared subtensor (technically), a new tensor is returned. If you make changes in this new tensor, they are not reflected in the original tensor.
+
+<a name="torch.Tensor.narrow"></a>
+### [self] narrow(dim, index, size) ###
+
+Returns a new `Tensor` which is a narrowed version of the current one: the dimension `dim` is narrowed
+from `index` to `index+size-1`.
+
+```lua
+x = torch.Tensor(5, 6):zero()
+> x
+
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+0 0 0 0 0 0
+[torch.DoubleTensor of dimension 5x6]
+
+y = x:narrow(1, 2, 3) -- narrow dimension 1 from index 2 to index 2+3-1
+y:fill(1) -- fill with 1
+> y
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+[torch.DoubleTensor of dimension 3x6]
+
+> x -- memory in x has been modified!
+ 0  0  0  0  0  0
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+```
+
+<a name="torch.Tensor.sub"></a>
+### [Tensor] sub(dim1s, dim1e ... [, dim4s [, dim4e]]) ###
+
+This method is equivalent to do a series of
+[narrow](#torch.Tensor.narrow) up to the first 4 dimensions.  It
+returns a new `Tensor` which is a sub-tensor going from index
+`dimis` to `dimie` in the `i`-th dimension. Negative values are
+interpreted as indices starting from the end: `-1` is the last index,
+`-2` is the index before the last index, ...
+
+```lua
+x = torch.Tensor(5, 6):zero()
+> x
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+[torch.DoubleTensor of dimension 5x6]
+
+y = x:sub(2,4):fill(1) -- y is sub-tensor of x:
+> y                    -- dimension 1 starts at index 2, ends at index 4
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+[torch.DoubleTensor of dimension 3x6]
+
+> x                    -- x has been modified!
+ 0  0  0  0  0  0
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 1  1  1  1  1  1
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+z = x:sub(2,4,3,4):fill(2) -- we now take a new sub-tensor
+> z                        -- dimension 1 starts at index 2, ends at index 4
+                           -- dimension 2 starts at index 3, ends at index 4
+ 2  2
+ 2  2
+ 2  2
+[torch.DoubleTensor of dimension 3x2]
+
+> x                        -- x has been modified
+ 0  0  0  0  0  0
+ 1  1  2  2  1  1
+ 1  1  2  2  1  1
+ 1  1  2  2  1  1
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+> y                        -- y has been modified
+ 1  1  2  2  1  1
+ 1  1  2  2  1  1
+ 1  1  2  2  1  1
+[torch.DoubleTensor of dimension 3x6]
+
+> y:sub(-1, -1, 3, 4)      -- negative values = bounds
+ 2  2
+[torch.DoubleTensor of dimension 1x2]
+```
+
+<a name="torch.Tensor.select"></a>
+### [Tensor] select(dim, index) ###
+
+Returns a new `Tensor` which is a tensor slice at the given `index` in the
+dimension `dim`. The returned tensor has one less dimension: the dimension
+`dim` is removed.  As a result, it is not possible to `select()` on a 1D
+tensor.
+
+Note that "selecting" on the first dimension is equivalent to using the [[] operator](#torch.__index__ )
+
+```lua
+x = torch.Tensor(5,6):zero()
+> x
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+[torch.DoubleTensor of dimension 5x6]
+
+y = x:select(1, 2):fill(2) -- select row 2 and fill up
+> y
+ 2
+ 2
+ 2
+ 2
+ 2
+ 2
+[torch.DoubleTensor of dimension 6]
+
+> x
+ 0  0  0  0  0  0
+ 2  2  2  2  2  2
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+z = x:select(2,5):fill(5) -- select column 5 and fill up
+> z
+ 5
+ 5
+ 5
+ 5
+ 5
+[torch.DoubleTensor of dimension 5]
+
+> x
+ 0  0  0  0  5  0
+ 2  2  2  2  5  2
+ 0  0  0  0  5  0
+ 0  0  0  0  5  0
+ 0  0  0  0  5  0
+[torch.DoubleTensor of dimension 5x6]
+```
+
+<a name="torch.Tensor.indexing"></a>
+### [Tensor] [{ dim1,dim2,... }] or [{ {dim1s,dim1e}, {dim2s,dim2e} }] ###
+
+The indexing operator [] can be used to combine narrow/sub and
+select in a concise and efficient way. It can also be used
+to copy, and fill (sub) tensors.
+
+This operator also works with an input mask made of a `ByteTensor` with 0 and 1
+elements, e.g with a [logical operator](maths.md#logical-operations-on-tensors).
+
+```lua
+x = torch.Tensor(5, 6):zero()
+> x
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+ 0 0 0 0 0 0
+[torch.DoubleTensor of dimension 5x6]
+
+x[{ 1,3 }] = 1 -- sets element at (i=1,j=3) to 1
+> x
+ 0  0  1  0  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+x[{ 2,{2,4} }] = 2  -- sets a slice of 3 elements to 2
+> x
+ 0  0  1  0  0  0
+ 0  2  2  2  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+ 0  0  0  0  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+x[{ {},4 }] = -1 -- sets the full 4th column to -1
+> x
+ 0  0  1 -1  0  0
+ 0  2  2 -1  0  0
+ 0  0  0 -1  0  0
+ 0  0  0 -1  0  0
+ 0  0  0 -1  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+x[{ {},2 }] = torch.range(1,5) -- copy a 1D tensor to a slice of x
+> x
+
+ 0  1  1 -1  0  0
+ 0  2  2 -1  0  0
+ 0  3  0 -1  0  0
+ 0  4  0 -1  0  0
+ 0  5  0 -1  0  0
+[torch.DoubleTensor of dimension 5x6]
+
+x[torch.lt(x,0)] = -2 -- sets all negative elements to -2 via a mask
+> x
+
+ 0  1  1 -2  0  0
+ 0  2  2 -2  0  0
+ 0  3  0 -2  0  0
+ 0  4  0 -2  0  0
+ 0  5  0 -2  0  0
+[torch.DoubleTensor of dimension 5x6]
+```
+
+<a name="torch.Tensor.index"></a>
+### [Tensor] index(dim, index) ###
+
+Returns a new `Tensor` which indexes the original `Tensor` along dimension `dim`
+using the entries in `torch.LongTensor` `index`.
+The returned `Tensor` has the same number of dimensions as the original `Tensor`.
+The returned `Tensor` does __not__ use the same storage as the original `Tensor` -- see below for storing the result
+ in an existing `Tensor`.
+
+```lua
+x = torch.rand(5,5)
+> x
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+ 0.0369  0.4158  0.0985  0.3024  0.8186
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.7473  0.9028  0.1046  0.9085  0.6622
+ 0.1412  0.6784  0.1624  0.8113  0.3949
+[torch.DoubleTensor of dimension 5x5]
+
+y = x:index(1,torch.LongTensor{3,1})
+> y
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+[torch.DoubleTensor of dimension 2x5]
+
+y:fill(1)
+> y
+ 1  1  1  1  1
+ 1  1  1  1  1
+[torch.DoubleTensor of dimension 2x5]
+
+> x
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+ 0.0369  0.4158  0.0985  0.3024  0.8186
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.7473  0.9028  0.1046  0.9085  0.6622
+ 0.1412  0.6784  0.1624  0.8113  0.3949
+[torch.DoubleTensor of dimension 5x5]
+
+```
+
+Note that the explicit `index` function is different from the indexing operator `[]`.
+The indexing operator `[]` is a syntactic shortcut for a series of select and narrow operations,
+therefore it always returns a new view on the original tensor that shares the same storage.
+However, the explicit `index` function cannot use the same storage.
+
+It is possible to store the result into an existing Tensor with `result:index(source, ...)`:
+
+```lua
+x = torch.rand(5,5)
+> x
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+ 0.0369  0.4158  0.0985  0.3024  0.8186
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.7473  0.9028  0.1046  0.9085  0.6622
+ 0.1412  0.6784  0.1624  0.8113  0.3949
+[torch.DoubleTensor of dimension 5x5]
+
+y = torch.Tensor()
+y:index(x,1,torch.LongTensor{3,1})
+> y
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+[torch.DoubleTensor of dimension 2x5]
+```
+
+
+<a name="torch.Tensor.indexCopy"></a>
+### [Tensor] indexCopy(dim, index, tensor) ###
+
+Copies the elements of `tensor` into the original tensor by selecting the indices in the order
+given in `index`. The shape of `tensor` must exactly match the elements indexed or an error will be thrown.
+
+```lua
+> x
+ 0.8020  0.7246  0.1204  0.3419  0.4385
+ 0.0369  0.4158  0.0985  0.3024  0.8186
+ 0.2746  0.9362  0.2546  0.8586  0.6674
+ 0.7473  0.9028  0.1046  0.9085  0.6622
+ 0.1412  0.6784  0.1624  0.8113  0.3949
+[torch.DoubleTensor of dimension 5x5]
+
+z=torch.Tensor(5,2)
+z:select(2,1):fill(-1)
+z:select(2,2):fill(-2)
+> z
+-1 -2
+-1 -2
+-1 -2
+-1 -2
+-1 -2
+[torch.DoubleTensor of dimension 5x2]
+
+x:indexCopy(2,torch.LongTensor{5,1},z)
+> x
+-2.0000  0.7246  0.1204  0.3419 -1.0000
+-2.0000  0.4158  0.0985  0.3024 -1.0000
+-2.0000  0.9362  0.2546  0.8586 -1.0000
+-2.0000  0.9028  0.1046  0.9085 -1.0000
+-2.0000  0.6784  0.1624  0.8113 -1.0000
+[torch.DoubleTensor of dimension 5x5]
+
+```
+
+<a name="torch.Tensor.indexAdd"></a>
+### [Tensor] indexAdd(dim, index, tensor) ###
+
+Accumulate the elements of `tensor` into the original tensor by adding to the indices in the order
+given in `index`. The shape of `tensor` must exactly match the elements indexed or an error will be thrown.
+
+```lua
+Example 1
+
+> x
+-2.1742  0.5688 -1.0201  0.1383  1.0504
+ 0.0970  0.2169  0.1324  0.9553 -1.9518
+-0.7607  0.8947  0.1658 -0.2181 -2.1237
+-1.4099  0.2342  0.4549  0.6316 -0.2608
+ 0.0349  0.4713  0.0050  0.1677  0.2103
+[torch.DoubleTensor of size 5x5]
+
+z=torch.Tensor(5, 2)
+z:select(2,1):fill(-1)
+z:select(2,2):fill(-2)
+> z
+-1 -2
+-1 -2
+-1 -2
+-1 -2
+-1 -2
+[torch.DoubleTensor of dimension 5x2]
+
+> x:indexAdd(2,torch.LongTensor{5,1},z)
+> x
+-4.1742  0.5688 -1.0201  0.1383  0.0504
+-1.9030  0.2169  0.1324  0.9553 -2.9518
+-2.7607  0.8947  0.1658 -0.2181 -3.1237
+-3.4099  0.2342  0.4549  0.6316 -1.2608
+-1.9651  0.4713  0.0050  0.1677 -0.7897
+[torch.DoubleTensor of size 5x5]
+
+Example 2
+
+> a = torch.range(1, 5)
+> a
+ 1
+ 2
+ 3
+ 4
+ 5
+[torch.DoubleTensor of size 5]
+
+> a:indexAdd(1, torch.LongTensor{1, 1, 3, 3}, torch.range(1, 4))
+> a
+  4
+  2
+ 10
+  4
+  5
+[torch.DoubleTensor of size 5]
+
+```
+
+<a name="torch.Tensor.indexFill"></a>
+### [Tensor] indexFill(dim, index, val) ###
+
+Fills the elements of the original `Tensor` with value `val` by selecting the indices in the order
+given in `index`.
+
+```lua
+x=torch.rand(5,5)
+> x
+ 0.8414  0.4121  0.3934  0.5600  0.5403
+ 0.3029  0.2040  0.7893  0.6079  0.6334
+ 0.3743  0.1389  0.1573  0.1357  0.8460
+ 0.2838  0.9925  0.0076  0.7220  0.5185
+ 0.8739  0.6887  0.4271  0.0385  0.9116
+[torch.DoubleTensor of dimension 5x5]
+
+x:indexFill(2,torch.LongTensor{4,2},-10)
+> x
+  0.8414 -10.0000   0.3934 -10.0000   0.5403
+  0.3029 -10.0000   0.7893 -10.0000   0.6334
+  0.3743 -10.0000   0.1573 -10.0000   0.8460
+  0.2838 -10.0000   0.0076 -10.0000   0.5185
+  0.8739 -10.0000   0.4271 -10.0000   0.9116
+[torch.DoubleTensor of dimension 5x5]
+
+```
+
+<a name="torch.Tensor.gather"></a>
+### [Tensor] gather(dim, index) ###
+
+Creates a new `Tensor` from the original tensor by gathering a number of values from
+each "row", where the rows are along the dimension `dim`. The values in a `LongTensor`, passed as `index`,
+specify which values to take from each row. Specifically, the resulting `Tensor`, which will have the same size as
+the `index` tensor, is given by
+
+```lua
+-- dim = 1
+result[i][j][k]... = src[index[i][j][k]...][j][k]...
+
+-- dim = 2
+result[i][j][k]... = src[i][index[i][j][k]...][k]...
+
+-- etc.
+```
+where `src` is the original `Tensor`.
+
+The same number of values are selected from each row, and the same value cannot be selected from a row more than
+once. The values in the `index` tensor must not be larger than the length of the row, that is they must be between
+1 and `src:size(dim)` inclusive. It can be somewhat confusing to ensure that the `index` tensor has the correct shape.
+Viewed pictorially:
+
+![The gather operation](gather.png)
+
+Numerically, to give an example, if `src` has size `n x m x p x q`, we are gathering along `dim = 3`, and we wish to
+gather `k` elements from each row (where `k <= p`) then `index` must have size `n x m x k x q`.
+
+It is possible to store the result into an existing Tensor with `result:gather(src, ...)`.
+
+```lua
+x = torch.rand(5, 5)
+> x
+ 0.7259  0.5291  0.4559  0.4367  0.4133
+ 0.0513  0.4404  0.4741  0.0658  0.0653
+ 0.3393  0.1735  0.6439  0.1011  0.7923
+ 0.7606  0.5025  0.5706  0.7193  0.1572
+ 0.1720  0.3546  0.8354  0.8339  0.3025
+[torch.DoubleTensor of size 5x5]
+
+y = x:gather(1, torch.LongTensor{{1, 2, 3, 4, 5}, {2, 3, 4, 5, 1}})
+> y
+ 0.7259  0.4404  0.6439  0.7193  0.3025
+ 0.0513  0.1735  0.5706  0.8339  0.4133
+[torch.DoubleTensor of size 2x5]
+
+z = x:gather(2, torch.LongTensor{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 1}})
+> z
+ 0.7259  0.5291
+ 0.4404  0.4741
+ 0.6439  0.1011
+ 0.7193  0.1572
+ 0.3025  0.1720
+[torch.DoubleTensor of size 5x2]
+
+```
+
+<a name="torch.Tensor.scatter"></a>
+### [Tensor] scatter(dim, index, src|val) ###
+
+Writes all values from tensor `src` or the scalar `val` into `self` at the specified indices. The indices are specified
+with respect to the given dimension, `dim`, in the manner described in [gather](#torch.Tensor.gather). Note that, as
+for gather, the values of index must be between 1 and `self:size(dim)` inclusive and all values in a row along the
+specified dimension must be unique.
+
+```lua
+x = torch.rand(2, 5)
+> x
+ 0.3227  0.4294  0.8476  0.9414  0.1159
+ 0.7338  0.5185  0.2947  0.0578  0.1273
+[torch.DoubleTensor of size 2x5]
+
+y = torch.zeros(3, 5):scatter(1, torch.LongTensor{{1, 2, 3, 1, 1}, {3, 1, 1, 2, 3}}, x)
+> y
+ 0.3227  0.5185  0.2947  0.9414  0.1159
+ 0.0000  0.4294  0.0000  0.0578  0.0000
+ 0.7338  0.0000  0.8476  0.0000  0.1273
+[torch.DoubleTensor of size 3x5]
+
+z = torch.zeros(2, 4):scatter(2, torch.LongTensor{{3}, {4}}, 1.23)
+> z
+ 0.0000  0.0000  1.2300  0.0000
+ 0.0000  0.0000  0.0000  1.2300
+[torch.DoubleTensor of size 2x4]
+
+```
+
+<a name="torch.Tensor.maskedSelect"></a>
+### [Tensor] maskedSelect(mask) ###
+
+Returns a new Tensor which contains all elements aligned to a `1` in the corresponding
+`mask`. This `mask` is a `torch.ByteTensor` of zeros and ones. The `mask` and
+`Tensor` must have the same number of elements. The resulting Tensor will
+be a 1D tensor of the same type as `Tensor` having size `mask:sum()`.
+
+```lua
+x = torch.range(1,12):double():resize(3,4)
+> x
+  1   2   3   4
+  5   6   7   8
+  9  10  11  12
+[torch.DoubleTensor of dimension 3x4]
+
+mask = torch.ByteTensor(2,6):bernoulli()
+> mask
+ 1  0  1  0  0  0
+ 1  1  0  0  0  1
+[torch.ByteTensor of dimension 2x6]
+
+y = x:maskedSelect(mask)
+> y
+  1
+  3
+  7
+  8
+ 12
+[torch.DoubleTensor of dimension 5]
+
+z = torch.DoubleTensor()
+z:maskedSelect(x, mask)
+> z
+  1
+  3
+  7
+  8
+ 12
+```
+
+Note how the dimensions of the above `x`, `mask` and `y` do not match.
+Also note how an existing tensor `z` can be used to store the results.
+
+
+<a name="torch.Tensor.maskedCopy"></a>
+### [Tensor] maskedCopy(mask, tensor) ###
+
+Copies the masked elements of `tensor` into itself. The masked elements are those elements having a
+corresponding `1` in the `mask` Tensor. This `mask` is a `torch.ByteTensor`
+of zeros and ones. The destination `Tensor` and the `mask` Tensor should have the same number of elements.
+The source `tensor` should have at least as many elements as the number of 1s in the `mask`.
+
+```lua
+x = torch.range(1,4):double():resize(2,2)
+> x
+ 1  2
+ 3  4
+[torch.DoubleTensor of dimension 2x2]
+
+mask = torch.ByteTensor(1,8):bernoulli()
+> mask
+ 0  0  1  1  1  0  1  0
+[torch.ByteTensor of dimension 1x8]
+
+y = torch.DoubleTensor(2,4):fill(-1)
+> y
+-1 -1 -1 -1
+-1 -1 -1 -1
+[torch.DoubleTensor of dimension 2x4]
+
+y:maskedCopy(mask, x)
+> y
+ -1 -1  1  2
+  3 -1  4 -1
+[torch.DoubleTensor of dimension 2x4]
+```
+
+Note how the dimensions of the above `x`, `mask` and `y` do not match,
+but the number of elements do.
+
+<a name="torch.Tensor.maskedFill"></a>
+### [Tensor] maskedFill(mask, val) ###
+
+Fills the masked elements of itself with value `val`. The masked elements are those elements having a
+corresponding `1` in the `mask` Tensor. This `mask` is a `torch.ByteTensor`
+of zeros and ones. The `mask` and `Tensor` must have the same number of elements.
+
+```lua
+x = torch.range(1,4):double():resize(1,4)
+> x
+ 1  2  3  4
+[torch.DoubleTensor of dimension 1x4]
+
+mask = torch.ByteTensor(2,2):bernoulli()
+> mask
+ 0  0
+ 1  1
+[torch.ByteTensor of dimension 2x2]
+
+x:maskedFill(mask, -1)
+> x
+ 1  2 -1 -1
+[torch.DoubleTensor of dimension 1x4]
+
+```
+Note how the dimensions of the above `x` and `mask` do not match,
+but the number of elements do.
+
+## Search ##
+
+Each of these methods returns a `LongTensor` corresponding to the indices of the
+given search operation.
+
+<a name="torch.Tensor.nonzero"/>
+### [LongTensor] nonzero(tensor)
+
+Finds and returns a `LongTensor` corresponding to the *subscript* indices of all
+non-zero elements in `tensor`.
+
+Note that torch uses the first argument on dispatch to determine the return
+type. Since the first argument is any `torch.TensorType`, but the return type
+is always `torch.LongTensor`, the function call
+`torch.nonzero(torch.LongTensor(), tensor)` does not work. However,
+`tensor.nonzero(torch.LongTensor(), tensor)` does work.
+
+```lua
+> x = torch.rand(4, 4):mul(3):floor():int()
+> x
+ 2  0  2  0
+ 0  0  1  2
+ 0  2  2  1
+ 2  1  2  2
+[torch.IntTensor of dimension 4x4]
+
+> torch.nonzero(x)
+ 1  1
+ 1  3
+ 2  3
+ 2  4
+ 3  2
+ 3  3
+ 3  4
+ 4  1
+ 4  2
+ 4  3
+ 4  4
+[torch.LongTensor of dimension 11x2]
+
+> x:nonzero()
+ 1  1
+ 1  3
+ 2  3
+ 2  4
+ 3  2
+ 3  3
+ 3  4
+ 4  1
+ 4  2
+ 4  3
+ 4  4
+[torch.LongTensor of dimension 11x2]
+
+> indices = torch.LongTensor()
+> x.nonzero(indices, x)
+ 1  1
+ 1  3
+ 2  3
+ 2  4
+ 3  2
+ 3  3
+ 3  4
+ 4  1
+ 4  2
+ 4  3
+ 4  4
+[torch.LongTensor of dimension 11x2]
+
+> x:eq(1):nonzero()
+ 2  3
+ 3  4
+ 4  2
+[torch.LongTensor of dimension 3x2]
+
+```
+
+## Expanding/Replicating/Squeezing Tensors ##
+
+These methods return a Tensor which is created by replications of the
+original tensor.
+
+<a name="torch.expand"></a>
+#### [result] expand([result,] sizes) ####
+
+`sizes` can either be a `torch.LongStorage` or numbers. Expanding a tensor
+does not allocate new memory, but only creates a new view on the existing tensor where
+singleton dimensions can be expanded to multiple ones by setting the `stride` to 0.
+Any dimension that has size 1 can be expanded to arbitrary value without any new memory allocation. Attempting to
+expand along a dimension that does not have size 1 will result in an error.
+
+```lua
+x = torch.rand(10,1)
+> x
+ 0.3837
+ 0.5966
+ 0.0763
+ 0.1896
+ 0.4958
+ 0.6841
+ 0.4038
+ 0.4068
+ 0.1502
+ 0.2239
+[torch.DoubleTensor of dimension 10x1]
+
+y = torch.expand(x,10,2)
+> y
+ 0.3837  0.3837
+ 0.5966  0.5966
+ 0.0763  0.0763
+ 0.1896  0.1896
+ 0.4958  0.4958
+ 0.6841  0.6841
+ 0.4038  0.4038
+ 0.4068  0.4068
+ 0.1502  0.1502
+ 0.2239  0.2239
+[torch.DoubleTensor of dimension 10x2]
+
+y:fill(1)
+> y
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+ 1  1
+[torch.DoubleTensor of dimension 10x2]
+
+> x
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+ 1
+[torch.DoubleTensor of dimension 10x1]
+
+i=0; y:apply(function() i=i+1;return i end)
+> y
+  2   2
+  4   4
+  6   6
+  8   8
+ 10  10
+ 12  12
+ 14  14
+ 16  16
+ 18  18
+ 20  20
+[torch.DoubleTensor of dimension 10x2]
+
+> x
+  2
+  4
+  6
+  8
+ 10
+ 12
+ 14
+ 16
+ 18
+ 20
+[torch.DoubleTensor of dimension 10x1]
+
+```
+
+<a name="torch.Tensor.expandAs"></a>
+#### [result] expandAs([result,] tensor) ####
+
+This is equivalent to `self:expand(tensor:size())`
+
+<a name="torch.repeatTensor"></a>
+#### [Tensor] repeatTensor([result,] sizes) ####
+
+`sizes` can either be a `torch.LongStorage` or numbers. Repeating a tensor allocates
+ new memory, unless `result` is provided, in which case its memory is
+ resized. `sizes` specify the number of times the tensor is repeated in each dimension.
+
+ ```lua
+x = torch.rand(5)
+> x
+ 0.7160
+ 0.6514
+ 0.0704
+ 0.7856
+ 0.7452
+[torch.DoubleTensor of dimension 5]
+
+> torch.repeatTensor(x,3,2)
+ 0.7160  0.6514  0.0704  0.7856  0.7452  0.7160  0.6514  0.0704  0.7856  0.7452
+ 0.7160  0.6514  0.0704  0.7856  0.7452  0.7160  0.6514  0.0704  0.7856  0.7452
+ 0.7160  0.6514  0.0704  0.7856  0.7452  0.7160  0.6514  0.0704  0.7856  0.7452
+[torch.DoubleTensor of dimension 3x10]
+
+> torch.repeatTensor(x,3,2,1)
+(1,.,.) =
+  0.7160  0.6514  0.0704  0.7856  0.7452
+  0.7160  0.6514  0.0704  0.7856  0.7452
+
+(2,.,.) =
+  0.7160  0.6514  0.0704  0.7856  0.7452
+  0.7160  0.6514  0.0704  0.7856  0.7452
+
+(3,.,.) =
+  0.7160  0.6514  0.0704  0.7856  0.7452
+  0.7160  0.6514  0.0704  0.7856  0.7452
+[torch.DoubleTensor of dimension 3x2x5]
+
+ ```
+
+<a name="torch.squeeze"></a>
+#### [Tensor] squeeze([dim]) ####
+
+Removes all singleton dimensions of the tensor.
+If `dim` is given, squeezes only that particular dimension of the tensor.
+
+ ```lua
+x=torch.rand(2,1,2,1,2)
+> x
+(1,1,1,.,.) =
+  0.6020  0.8897
+
+(2,1,1,.,.) =
+  0.4713  0.2645
+
+(1,1,2,.,.) =
+  0.4441  0.9792
+
+(2,1,2,.,.) =
+  0.5467  0.8648
+[torch.DoubleTensor of dimension 2x1x2x1x2]
+
+> torch.squeeze(x)
+(1,.,.) =
+  0.6020  0.8897
+  0.4441  0.9792
+
+(2,.,.) =
+  0.4713  0.2645
+  0.5467  0.8648
+[torch.DoubleTensor of dimension 2x2x2]
+
+> torch.squeeze(x,2)
+(1,1,.,.) =
+  0.6020  0.8897
+
+(2,1,.,.) =
+  0.4713  0.2645
+
+(1,2,.,.) =
+  0.4441  0.9792
+
+(2,2,.,.) =
+  0.5467  0.8648
+[torch.DoubleTensor of dimension 2x2x1x2]
+
+ ```
+
+## Manipulating the tensor view ##
+
+Each of these methods returns a `Tensor` which is another way of viewing
+the `Storage` of the given tensor. Hence, any modification in the memory of
+the sub-tensor will have an impact on the primary tensor, and vice-versa.
+
+These methods are very fast, because they do not involve any memory copy.
+
+<a name="torch.view"></a>
+### [result] view([result,] tensor, sizes) ###
+
+Creates a view with different dimensions of the storage associated with `tensor`.
+If `result` is not passed, then a new tensor is returned, otherwise its storage is
+made to point to storage of `tensor`.
+
+`sizes` can either be a `torch.LongStorage` or numbers. If one of the dimensions
+is -1, the size of that dimension is inferred from the rest of the elements.
+
+
+```lua
+x = torch.zeros(4)
+> x:view(2,2)
+ 0 0
+ 0 0
+[torch.DoubleTensor of dimension 2x2]
+
+> x:view(2,-1)
+ 0 0
+ 0 0
+[torch.DoubleTensor of dimension 2x2]
+
+> x:view(torch.LongStorage{2,2})
+ 0 0
+ 0 0
+[torch.DoubleTensor of dimension 2x2]
+
+> x
+ 0
+ 0
+ 0
+ 0
+[torch.DoubleTensor of dimension 4]
+```
+
+<a name="torch.viewAs"></a>
+### [result] viewAs([result,] tensor, template) ###
+
+Creates a view with the same dimensions as `template` of the storage associated
+with `tensor`. If `result` is not passed, then a new tensor is returned, otherwise its storage is
+made to point to storage of `tensor`.
+
+
+```lua
+x = torch.zeros(4)
+y = torch.Tensor(2,2)
+> x:viewAs(y)
+ 0 0
+ 0 0
+[torch.DoubleTensor of dimension 2x2]
+```
+
+
+<a name="torch.Tensor.transpose"></a>
+### [Tensor] transpose(dim1, dim2) ###
+
+Returns a tensor where dimensions `dim1` and `dim2` have been swapped. For 2D tensors,
+the convenience method of [t()](#torch.Tensor.t) is available.
+```lua
+x = torch.Tensor(3,4):zero()
+x:select(2,3):fill(7) -- fill column 3 with 7
+> x
+ 0  0  7  0
+ 0  0  7  0
+ 0  0  7  0
+[torch.DoubleTensor of dimension 3x4]
+
+y = x:transpose(1,2) -- swap dimension 1 and 2
+> y
+ 0  0  0
+ 0  0  0
+ 7  7  7
+ 0  0  0
+[torch.DoubleTensor of dimension 4x3]
+
+y:select(2, 3):fill(8) -- fill column 3 with 8
+> y
+ 0  0  8
+ 0  0  8
+ 7  7  8
+ 0  0  8
+[torch.DoubleTensor of dimension 4x3]
+
+> x -- contents of x have changed as well
+ 0  0  7  0
+ 0  0  7  0
+ 8  8  8  8
+[torch.DoubleTensor of dimension 3x4]
+```
+
+
+<a name="torch.Tensor.t"></a>
+### [Tensor] t() ###
+
+Convenience method of [transpose()](#torch.Tensor.transpose) for 2D
+tensors. The given tensor must be 2 dimensional. Swap dimensions 1 and 2.
+```lua
+x = torch.Tensor(3,4):zero()
+x:select(2,3):fill(7)
+y = x:t()
+> y
+ 0  0  0
+ 0  0  0
+ 7  7  7
+ 0  0  0
+[torch.DoubleTensor of dimension 4x3]
+
+> x
+ 0  0  7  0
+ 0  0  7  0
+ 0  0  7  0
+[torch.DoubleTensor of dimension 3x4]
+```
+
+
+<a name="torch.Tensor.permute"></a>
+### [Tensor] permute(dim1, dim2, ..., dimn) ###
+
+Generalizes the function [transpose()](#torch.Tensor.transpose) and can be used
+as a convenience method replacing a sequence of transpose() calls.
+Returns a tensor where the dimensions were permuted according to the permutation
+given by (dim1, dim2, ... , dimn). The permutation must be specified fully, i.e.
+there must be as many parameters as the tensor has dimensions.
+```lua
+x = torch.Tensor(3,4,2,5)
+> x:size()
+ 3
+ 4
+ 2
+ 5
+[torch.LongStorage of size 4]
+
+y = x:permute(2,3,1,4) -- equivalent to y = x:transpose(1,3):transpose(1,2)
+> y:size()
+ 4
+ 2
+ 3
+ 5
+[torch.LongStorage of size 4]
+
+```
+
+
+<a name="torch.Tensor.unfold"></a>
+### [Tensor] unfold(dim, size, step) ###
+
+Returns a tensor which contains all slices of size `size` in the dimension `dim`. Step between
+two slices is given by `step`.
+
+If `sizedim` is the original size of dimension `dim`, the size of dimension
+`dim` in the returned tensor will be `(sizedim - size) / step + 1`
+
+An additional dimension of size `size` is appended in the returned tensor.
+
+```lua
+x = torch.Tensor(7)
+for i=1,7 do x[i] = i end
+> x
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+[torch.DoubleTensor of dimension 7]
+
+> x:unfold(1, 2, 1)
+ 1  2
+ 2  3
+ 3  4
+ 4  5
+ 5  6
+ 6  7
+[torch.DoubleTensor of dimension 6x2]
+
+> x:unfold(1, 2, 2)
+ 1  2
+ 3  4
+ 5  6
+[torch.DoubleTensor of dimension 3x2]
+```
+
+## Applying a function to a tensor ##
+
+These functions apply a function to each element of the tensor on which the
+method is called (self). These methods are much faster than using a `for`
+loop in `Lua`. The result is stored in `self` (if the function returns
+something).
+
+<a name="torch.Tensor.apply"></a>
+### [self] apply(function) ###
+
+Apply the given function to all elements of self.
+
+The function takes a number (the current element of the tensor) and might return
+a number, in which case it will be stored in self.
+
+Examples:
+```lua
+i = 0
+z = torch.Tensor(3,3)
+z:apply(function(x)
+  i = i + 1
+  return i
+end) -- fill up the tensor
+> z
+ 1  2  3
+ 4  5  6
+ 7  8  9
+[torch.DoubleTensor of dimension 3x3]
+
+z:apply(math.sin) -- apply the sin function
+> z
+ 0.8415  0.9093  0.1411
+-0.7568 -0.9589 -0.2794
+ 0.6570  0.9894  0.4121
+[torch.DoubleTensor of dimension 3x3]
+
+sum = 0
+z:apply(function(x)
+  sum = sum + x
+end) -- compute the sum of the elements
+> sum
+1.9552094821074
+
+> z:sum() -- it is indeed correct!
+1.9552094821074
+```
+
+<a name="torch.Tensor.map"></a>
+### [self] map(tensor, function(xs, xt)) ###
+
+Apply the given function to all elements of self and `tensor`. The number of elements of both tensors
+must match, but sizes do not matter.
+
+The function takes two numbers (the current element of self and `tensor`) and might return
+a number, in which case it will be stored in self.
+
+Example:
+```lua
+x = torch.Tensor(3,3)
+y = torch.Tensor(9)
+i = 0
+x:apply(function() i = i + 1; return i end) -- fill-up x
+i = 0
+y:apply(function() i = i + 1; return i end) -- fill-up y
+> x
+ 1  2  3
+ 4  5  6
+ 7  8  9
+[torch.DoubleTensor of dimension 3x3]
+
+> y
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+[torch.DoubleTensor of dimension 9]
+
+x:map(y, function(xx, yy) return xx*yy end) -- element-wise multiplication
+> x
+  1   4   9
+ 16  25  36
+ 49  64  81
+[torch.DoubleTensor of dimension 3x3]
+```
+
+<a name="torch.Tensor.map2"></a>
+### [self] map2(tensor1, tensor2, function(x, xt1, xt2)) ###
+
+Apply the given function to all elements of self, `tensor1` and `tensor2`. The number of elements of all tensors
+must match, but sizes do not matter.
+
+The function takes three numbers (the current element of self, `tensor1` and `tensor2`) and might return
+a number, in which case it will be stored in self.
+
+Example:
+```lua
+x = torch.Tensor(3,3)
+y = torch.Tensor(9)
+z = torch.Tensor(3,3)
+
+i = 0; x:apply(function() i = i + 1; return math.cos(i)*math.cos(i) end)
+i = 0; y:apply(function() i = i + 1; return i end)
+i = 0; z:apply(function() i = i + 1; return i end)
+
+> x
+ 0.2919  0.1732  0.9801
+ 0.4272  0.0805  0.9219
+ 0.5684  0.0212  0.8302
+[torch.DoubleTensor of dimension 3x3]
+
+> y
+ 1
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+[torch.DoubleTensor of dimension 9]
+
+> z
+ 1  2  3
+ 4  5  6
+ 7  8  9
+[torch.DoubleTensor of dimension 3x3]
+
+x:map2(y, z, function(xx, yy, zz) return xx+yy*zz end)
+> x
+  1.2919   4.1732   9.9801
+ 16.4272  25.0805  36.9219
+ 49.5684  64.0212  81.8302
+[torch.DoubleTensor of dimension 3x3]
+```
+
+
+## Dividing a tensor into a table of tensors ##
+
+These functions divide a Tensor into a table of Tensors.
+
+<a name="torch.split"></a>
+### [result] split([result,] tensor, size, [dim]) ###
+
+Splits Tensor `tensor` along dimension `dim`
+into a `result` table of Tensors of size `size` (a number)
+or less (in the case of the last Tensor). The sizes of the non-`dim`
+dimensions remain unchanged. Internally, a series of
+[narrows](#torch.Tensor.narrow) are performed along
+dimension `dim`. Argument `dim` defaults to 1.
+
+If `result` is not passed, then a new table is returned, otherwise it
+is emptied and reused.
+
+Example:
+```lua
+x = torch.randn(3,4,5)
+
+> x:split(2,1)
+{
+  1 : DoubleTensor - size: 2x4x5
+  2 : DoubleTensor - size: 1x4x5
+}
+
+> x:split(3,2)
+{
+  1 : DoubleTensor - size: 3x3x5
+  2 : DoubleTensor - size: 3x1x5
+}
+
+> x:split(2,3)
+{
+  1 : DoubleTensor - size: 3x4x2
+  2 : DoubleTensor - size: 3x4x2
+  3 : DoubleTensor - size: 3x4x1
+}
+```
+
+
+<a name="torch.chunk"></a>
+### [result] chunk([result,] tensor, n, [dim]) ###
+
+Splits Tensor `tensor` into `n` chunks of approximately equal size along
+dimension `dim` and returns these as a `result` table of Tensors.
+Argument `dim` defaults to 1.
+
+This function uses [split](#torch.split) internally:
+```lua
+torch.split(result, tensor, math.ceil(tensor:size(dim)/n), dim)
+```
+
+Example:
+```lua
+x = torch.randn(3,4,5)
+
+> x:chunk(2,1)
+{
+  1 : DoubleTensor - size: 2x4x5
+  2 : DoubleTensor - size: 1x4x5
+}
+
+> x:chunk(2,2)
+{
+  1 : DoubleTensor - size: 3x2x5
+  2 : DoubleTensor - size: 3x2x5
+}
+
+> x:chunk(2,3)
+{
+  1 : DoubleTensor - size: 3x4x3
+  2 : DoubleTensor - size: 3x4x2
+}
+```
+
+## LuaJIT FFI access ##
+These functions expose Torch's Tensor and Storage data structures, through
+[LuaJIT FFI](http://luajit.org/ext_ffi_api.html).
+This allows extremely fast access to Tensors and Storages, all from Lua.
+
+<a name="torch.data"></a>
+### [result] data(tensor, [asnumber]) ###
+
+Returns a LuaJIT FFI pointer to the raw data of the tensor.
+If `asnumber` is true, then returns the pointer as an `intptr_t` cdata
+that you can transform to a plain lua number with `tonumber()`.
+
+Accessing the raw data of a Tensor like this is extremely efficient, in fact, it's
+almost as fast as C in lots of cases.
+
+Example:
+```lua
+t = torch.randn(3,2)
+> t
+ 0.8008 -0.6103
+ 0.6473 -0.1870
+-0.0023 -0.4902
+[torch.DoubleTensor of dimension 3x2]
+
+t_data = torch.data(t)
+for i = 0,t:nElement()-1 do t_data[i] = 0 end
+> t
+ 0 0
+ 0 0
+ 0 0
+[torch.DoubleTensor of dimension 3x2]
+```
+
+WARNING: bear in mind that accessing the raw data like this is dangerous, and should
+only be done on contiguous tensors (if a tensor is not contiguous, then you have to
+use its size and stride information). Making sure a tensor is contiguous is easy:
+```lua
+t = torch.randn(3,2)
+t_noncontiguous = t:transpose(1,2)
+
+-- it would be unsafe to work with torch.data(t_noncontiguous)
+t_transposed_and_contiguous = t_noncontiguous:contiguous()
+
+-- it is now safe to work with the raw pointer
+data = torch.data(t_transposed_and_contiguous)
+```
+
+Last, the pointer can be returned as a plain `intptr_t` cdata. This can be useful
+to share pointers between threads (warning: this is dangerous, as the second
+tensor doesn't increment the reference counter on the storage. If the first tensor
+gets freed, then the data of the second tensor becomes a dangling pointer):
+
+```lua
+t = torch.randn(10)
+p = tonumber(torch.data(t,true))
+s = torch.Storage(10, p)
+tt = torch.Tensor(s)
+-- tt and t are a view on the same data.
+```
+
+<a name="torch.cdata"></a>
+### [result] cdata(tensor, [asnumber]) ###
+
+Returns a LuaJIT FFI pointer to the C structure of the tensor.
+Use this with caution, and look at [FFI.lua](https://github.com/torch/torch7/blob/master/FFI.lua)
+for the members of the tensor.
+
+## Reference counting ##
+
+Tensors are reference-counted. It means that each time an object (C or the
+Lua state) needs to keep a reference to a tensor, the corresponding
+tensor reference counter will be [increased](#torch.Tensor.retain). The
+reference counter is [decreased](#torch.Tensor.free) when the object
+does not need the tensor anymore.
+
+These methods should be used with extreme care. In general, they should
+never be called, except if you know what you are doing, as the handling of
+references is done automatically. They can be useful in threaded
+environments. Note that these methods are atomic operations.
+
+<a name="torch.Tensor.retain"></a>
+### retain() ###
+
+Increment the reference counter of the tensor.
+
+<a name="torch.Tensor.free"></a>
+### free() ###
+
+Decrement the reference counter of the tensor. Free the tensor if the
+counter is at 0.
diff --git a/doc/tester.md b/doc/tester.md
new file mode 100644
index 0000000..eab061a
--- /dev/null
+++ b/doc/tester.md
@@ -0,0 +1,363 @@
+<a name="torch.Tester.dok"></a>
+# Tester #
+
+This class provides a generic unit testing framework. It is already
+being used in [nn](../index.md) package to verify the correctness of classes.
+
+The framework is generally used as follows.
+
+```lua
+local mytest = torch.TestSuite()
+
+local tester = torch.Tester()
+
+function mytest.testA()
+   local a = torch.Tensor{1, 2, 3}
+   local b = torch.Tensor{1, 2, 4}
+   tester:eq(a, b, "a and b should be equal")
+end
+
+function mytest.testB()
+   local a = {2, torch.Tensor{1, 2, 2}}
+   local b = {2, torch.Tensor{1, 2, 2.001}}
+   tester:eq(a, b, 0.01, "a and b should be approximately equal")
+end
+
+function mytest.testC()
+   local function myfunc()
+      return "hello " .. world
+   end
+   tester:assertNoError(myfunc, "myfunc shouldn't give an error")
+end
+
+tester:add(mytest)
+tester:run()
+```
+
+Running this code will report two test failures (and one test success).
+Generally it is better to put a single test case in each test function unless
+several very related test cases exist.
+The error report includes the message and line number of the error.
+
+```
+Running 3 tests
+1/3 testB ............................................................... [PASS]
+2/3 testA ............................................................... [FAIL]
+3/3 testC ............................................................... [FAIL]
+Completed 3 asserts in 3 tests with 2 failures and 0 errors
+--------------------------------------------------------------------------------
+testA
+a and b should be equal
+TensorEQ(==) violation: max diff=1, tolerance=0
+stack traceback:
+        ./test.lua:8: in function <./test.lua:5>
+
+--------------------------------------------------------------------------------
+testC
+myfunc shouldn't give an error
+ERROR violation: err=./test.lua:19: attempt to concatenate global 'world' (a nil value)
+stack traceback:
+        ./test.lua:21: in function <./test.lua:17>
+
+--------------------------------------------------------------------------------
+torch/torch/Tester.lua:383: An error was found while running tests!
+stack traceback:
+        [C]: in function 'assert'
+        torch/torch/Tester.lua:383: in function 'run'
+        ./test.lua:25: in main chunk
+```
+
+Historically, Tester has supported a variety of equality checks
+([asserteq](#torch.Tester.asserteq),
+[assertalmosteq](#torch.Tester.assertalmosteq),
+[assertTensorEq](#torch.Tester.assertTensorEq),
+[assertTableEq](#torch.Tester.assertTableEq), and their negations). In general
+however, you should just use [eq](#torch.Tester.eq) (or its negation
+[ne](#torch.Tester.ne)).  These functions do deep checking of many object types
+including recursive tables and tensors, and support a
+tolerance parameter for comparing numerical values (including tensors).
+
+Many of the tester functions accept both an optional `tolerance` parameter and a
+`message` to display if the test case fails. For both convenience and backwards
+compatibility, these arguments can be supplied in either order.
+
+<a name="torch.Tester"></a>
+### torch.Tester() ###
+
+Returns a new instance of `torch.Tester` class.
+
+<a name="torch.Tester.add"></a>
+### add(f, 'name') ###
+
+Add `f`, either a test function or a table of test functions, to the tester.
+
+If `f` is a function then names should be unique. There are a couple of special
+values for `name`: if it is `_setUp` or `_tearDown`, then the function will be
+called either *before* or *after* every test respectively, with the name of the
+test passed as a parameter.
+
+If `f` is a table then `name` should be nil, and the names of the individual
+tests in the table will be taken from the corresponding table key. It's
+recommended you use [TestSuite](#torch.TestSuite.dok) for tables of tests.
+
+Returns the torch.Tester instance.
+
+<a name="torch.Tester.run"></a>
+### run(testNames) ###
+
+Run tests that have been added by [add(f, 'name')](#torch.Tester.add).
+While running it reports progress, and at the end gives a summary of all errors.
+
+If a list of names `testNames` is passed, then all tests matching these names
+(using `string.match`) will be run; otherwise all tests will be run.
+
+```lua
+tester:run() -- runs all tests
+tester:run("test1") -- runs the test named "test1"
+tester:run({"test2", "test3"}) -- runs the tests named "test2" and "test3"
+```
+
+<a name="torch.Tester.disable"></a>
+### disable(testNames) ###
+
+Prevent the given tests from running, where `testNames` can be a single string
+or list of strings. More precisely, when [run](#torch.Tester.run)
+is invoked, it will skip these tests, while still printing out an indication of
+skipped tests. This is useful for temporarily disabling tests without
+commenting out the code (for example, if they depend on upstream code that is
+currently broken), and explicitly flagging them as skipped.
+
+Returns the torch.Tester instance.
+
+```lua
+local tester = torch.Tester()
+local tests = torch.TestSuite()
+
+function tests.brokenTest()
+  -- ...
+end
+
+tester:add(tests):disable('brokenTest'):run()
+```
+
+```
+Running 1 test
+1/1 brokenTest .......................................................... [SKIP]
+Completed 0 asserts in 1 test with 0 failures and 0 errors and 1 disabled
+```
+
+<a name="torch.Tester.assert"></a>
+### assert(condition [, message]) ###
+
+Check that `condition` is true (using the optional `message` if the test
+fails).
+Returns whether the test passed.
+
+<a name="torch.Tester.assertGeneralEq"></a>
+### assertGeneralEq(got, expected [, tolerance] [, message]) ###
+
+General equality check between numbers, tables, strings, `torch.Tensor`
+objects, `torch.Storage` objects, etc.
+
+Check that `got` and `expected` have the same contents, where tables are
+compared recursively, tensors and storages are compared elementwise, and numbers
+are compared within `tolerance` (default value `0`). Other types are compared by
+strict equality. The optional `message` is used if the test fails.
+Returns whether the test passed.
+
+<a name="torch.Tester.eq"></a>
+### eq(got, expected  [, tolerance] [, message]) ###
+
+Convenience function; does the same as
+[assertGeneralEq](#torch.Tester.assertGeneralEq).
+
+<a name="torch.Tester.assertGeneralNe"></a>
+### assertGeneralNe(got, unexpected  [, tolerance] [, message]) ###
+
+General inequality check between numbers, tables, strings, `torch.Tensor`
+objects, `torch.Storage` objects, etc.
+
+Check that `got` and `unexpected` have different contents, where tables are
+compared recursively, tensors and storages are compared elementwise, and numbers
+are compared within `tolerance` (default value `0`). Other types are compared by
+strict equality. The optional `message` is used if the test fails.
+Returns whether the test passed.
+
+<a name="torch.Tester.ne"></a>
+### ne(got, unexpected  [, tolerance] [, message]) ###
+
+Convenience function; does the same as
+[assertGeneralNe](#torch.Tester.assertGeneralNe).
+
+<a name="torch.Tester.assertlt"></a>
+### assertlt(a, b [, message]) ###
+
+Check that `a < b` (using the optional `message` if the test fails),
+where `a` and `b` are numbers.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertgt"></a>
+### assertgt(a, b [, message]) ###
+
+Check that `a > b` (using the optional `message` if the test fails),
+where `a` and `b` are numbers.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertle"></a>
+### assertle(a, b [, message]) ###
+
+Check that `a <= b` (using the optional `message` if the test fails),
+where `a` and `b` are numbers.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertge"></a>
+### assertge(a, b [, message]) ###
+
+Check that `a >= b` (using the optional `message` if the test fails),
+where `a` and `b` are numbers.
+Returns whether the test passed.
+
+<a name="torch.Tester.asserteq"></a>
+### asserteq(a, b [, message]) ###
+
+Check that `a == b` (using the optional `message` if the test fails).
+Note that this uses the generic lua equality check, so objects such as tensors
+that have the same content but are distinct objects will fail this test;
+consider using [assertGeneralEq()](#torch.Tester.assertGeneralEq) instead.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertne"></a>
+### assertne(a, b [, message]) ###
+
+Check that `a ~= b` (using the optional `message` if the test fails).
+Note that this uses the generic lua inequality check, so objects such as tensors
+that have the same content but are distinct objects will pass this test;
+consider using [assertGeneralNe()](#torch.Tester.assertGeneralNe) instead.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertalmosteq"></a>
+### assertalmosteq(a, b [, tolerance] [, message]) ###
+
+Check that `|a - b| <= tolerance` (using the optional `message` if the
+test fails), where `a` and `b` are numbers, and `tolerance` is an optional
+number (default `1e-16`).
+Returns whether the test passed.
+
+<a name="torch.Tester.assertTensorEq"></a>
+### assertTensorEq(ta, tb [, tolerance] [, message]) ###
+
+Check that `max(abs(ta - tb)) <= tolerance` (using the optional `message`
+if the test fails), where `ta` and `tb` are tensors, and `tolerance` is an
+optional number (default `1e-16`). Tensors that are different types or sizes
+will cause this check to fail.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertTensorNe"></a>
+### assertTensorNe(ta, tb [, tolerance] [, message]) ###
+
+Check that `max(abs(ta - tb)) > tolerance` (using the optional `message`
+if the test fails), where `ta` and `tb` are tensors, and `tolerance` is an
+optional number (default `1e-16`). Tensors that are different types or sizes
+will cause this check to pass.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertTableEq"></a>
+### assertTableEq(ta, tb [, tolerance] [, message]) ###
+
+Check that the two tables have the same contents, comparing them
+recursively, where objects such as tensors are compared using their contents.
+Numbers (such as those appearing in tensors) are considered equal if
+their difference is at most the given tolerance.
+
+<a name="torch.Tester.assertTableNe"></a>
+### assertTableNe(ta, tb [, tolerance] [, message]) ###
+
+Check that the two tables have distinct contents, comparing them
+recursively, where objects such as tensors are compared using their contents.
+Numbers (such as those appearing in tensors) are considered equal if
+their difference is at most the given tolerance.
+
+<a name="torch.Tester.assertError"></a>
+### assertError(f [, message]) ###
+
+Check that calling `f()` (via `pcall`) raises an error (using the
+optional `message` if the test fails).
+Returns whether the test passed.
+
+<a name="torch.Tester.assertNoError"></a>
+### assertNoError(f [, message]) ###
+
+Check that calling `f()` (via `pcall`) does not raise an error (using the
+optional `message` if the test fails).
+Returns whether the test passed.
+
+<a name="torch.Tester.assertErrorMsg"></a>
+### assertErrorMsg(f, errmsg [, message]) ###
+
+Check that calling `f()` (via `pcall`) raises an error with the specific error
+message `errmsg` (using the optional `message` if the test fails).
+Returns whether the test passed.
+
+<a name="torch.Tester.assertErrorPattern"></a>
+### assertErrorPattern(f, errPattern [, message]) ###
+
+Check that calling `f()` (via `pcall`) raises an error matching `errPattern`
+(using the optional `message` if the test fails).
+The matching is done using `string.find`; in particular substrings will match.
+Returns whether the test passed.
+
+<a name="torch.Tester.assertErrorObj"></a>
+### assertErrorObj(f, errcomp [, message]) ###
+
+Check that calling `f()` (via `pcall`) raises an error object `err` such that
+calling `errcomp(err)` returns true (using the optional `message` if the test
+fails).
+Returns whether the test passed.
+
+<a name="torch.Tester.setEarlyAbort"></a>
+### setEarlyAbort(earlyAbort) ###
+
+If `earlyAbort == true` then the testing will stop on the first test failure.
+By default this is off.
+
+<a name="torch.Tester.setRethrowErrors"></a>
+### setRethrowErrors(rethrowErrors) ###
+
+If `rethrowErrors == true` then lua errors encountered during the execution of
+the tests will be rethrown, instead of being caught by the tester.
+By default this is off.
+
+<a name="torch.Tester.setSummaryOnly"></a>
+### setSummaryOnly(summaryOnly) ###
+
+If `summaryOnly == true`, then only the pass / fail status of the tests will be
+printed out, rather than full error messages. By default, this is off.
+
+
+<a name="torch.TestSuite.dok"></a>
+# TestSuite #
+
+A TestSuite is used in conjunction with [Tester](#torch.Tester.dok). It is
+created via `torch.TestSuite()`, and behaves like a plain lua table,
+except that it also checks that duplicate tests are not created.
+It is recommended that you always use a TestSuite instead of a plain table for
+your tests.
+
+The following example code attempts to add a function with the same name
+twice to a TestSuite (a surprisingly common mistake), which gives an error.
+
+```lua
+> test = torch.TestSuite()
+>
+> function test.myTest()
+>    -- ...
+> end
+>
+> -- ...
+>
+> function test.myTest()
+>    -- ...
+> end
+torch/TestSuite.lua:16: Test myTest is already defined.
+```
+
diff --git a/doc/timer.md b/doc/timer.md
new file mode 100644
index 0000000..aa6aba2
--- /dev/null
+++ b/doc/timer.md
@@ -0,0 +1,47 @@
+<a name="torch.Timer.dok"></a>
+# Timer #
+
+This class is able to measure time (in seconds) elapsed in a particular period. Example:
+```lua
+  timer = torch.Timer() -- the Timer starts to count now
+  x = 0
+  for i=1,1000000 do
+    x = x + math.sin(x)
+  end
+  print('Time elapsed for 1,000,000 sin: ' .. timer:time().real .. ' seconds')
+```
+
+<a name="torch.Timer"></a>
+## Timer Class Constructor and Methods ##
+
+<a name="torch.Timer"></a>
+### torch.Timer() ###
+
+Returns a new `Timer`. The timer starts to count the time now.
+
+<a name="torch.Timer.reset"></a>
+### [self] reset() ###
+
+Reset the timer accumulated time to `0`. If the timer was running, the timer
+restarts to count the time now. If the timer was stopped, it stays stopped.
+
+<a name="torch.Timer.resume"></a>
+### [self] resume() ###
+
+Resume a stopped timer. The timer restarts counting the time, adding it
+to the accumulated time already counted before being stopped.
+
+<a name="torch.Timer.stop"></a>
+### [self] stop() ###
+
+Stop the timer. The accumulated time counted until now is stored.
+
+<a name="torch.Timer.time"></a>
+### [table] time() ###
+
+Returns a table reporting the accumulated time elapsed until now. Following the UNIX shell `time` command,
+there are three fields in the table:
+  * `real`: the wall-clock elapsed time.
+  * `user`: the elapsed CPU time. Note that the CPU time of a threaded program sums time spent in all threads.
+  * `sys`: the time spent in system usage.
+
diff --git a/doc/utility.md b/doc/utility.md
new file mode 100644
index 0000000..6b7397b
--- /dev/null
+++ b/doc/utility.md
@@ -0,0 +1,327 @@
+<a name="torch.utility.dok"></a>
+# Torch utility functions #
+
+These functions are used in all Torch package for creating and handling classes.
+The most interesting function is probably [`torch.class()`](#torch.class) which allows
+the user to create easily new classes. [`torch.typename()`](#torch.typename) might
+also be interesting to check what is the class of a given *Torch7* object.
+
+The other functions are for more advanced users.
+
+
+<a name="torch.class"></a>
+### [metatable] torch.class(name, [parentName], [module]) ###
+
+Creates a new `Torch` class called `name`. If `parentName` is provided, the class will inherit
+`parentName` methods. A class is a table which has a particular metatable.
+
+If `module` is not provided and if `name` is of the form
+`package.className` then the class `className` will be added to the
+specified `package`. In that case, `package` has to be a valid (and
+already loaded) package. If `name` does not contain any `.`, then the class
+will be defined in the global environment.
+
+If a `module` table is provided, the class will be defined in this table at
+key `className`.
+
+One \[or two\] (meta)tables are returned. These tables contain all the methods
+provided by the class [and its parent class if it has been provided]. After
+a call to `torch.class()` you have to properly fill up the metatable.
+
+After the class definition is complete, constructing a new class `name` will be achieved by a call to `name()`.
+This call will first call the method `__init()` if it exists, passing all arguments of `name()`.
+
+```lua
+-- for naming convenience
+do
+   --- creates a class "Foo"
+   local Foo = torch.class('Foo')
+
+   --- the initializer
+   function Foo:__init()
+      self.contents = 'this is some text'
+   end
+
+   --- a method
+   function Foo:print()
+      print(self.contents)
+   end
+
+   --- another one
+   function Foo:bip()
+      print('bip')
+   end
+
+end
+
+--- now create an instance of Foo
+foo = Foo()
+
+--- try it out
+foo:print()
+
+--- create a class torch.Bar which
+--- inherits from Foo
+do
+   local Bar, parent = torch.class('torch.Bar', 'Foo')
+
+   --- the initializer
+   function Bar:__init(stuff)
+      --- call the parent initializer on ourself
+      parent.__init(self)
+
+      --- do some stuff
+      self.stuff = stuff
+   end
+
+   --- a new method
+   function Bar:boing()
+      print('boing!')
+   end
+
+   --- override parent's method
+   function Bar:print()
+      print(self.contents)
+      print(self.stuff)
+   end
+end
+
+--- create a new instance and use it
+bar = torch.Bar('ha ha!')
+bar:print() -- overridden method
+bar:boing() -- child method
+bar:bip()   -- parent's method
+```
+
+For advanced users, it is worth mentioning that `torch.class()` actually
+calls [`torch.newmetatable()`](#torch.newmetatable) with a particular
+constructor. The constructor creates a Lua table and sets the right
+metatable on it, and then calls `__init()` if it exists in the
+metatable. It also sets a [factory](#torch.factory) field `__factory` such that it
+is possible to create an empty object of this class.
+
+
+<a name="torch.type"></a>
+### [string] torch.type(object) ###
+
+Checks if `object` has a metatable. If it does, and if it corresponds to a
+`Torch` class, then returns a string containing the name of the
+class. Otherwise, it returns the Lua `type(object)` of the object.
+Unlike [`torch.typename()`](#torch.typename), all outputs are strings:
+
+```lua
+> torch.type(torch.Tensor())
+torch.DoubleTensor
+> torch.type({})
+table
+> torch.type(7)
+number
+```
+
+
+<a name="torch.typename"></a>
+### [string] torch.typename(object) ###
+
+Checks if `object` has a metatable. If it does, and if it corresponds to a
+`Torch` class, then returns a string containing the name of the
+class. Returns `nil` in any other cases.
+
+```lua
+> torch.typename(torch.Tensor())
+torch.DoubleTensor
+> torch.typename({})
+
+> torch.typename(7)
+
+```
+
+A Torch class is a class created with [`torch.class()`](#torch.class) or
+[`torch.newmetatable()`](#torch.newmetatable).
+
+
+<a name="torch.typename2id"></a>
+### [userdata] torch.typename2id(string) ###
+
+Given a Torch class name specified by `string`, returns a unique
+corresponding id (defined by a `lightuserdata` pointing on the internal
+structure of the class). This might be useful to do a *fast* check of the
+class of an object (if used with [`torch.id()`](#torch.id)), avoiding string
+comparisons.
+
+Returns `nil` if `string` does not specify a Torch object.
+
+
+<a name="torch.id"></a>
+### [userdata] torch.id(object) ###
+
+Returns a unique id corresponding to the `class` of the given *Torch7* object.
+The id is defined by a `lightuserdata` pointing on the internal structure
+of the class.
+
+Returns `nil` if `object` is not a Torch object.
+
+This is different from the `object` id returned by [`torch.pointer()`](#torch.pointer).
+
+
+<a name="torch.isTypeOf"></a>
+### [boolean] isTypeOf(object, typeSpec) ###
+
+Checks if a given `object` is an instance of the type specified by `typeSpec`.
+`typeSpec` can be a string (including a `string.find` pattern) or the constructor
+object for a Torch class. This function traverses up the class hierarchy,
+so if b is an instance of B which is a subclass of A, then
+`torch.isTypeOf(b, B)` and `torch.isTypeOf(b, A)` will both return `true`.
+
+
+<a name="torch.newmetatable"></a>
+### [table] torch.newmetatable(name, parentName, constructor) ###
+
+Register a new metatable as a Torch type with the given string `name`. The new metatable is returned.
+
+If the string `parentName` is not `nil` and is a valid Torch type (previously created
+by `torch.newmetatable()`) then set the corresponding metatable as a metatable to the returned new
+metatable.
+
+If the given `constructor` function is not `nil`, then assign to the variable `name` the given constructor.
+The given `name` might be of the form `package.className`, in which case the `className` will be local to the
+specified `package`. In that case, `package` must be a valid and already loaded package.
+
+
+<a name="torch.factory"></a>
+### [function] torch.factory(name) ###
+
+Returns the factory function of the Torch class `name`. If the class name is invalid or if the class
+has no factory, then returns `nil`.
+
+A Torch class is a class created with [`torch.class()`](#torch.class) or
+[`torch.newmetatable()`](#torch.newmetatable).
+
+A factory function is able to return a new (empty) object of its corresponding class. This is helpful for
+[object serialization](file.md#torch.File.serialization).
+
+
+<a name="torch.getmetatable"></a>
+### [table] torch.getmetatable(string) ###
+
+Given a `string`, returns a metatable corresponding to the Torch class described
+by `string`. Returns `nil` if the class does not exist.
+
+A Torch class is a class created with [`torch.class()`](#torch.class) or
+[`torch.newmetatable()`](#torch.newmetatable).
+
+Example:
+
+```lua
+> for k, v in pairs(torch.getmetatable('torch.CharStorage')) do print(k, v) end
+
+__index__       function: 0x1a4ba80
+__typename      torch.CharStorage
+write           function: 0x1a49cc0
+__tostring__    function: 0x1a586e0
+__newindex__    function: 0x1a4ba40
+string          function: 0x1a4d860
+__version       1
+read            function: 0x1a4d840
+copy            function: 0x1a49c80
+__len__         function: 0x1a37440
+fill            function: 0x1a375c0
+resize          function: 0x1a37580
+__index         table: 0x1a4a080
+size            function: 0x1a4ba20
+```
+
+
+<a name="torch.isequal"></a>
+### [boolean] torch.isequal(object1, object2) ###
+
+If the two objects given as arguments are *Lua* tables (or *Torch7* objects), then returns `true` if and only if the
+tables (or Torch objects) have the same address in memory. Returns `false` in any other cases.
+
+A Torch class is a class created with [`torch.class()`](#torch.class) or
+[`torch.newmetatable()`](#torch.newmetatable).
+
+
+<a name="torch.getdefaulttensortype"></a>
+### [string] torch.getdefaulttensortype() ###
+
+Returns a string representing the default tensor type currently in use
+by *Torch7*.
+
+
+<a name="torch.getenv"></a>
+### [table] torch.getenv(function or userdata) ###
+
+Returns the Lua `table` environment of the given `function` or the given
+`userdata`.  To know more about environments, please read the documentation
+of [`lua_setfenv()`](http://www.lua.org/manual/5.1/manual.html#lua_setfenv)
+and [`lua_getfenv()`](http://www.lua.org/manual/5.1/manual.html#lua_getfenv).
+
+
+<a name="torch.version"></a>
+### [number] torch.version(object) ###
+
+Returns the field `__version` of a given object. This might
+be helpful to handle variations in a class over time.
+
+
+<a name="torch.pointer"></a>
+### [number] torch.pointer(object) ###
+
+Returns a unique id (pointer) of the given `object`, which can be a *Torch7*
+object, a table, a thread or a function.
+
+This is different from the `class` id returned by [`torch.id()`](#torch.id).
+
+
+<a name="torch.setdefaulttensortype"></a>
+### torch.setdefaulttensortype([typename]) ###
+
+Sets the default tensor type for all the tensors allocated from this
+point on. Valid types are:
+
+  * `torch.ByteTensor`
+  * `torch.CharTensor`
+  * `torch.ShortTensor`
+  * `torch.IntTensor`
+  * `torch.FloatTensor`
+  * `torch.DoubleTensor`
+
+
+<a name="torch.setenv"></a>
+### torch.setenv(function or userdata, table) ###
+
+Assign `table` as the Lua environment of the given `function` or the given
+`userdata`.  To know more about environments, please read the documentation
+of [`lua_setfenv()`](http://www.lua.org/manual/5.1/manual.html#lua_setfenv)
+and [`lua_getfenv()`](http://www.lua.org/manual/5.1/manual.html#lua_getfenv).
+
+
+<a name="torch.setmetatable"></a>
+### [object] torch.setmetatable(table, classname) ###
+
+Set the metatable of the given `table` to the metatable of the Torch
+object named `classname`.  This function has to be used with a lot
+of care.
+
+
+<a name="torch.getconstructortable"></a>
+### [table] torch.getconstructortable(string) ###
+
+BUGGY
+Return the constructor table of the Torch class specified by `string`.
+
+
+<a name="torch.totable"></a>
+### [table] torch.totable(object) ###
+
+Converts a Tensor or a Storage to a lua table. Also available as methods: `tensor:totable()` and `storage:totable()`.
+Multidimensional Tensors are converted to a set of nested tables, matching the shape of the source Tensor.
+
+```lua
+> print(torch.totable(torch.Tensor({1, 2, 3})))
+{
+  1 : 1
+  2 : 2
+  3 : 3
+}
+```
diff --git a/general.h b/general.h
new file mode 100644
index 0000000..4896adf
--- /dev/null
+++ b/general.h
@@ -0,0 +1,28 @@
+/* general.h: common includes and portability shims shared by the torch C sources. */
+#ifndef TORCH_GENERAL_INC
+#define TORCH_GENERAL_INC
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "luaT.h"
+#include "TH.h"
+
+/* MSVC/MinGW lack the POSIX names for these functions.
+   NOTE(review): _snprintf does not NUL-terminate on truncation, unlike C99
+   snprintf -- callers relying on termination should be audited. */
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+
+#define snprintf _snprintf
+#define popen _popen
+#define pclose _pclose
+
+#endif
+
+#if LUA_VERSION_NUM >= 503
+/* one can simply enable LUA_COMPAT_5_2 to be backward compatible.
+However, this does not work when we are trying to use system-installed lua,
+hence these redefines
+*/
+#define luaL_optlong(L,n,d)     ((long)luaL_optinteger(L, (n), (d)))
+#define luaL_checklong(L,n)     ((long)luaL_checkinteger(L, (n)))
+#define luaL_checkint(L,n)      ((int)luaL_checkinteger(L, (n)))
+#endif
+
+#endif
diff --git a/generic/Storage.c b/generic/Storage.c
new file mode 100644
index 0000000..287796b
--- /dev/null
+++ b/generic/Storage.c
@@ -0,0 +1,286 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Storage.c"
+#else
+
+#include "luaG.h"
+
+/* Constructor: torch.Storage([allocator][, ...]).  Dispatches on the type of
+   the first non-allocator argument:
+     string            -> memory-map the named file (optional shared flag, size)
+     table             -> new storage filled from the table's numeric elements
+     userdata          -> view into an existing storage (optional offset, size)
+     (number, number)  -> wrap an externally allocated pointer of given size
+     otherwise         -> fresh storage of the given (default 0) size */
+static int torch_Storage_(new)(lua_State *L)
+{
+  int index = 1;
+  THStorage *storage;
+  THAllocator *allocator = luaT_toudata(L, index, "torch.Allocator");
+  if (allocator) index++;
+
+  if(lua_type(L, index) == LUA_TSTRING)
+  {
+    if (allocator)
+      THError("Passing allocator not supported when using file mapping");
+
+    const char *fileName = luaL_checkstring(L, index);
+    int isShared = 0;
+    if(luaT_optboolean(L, index + 1, 0))
+      isShared = TH_ALLOCATOR_MAPPED_SHARED;
+    long size = luaL_optlong(L, index + 2, 0);
+    if (isShared && luaT_optboolean(L, index + 3, 0))
+      isShared = TH_ALLOCATOR_MAPPED_SHAREDMEM;
+    storage = THStorage_(newWithMapping)(fileName, size, isShared);
+  }
+  else if(lua_type(L, index) == LUA_TTABLE)
+  {
+    long size = lua_objlen(L, index);
+    long i;
+    if (allocator)
+      storage = THStorage_(newWithAllocator)(size, allocator, NULL);
+    else
+      storage = THStorage_(newWithSize)(size);
+    /* copy elements one by one; free the half-built storage on a bad element */
+    for(i = 1; i <= size; i++)
+    {
+      lua_rawgeti(L, index, i);
+      if(!lua_isnumber(L, -1))
+      {
+        THStorage_(free)(storage);
+        luaL_error(L, "element at index %d is not a number", i);
+      }
+      THStorage_(set)(storage, i-1, (real)lua_tonumber(L, -1));
+      lua_pop(L, 1);
+    }
+  }
+  else if(lua_type(L, index) == LUA_TUSERDATA)
+  {
+    if (allocator)
+      THError("Passing allocator not supported when using storage views");
+
+    THStorage *src = luaT_checkudata(L, index, torch_Storage);
+    real *ptr = src->data;
+    /* 1-based Lua offset converted to a 0-based C offset */
+    long offset = luaL_optlong(L, index + 1, 1) - 1;
+    if (offset < 0 || offset >= src->size) {
+      luaL_error(L, "offset out of bounds");
+    }
+    long size = luaL_optlong(L, index + 2, src->size - offset);
+    if (size < 1 || size > (src->size - offset)) {
+      luaL_error(L, "size out of bounds");
+    }
+    storage = THStorage_(newWithData)(ptr + offset, size);
+    /* keep the parent storage alive for as long as the view exists */
+    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_VIEW;
+    storage->view = src;
+    THStorage_(retain)(storage->view);
+  }
+  else if(lua_type(L, index + 1) == LUA_TNUMBER)
+  {
+    long size = luaL_optlong(L, index, 0);
+    /* NOTE(review): reinterprets a Lua integer as a raw data pointer; only
+       meaningful when the caller passes a valid address (e.g. obtained via FFI) */
+    real *ptr = (real *)luaL_optlong(L, index + 1, 0);
+    if (allocator)
+      storage = THStorage_(newWithDataAndAllocator)(ptr, size, allocator, NULL);
+    else
+      storage = THStorage_(newWithData)(ptr, size);
+    storage->flag = TH_STORAGE_REFCOUNTED;
+  }
+  else
+  {
+    long size = luaL_optlong(L, index, 0);
+    if (allocator)
+      storage = THStorage_(newWithAllocator)(size, allocator, NULL);
+    else
+      storage = THStorage_(newWithSize)(size);
+  }
+  luaT_pushudata(L, storage, torch_Storage);
+  return 1;
+}
+
+/* Manually increment the storage's reference count. */
+static int torch_Storage_(retain)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THStorage_(retain)(storage);
+  return 0;
+}
+
+/* Manually decrement the reference count (also the luaT destructor). */
+static int torch_Storage_(free)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THStorage_(free)(storage);
+  return 0;
+}
+
+/* storage:resize(size) -> storage (existing contents are not guaranteed kept). */
+static int torch_Storage_(resize)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  long size = luaL_checklong(L, 2);
+/*  int keepContent = luaT_optboolean(L, 3, 0); */
+  THStorage_(resize)(storage, size);/*, keepContent); */
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* storage:copy(src) -> storage.  Accepts any torch.*Storage source type and
+   dispatches to the matching element-converting TH copy routine. */
+static int torch_Storage_(copy)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  void *src;
+  if( (src = luaT_toudata(L, 2, torch_Storage)) )
+    THStorage_(copy)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
+    THStorage_(copyByte)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
+    THStorage_(copyChar)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
+    THStorage_(copyShort)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
+    THStorage_(copyInt)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
+    THStorage_(copyLong)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
+    THStorage_(copyFloat)(storage, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
+    THStorage_(copyDouble)(storage, src);
+  else
+    luaL_typerror(L, 2, "torch.*Storage");
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* storage:fill(value) -> storage.  Sets every element to value. */
+static int torch_Storage_(fill)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  real value = luaG_(checkreal)(L, 2);
+  THStorage_(fill)(storage, value);
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* Returns the size in bytes of one element of this storage type. */
+static int torch_Storage_(elementSize)(lua_State *L)
+{
+  luaT_pushlong(L, THStorage_(elementSize)());
+  return 1;
+}
+
+/* Length operator / storage:size() -> number of elements. */
+static int torch_Storage_(__len__)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  luaT_pushlong(L, storage->size);
+  return 1;
+}
+
+/* storage[i] = value (1-based index).  Pushes true when the write was handled,
+   false to let luaT fall back to the default newindex behaviour. */
+static int torch_Storage_(__newindex__)(lua_State *L)
+{
+  if(lua_isnumber(L, 2))
+  {
+    THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+    long index = luaL_checklong(L, 2) - 1;
+    real number = luaG_(checkreal)(L, 3);
+    THStorage_(set)(storage, index, number);
+    lua_pushboolean(L, 1);
+  }
+  else
+    lua_pushboolean(L, 0);
+
+  return 1;
+}
+
+/* storage[i] read access (1-based).  Returns (value, true) when handled,
+   false otherwise so luaT falls back to method lookup. */
+static int torch_Storage_(__index__)(lua_State *L)
+{
+  if(lua_isnumber(L, 2))
+  {
+    THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+    long index = luaL_checklong(L, 2) - 1;
+    luaG_(pushreal)(L, THStorage_(get)(storage, index));
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else
+  {
+    lua_pushboolean(L, 0);
+    return 1;
+  }
+}
+
+#if defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_BYTE)
+/* Char/Byte storages only: storage:string([str]).  With a string argument,
+   resizes the storage and copies the bytes in, returning the storage;
+   without one, returns the storage content as a Lua string.
+   NOTE(review): the string argument is read from the top of the stack (-1),
+   not from a fixed index. */
+static int torch_Storage_(string)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  if(lua_isstring(L, -1))
+  {
+    size_t len = 0;
+    const char *str = lua_tolstring(L, -1, &len);
+    THStorage_(resize)(storage, len);
+    memmove(storage->data, str, len);
+    lua_settop(L, 1);
+  }
+  else
+    lua_pushlstring(L, (char*)storage->data, storage->size);
+
+  return 1; /* either storage or string */
+}
+#endif
+
+/* storage:totable() -> Lua array table of the storage's elements. */
+static int torch_Storage_(totable)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  long i;
+
+  lua_newtable(L);
+  for(i = 0; i < storage->size; i++)
+  {
+    luaG_(pushreal)(L, storage->data[i]);
+    lua_rawseti(L, -2, i+1);
+  }
+  return 1;
+}
+
+/* luaT factory: create an empty storage (used e.g. during deserialization). */
+static int torch_Storage_(factory)(lua_State *L)
+{
+  THStorage *storage = THStorage_(new)();
+  luaT_pushudata(L, storage, torch_Storage);
+  return 1;
+}
+
+/* Serialize to a torch.File: element count followed by the raw element data. */
+static int torch_Storage_(write)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  THFile_writeLongScalar(file, storage->size);
+  THFile_writeRealRaw(file, storage->data, storage->size);
+
+  return 0;
+}
+
+/* Deserialize from a torch.File: read the count, resize, then read raw data. */
+static int torch_Storage_(read)(lua_State *L)
+{
+  THStorage *storage = luaT_checkudata(L, 1, torch_Storage);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+  long size = THFile_readLongScalar(file);
+
+  THStorage_(resize)(storage, size);
+  THFile_readRealRaw(file, storage->data, storage->size);
+
+  return 0;
+}
+
+/* Method table registered on the torch.Storage metatable. */
+static const struct luaL_Reg torch_Storage_(_) [] = {
+  {"retain", torch_Storage_(retain)},
+  {"free", torch_Storage_(free)},
+  {"size", torch_Storage_(__len__)},
+  {"elementSize", torch_Storage_(elementSize)},
+  {"__len__", torch_Storage_(__len__)},
+  {"__newindex__", torch_Storage_(__newindex__)},
+  {"__index__", torch_Storage_(__index__)},
+  {"resize", torch_Storage_(resize)},
+  {"fill", torch_Storage_(fill)},
+  {"copy", torch_Storage_(copy)},
+  {"totable", torch_Storage_(totable)},
+  {"write", torch_Storage_(write)},
+  {"read", torch_Storage_(read)},
+#if defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_BYTE)
+  {"string", torch_Storage_(string)},
+#endif
+  {NULL, NULL}
+};
+
+/* Register the torch.Storage metatable: constructor, destructor, factory
+   and the method table above. */
+void torch_Storage_(init)(lua_State *L)
+{
+  luaT_newmetatable(L, torch_Storage, NULL,
+                    torch_Storage_(new), torch_Storage_(free), torch_Storage_(factory));
+  luaT_setfuncs(L, torch_Storage_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
diff --git a/generic/Tensor.c b/generic/Tensor.c
new file mode 100644
index 0000000..0bf74e1
--- /dev/null
+++ b/generic/Tensor.c
@@ -0,0 +1,1323 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tensor.c"
+#else
+
+#include "luaG.h"
+
+/* Forward declarations for the shared argument parsers defined later in this
+   file: they read an optional (tensor | storage | sizes [, strides]) argument
+   list starting at `index`. */
+static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
+                                                         THStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_);
+
+static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_);
+
+/* tensor:size([dim]) -> size of dimension dim (1-based), or a LongStorage of
+   all sizes when dim is omitted. */
+static int torch_Tensor_(size)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(lua_isnumber(L,2))
+  {
+    int dim = luaL_checkint(L, 2)-1;
+    THArgCheck(dim >= 0 && dim < tensor->nDimension, 2, "dimension %d out of range of %dD tensor",
+        dim+1, THTensor_(nDimension)(tensor));
+    luaT_pushlong(L, tensor->size[dim]);
+  }
+  else
+  {
+    THLongStorage *size = THTensor_(newSizeOf)(tensor);
+    luaT_pushudata(L, size, "torch.LongStorage");
+  }
+  return 1;
+}
+
+/* Returns the size in bytes of one tensor element. */
+static int torch_Tensor_(elementSize)(lua_State *L)
+{
+  luaT_pushlong(L, THStorage_(elementSize)());
+  return 1;
+}
+
+/* tensor:stride([dim]) -> stride of dimension dim (1-based), or a LongStorage
+   of all strides when dim is omitted. */
+static int torch_Tensor_(stride)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(lua_isnumber(L,2))
+  {
+    int dim = luaL_checkint(L, 2)-1;
+    THArgCheck(dim >= 0 && dim < tensor->nDimension, 2, "dimension %d out of range of %dD tensor",
+        dim+1, THTensor_(nDimension)(tensor));
+    luaT_pushlong(L, tensor->stride[dim]);
+  }
+  else
+  {
+    THLongStorage *storage = THLongStorage_newWithSize(tensor->nDimension);
+    memmove(storage->data, tensor->stride, sizeof(long)*tensor->nDimension);
+    luaT_pushudata(L, storage, "torch.LongStorage");
+  }
+  return 1;
+}
+
+/* tensor:nDimension() -> number of dimensions. */
+static int torch_Tensor_(nDimension)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  luaT_pushlong(L, tensor->nDimension);
+  return 1;
+}
+
+/* tensor:storage() -> the underlying storage (retained), or nil if none. */
+static int torch_Tensor_(storage)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  if(tensor->storage)
+  {
+    THStorage_(retain)(tensor->storage);
+    luaT_pushudata(L, tensor->storage, torch_Storage);
+  }
+  else
+    lua_pushnil(L);
+
+  return 1;
+}
+
+/* tensor:storageOffset() -> 1-based offset of the tensor into its storage. */
+static int torch_Tensor_(storageOffset)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  luaT_pushlong(L, tensor->storageOffset+1);
+  return 1;
+}
+
+/* Constructor: torch.Tensor(...).  With a (possibly nested) table argument,
+   infers the sizes from the nesting depth and copies the numbers in, walking
+   the nested tables with an odometer-style counter.  Otherwise delegates to
+   c_readTensorStorageSizeStride to interpret (tensor | storage | sizes). */
+static int torch_Tensor_(new)(lua_State *L)
+{
+  THTensor *tensor;
+  long storageOffset;
+  THLongStorage *size, *stride;
+
+  if(lua_type(L, 1) == LUA_TTABLE)
+  {
+    long i, j;
+    THLongStorage *counter;
+    long si = 0;
+    int dimension = 0;
+    int is_finished = 0;
+
+    lua_settop(L, 1);
+    size = THLongStorage_new();
+
+    /* descend into the first element of each nesting level to read the sizes */
+    while( (lua_type(L, -1) == LUA_TTABLE) && (lua_objlen(L, -1) > 0) )
+    {
+      THLongStorage_resize(size, dimension+1);
+      size->data[dimension] = lua_objlen(L, -1);
+      dimension++;
+      lua_rawgeti(L, -1, 1);
+    }
+    lua_pop(L, 1);
+
+    counter = THLongStorage_newWithSize(size->size);
+    THLongStorage_fill(counter, 0);
+
+    tensor = THTensor_(newWithSize)(size, NULL);
+
+    if(size->size == 0)
+      is_finished = 1;
+
+    while(!is_finished)
+    {
+      if(!lua_istable(L, -1))
+      {
+        THLongStorage_free(size);
+        THLongStorage_free(counter);
+        THTensor_(free)(tensor);
+        THError("invalid tensor definition");
+      }
+
+      /* every innermost table must match the size read from the first one */
+      if(lua_objlen(L, -1) != size->data[size->size-1])
+      {
+        THLongStorage_free(size);
+        THLongStorage_free(counter);
+        THTensor_(free)(tensor);
+        THError("invalid tensor sizes");
+      }
+
+      /* copy one innermost row of numbers into the (contiguous) new tensor */
+      for(i = 0; i < size->data[size->size-1]; i++)
+      {
+        lua_rawgeti(L, -1, i+1);
+        if(!lua_isnumber(L, -1))
+        {
+          THLongStorage_free(size);
+          THLongStorage_free(counter);
+          THTensor_(free)(tensor);
+          THError("invalid element (not a number)");
+        }
+        THStorage_(set)(THTensor_(storage)(tensor), si++, (real)lua_tonumber(L, -1));
+        lua_pop(L, 1);
+      }
+
+      if(size->size == 1)
+        break;
+
+      /* advance the odometer counter and reposition the table stack */
+      for(i = size->size-2; i >= 0; i--)
+      {
+        if(++counter->data[i] == size->data[i])
+        {
+          if(i == 0)
+          {
+            is_finished = 1;
+            break;
+          }
+          else
+          {
+            counter->data[i] = 0;
+            lua_pop(L, 1);
+          }
+        }
+        else
+        {
+          lua_pop(L, 1);
+          for(j = i; j < size->size-1; j++)
+          {
+            if(!lua_istable(L, -1))
+            {
+              THLongStorage_free(size);
+              THLongStorage_free(counter);
+              THTensor_(free)(tensor);
+              THError("invalid tensor definition");
+            }
+            if(lua_objlen(L, -1) != size->data[j])
+            {
+              THLongStorage_free(size);
+              THLongStorage_free(counter);
+              THTensor_(free)(tensor);
+              THError("invalid tensor sizes");
+            }
+            lua_rawgeti(L, -1, counter->data[j]+1);
+          }
+          break;
+        }
+      }
+    }
+
+    THLongStorage_free(size);
+    THLongStorage_free(counter);
+  }
+  else
+  {
+    THStorage *storage;
+
+    torch_Tensor_(c_readTensorStorageSizeStride)(L, 1, 1, 1, 1, 1,
+                                                 &storage, &storageOffset, &size, &stride);
+
+    tensor = THTensor_(newWithStorage)(storage, storageOffset, size, stride);
+
+    THLongStorage_free(size);
+    THLongStorage_free(stride);
+  }
+
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* tensor:set(...) -> tensor.  Points this tensor at another tensor/storage
+   (no data copy), using the shared argument parser. */
+static int torch_Tensor_(set)(lua_State *L)
+{
+  THTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THStorage *storage;
+  long storageOffset;
+  THLongStorage *size, *stride;
+
+  torch_Tensor_(c_readTensorStorageSizeStride)(L, 2, 1, 1, 1, 1,
+                                               &storage, &storageOffset, &size, &stride);
+
+  THTensor_(setStorage)(self, storage, storageOffset, size, stride);
+
+  THLongStorage_free(size);
+  THLongStorage_free(stride);
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* tensor:clone() -> new tensor with its own copy of the data. */
+static int torch_Tensor_(clone)(lua_State *L)
+{
+  THTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  self = THTensor_(newClone)(self);
+  luaT_pushudata(L, self, torch_Tensor);
+  return 1;
+}
+
+/* tensor:contiguous() -> a contiguous tensor (may share data if already so). */
+static int torch_Tensor_(contiguous)(lua_State *L)
+{
+  THTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  self = THTensor_(newContiguous)(self);
+  luaT_pushudata(L, self, torch_Tensor);
+  return 1;
+}
+
+/* Resize */
+/* tensor:resizeAs(src) -> tensor resized to src's dimensions. */
+static int torch_Tensor_(resizeAs)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *src = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor_(resizeAs)(tensor, src);
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* tensor:resize(sizes...) -> tensor resized to the given sizes. */
+static int torch_Tensor_(resize)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *size, *stride;
+
+  torch_Tensor_(c_readSizeStride)(L, 2, 0, &size, &stride);
+
+  THTensor_(resize)(tensor, size, stride);
+
+  THLongStorage_free(size);
+  THLongStorage_free(stride);
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* tensor:narrow(dim, firstIndex, size) -> new view narrowed along dim
+   (1-based arguments; bounds are checked inside THTensor_(narrow)). */
+static int torch_Tensor_(narrow)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long firstIndex = luaL_checklong(L, 3)-1;
+  long size = luaL_checklong(L, 4);
+
+/*  THArgCheck( (dimension >= 0) && (dimension < tensor->nDimension), 2, "out of range");
+  THArgCheck( (firstIndex >= 0) && (firstIndex < tensor->size[dimension]), 3, "out of range");
+  THArgCheck( (size > 0) && (firstIndex+size <= tensor->size[dimension]), 4, "out of range");
+*/
+  tensor = THTensor_(newWithTensor)(tensor);
+  THTensor_(narrow)(tensor, NULL, dimension, firstIndex, size);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* tensor:sub(d0s, d0e [, d1s, d1e [, ...]]) -> new view narrowed along up to
+   four dimensions.  Negative indices count from the end of the dimension. */
+static int torch_Tensor_(sub)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  long d0s = -1, d0e = -1, d1s = -1, d1e = -1, d2s = -1, d2e = -1, d3s = -1, d3e = -1;
+
+  d0s = luaL_checklong(L, 2)-1;
+  d0e = luaL_checklong(L, 3)-1;
+  if(d0s < 0)
+    d0s += tensor->size[0]+1;
+  if(d0e < 0)
+    d0e += tensor->size[0]+1;
+  THArgCheck(tensor->nDimension > 0, 2, "invalid dimension");
+  THArgCheck(d0s >= 0 && d0s < tensor->size[0], 2, "out of range");
+  THArgCheck(d0e >= 0 && d0e < tensor->size[0], 3, "out of range");
+  THArgCheck(d0e >= d0s, 3, "end smaller than beginning");
+
+  if(!lua_isnone(L, 4))
+  {
+    d1s = luaL_checklong(L, 4)-1;
+    d1e = luaL_checklong(L, 5)-1;
+    if(d1s < 0)
+      d1s += tensor->size[1]+1;
+    if(d1e < 0)
+      d1e += tensor->size[1]+1;
+    THArgCheck(tensor->nDimension > 1, 4, "invalid dimension");
+    THArgCheck(d1s >= 0 && d1s < tensor->size[1], 4, "out of range");
+    THArgCheck(d1e >= 0 && d1e < tensor->size[1], 5, "out of range");
+    THArgCheck(d1e >= d1s, 5, "end smaller than beginning");
+
+    if(!lua_isnone(L, 6))
+    {
+      d2s = luaL_checklong(L, 6)-1;
+      d2e = luaL_checklong(L, 7)-1;
+      if(d2s < 0)
+        d2s += tensor->size[2]+1;
+      if(d2e < 0)
+        d2e += tensor->size[2]+1;
+      THArgCheck(tensor->nDimension > 2, 6, "invalid dimension");
+      THArgCheck(d2s >= 0 && d2s < tensor->size[2], 6, "out of range");
+      THArgCheck(d2e >= 0 && d2e < tensor->size[2], 7, "out of range");
+      THArgCheck(d2e >= d2s, 7, "end smaller than beginning");
+
+      if(!lua_isnone(L, 8))
+      {
+        d3s = luaL_checklong(L, 8)-1;
+        d3e = luaL_checklong(L, 9)-1;
+        if(d3s < 0)
+          d3s += tensor->size[3]+1;
+        if(d3e < 0)
+          d3e += tensor->size[3]+1;
+        THArgCheck(tensor->nDimension > 3, 8, "invalid dimension");
+        THArgCheck(d3s >= 0 && d3s < tensor->size[3], 8, "out of range");
+        THArgCheck(d3e >= 0 && d3e < tensor->size[3], 9, "out of range");
+        THArgCheck(d3e >= d3s, 9, "end smaller than beginning");
+      }
+    }
+  }
+
+  tensor = THTensor_(newWithTensor)(tensor);
+  THTensor_(narrow)(tensor, NULL, 0, d0s, d0e-d0s+1);
+  if(d1s >= 0)
+    THTensor_(narrow)(tensor, NULL, 1, d1s, d1e-d1s+1);
+  if(d2s >= 0)
+    THTensor_(narrow)(tensor, NULL, 2, d2s, d2e-d2s+1);
+  if(d3s >= 0)
+    THTensor_(narrow)(tensor, NULL, 3, d3s, d3e-d3s+1);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* tensor:select(dim, index) -> view with dimension dim removed; for a 1-D
+   tensor it returns the scalar value instead of a view. */
+static int torch_Tensor_(select)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long sliceIndex = luaL_checklong(L, 3)-1;
+
+/*   THArgCheck(src->nDimension > 1, 1, "cannot select on a vector");
+  THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "out of range");
+  THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 3, "out of range");
+*/
+
+  if(tensor->nDimension > 1)
+  {
+    tensor = THTensor_(newWithTensor)(tensor);
+    THTensor_(select)(tensor, NULL, dimension, sliceIndex);
+    luaT_pushudata(L, tensor, torch_Tensor);
+  }
+  else
+  {
+    THArgCheck(tensor->nDimension == 1, 1, "empty Tensor");
+    luaG_(pushreal)(L, THTensor_(get1d)(tensor, sliceIndex));
+  }
+
+  return 1;
+}
+
+/* indexSelect: (src, dim, index) creates a result tensor, or
+   (dst, src, dim, index) writes into dst.  Either way the result is at
+   stack position 1 when the function returns. */
+static int torch_Tensor_(indexSelect)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor, *src;
+  THLongTensor *index;
+  int dim;
+  if (narg == 3)
+  {
+    tensor = THTensor_(new)();
+    src = luaT_checkudata(L, 1, torch_Tensor);
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_checkudata(L, 3, "torch.LongTensor");
+    luaT_pushudata(L,tensor,torch_Tensor);
+  }
+  else if(narg == 4)
+  {
+    src = luaT_checkudata(L, 2, torch_Tensor);
+    dim = luaL_checkint(L, 3) - 1;
+    index = luaT_checkudata(L, 4, "torch.LongTensor");
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError(torch_Tensor ", number, torch.LongTensor | " torch_Tensor ", " torch_Tensor ", number, torch.LongTensor expected");
+    return 0;
+  }
+
+  THTensor_(indexSelect)(tensor,src,dim,index);
+
+  return 1;
+}
+
+/* tensor:indexCopy(dim, index, src) -> tensor.  Copies rows of src into
+   tensor at the positions given by index along dim. */
+static int torch_Tensor_(indexCopy)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor, *src;
+  THLongTensor *index;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_checkudata(L, 3, "torch.LongTensor");
+    src = luaT_checkudata(L, 4, torch_Tensor);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", number, torch.LongTensor, " torch_Tensor " expected");
+    return 0;
+  }
+
+  THTensor_(indexCopy)(tensor,dim,index,src);
+
+  return 1;
+}
+
+/* tensor:indexAdd(dim, index, src) -> tensor.  Accumulates rows of src into
+   tensor at the positions given by index along dim. */
+static int torch_Tensor_(indexAdd)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor, *src;
+  THLongTensor *index;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_checkudata(L, 3, "torch.LongTensor");
+    src = luaT_checkudata(L, 4, torch_Tensor);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", number, torch.LongTensor, " torch_Tensor " expected");
+    return 0;
+  }
+
+  THTensor_(indexAdd)(tensor,dim,index,src);
+
+  return 1;
+}
+
+/* tensor:indexFill(dim, index, val) -> tensor.  Fills the selected rows
+   along dim with the scalar val. */
+static int torch_Tensor_(indexFill)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor;
+  THLongTensor *index;
+  real val;
+  int dim;
+  if(narg == 4)
+  {
+    dim = luaL_checkint(L, 2) - 1;
+    index = luaT_checkudata(L, 3, "torch.LongTensor");
+    val = luaG_(checkreal)(L, 4);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", number, torch.LongTensor, number expected");
+    return 0;
+  }
+
+  THTensor_(indexFill)(tensor,dim,index,val);
+
+  return 1;
+}
+
+/* maskedSelect: (src, mask) creates a result tensor, or (dst, src, mask)
+   writes into dst.  Selects the elements of src where mask is nonzero. */
+static int torch_Tensor_(maskedSelect)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor, *src;
+  THByteTensor *mask;
+
+  if (narg == 2)
+  {
+    tensor = THTensor_(new)();
+    src = luaT_checkudata(L, 1, torch_Tensor);
+    mask = luaT_checkudata(L, 2, "torch.ByteTensor");
+    luaT_pushudata(L,tensor,torch_Tensor);
+  }
+  else if(narg == 3)
+  {
+    src = luaT_checkudata(L, 2, torch_Tensor);
+    mask = luaT_checkudata(L, 3, "torch.ByteTensor");
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", torch.ByteTensor | " torch_Tensor ", " torch_Tensor ", torch.ByteTensor expected");
+    return 0;
+  }
+
+  THTensor_(maskedSelect)(tensor,src,mask);
+
+  return 1;
+}
+
+/* tensor:maskedCopy(mask, src) -> tensor.  Copies elements of src into the
+   positions of tensor where mask is nonzero. */
+static int torch_Tensor_(maskedCopy)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor, *src;
+  THByteTensor *mask;
+
+  if(narg == 3)
+  {
+    mask = luaT_checkudata(L, 2, "torch.ByteTensor");
+    src = luaT_checkudata(L, 3, torch_Tensor);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", torch.ByteTensor, " torch_Tensor " expected");
+    return 0;
+  }
+
+  THTensor_(maskedCopy)(tensor,mask,src);
+
+  /* return destination */
+  /* pop mask and src so the destination (arg 1) is left on top */
+  lua_pop(L, 2);
+
+  return 1;
+}
+
+/* tensor:maskedFill(mask, val) -> tensor.  Fills the positions of tensor
+   where mask is nonzero with the scalar val. */
+static int torch_Tensor_(maskedFill)(lua_State *L)
+{
+  int narg = lua_gettop(L);
+  THTensor *tensor;
+  THByteTensor *mask;
+  real val;
+  if(narg == 3)
+  {
+    mask = luaT_checkudata(L, 2, "torch.ByteTensor");
+    val = luaG_(checkreal)(L, 3);
+    tensor = luaT_checkudata(L,1,torch_Tensor);
+  }
+  else
+  {
+    THError( torch_Tensor ", torch.ByteTensor, number expected");
+    return 0;
+  }
+
+  THTensor_(maskedFill)(tensor,mask,val);
+
+  return 1;
+}
+
+/* tensor:transpose(dim1, dim2) -> new view with the two dimensions swapped. */
+static int torch_Tensor_(transpose)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension1 = luaL_checkint(L, 2)-1;
+  int dimension2 = luaL_checkint(L, 3)-1;
+
+/*
+  THArgCheck( (dimension1 >= 0) && (dimension1 < src->nDimension), 2, "out of range");
+  THArgCheck( (dimension2 >= 0) && (dimension2 < src->nDimension), 3, "out of range");
+*/
+
+  tensor = THTensor_(newWithTensor)(tensor);
+  THTensor_(transpose)(tensor, NULL, dimension1, dimension2);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* tensor:t() -> transposed view of a 2-D tensor (errors otherwise). */
+static int torch_Tensor_(t)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+
+  THArgCheck(tensor->nDimension == 2, 1, "Tensor must have 2 dimensions");
+
+  tensor = THTensor_(newWithTensor)(tensor);
+  THTensor_(transpose)(tensor, NULL, 0, 1);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* tensor:unfold(dim, size, step) -> new view of all slices of the given size
+   taken every `step` elements along dim. */
+static int torch_Tensor_(unfold)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  int dimension = luaL_checkint(L, 2)-1;
+  long size = luaL_checklong(L, 3);
+  long step = luaL_checklong(L, 4);
+
+/*
+  THArgCheck( (src->nDimension > 0), 1, "cannot unfold an empty tensor");
+  THArgCheck(dimension < src->nDimension, 2, "out of range");
+  THArgCheck(size <= src->size[dimension], 3, "out of range");
+*/
+
+  tensor = THTensor_(newWithTensor)(tensor);
+  THTensor_(unfold)(tensor, NULL, dimension, size, step);
+  luaT_pushudata(L, tensor, torch_Tensor);
+  return 1;
+}
+
+/* is contiguous? [a bit like in TnXIterator] */
+static int torch_Tensor_(isContiguous)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  lua_pushboolean(L, THTensor_(isContiguous)(tensor));
+  return 1;
+}
+
+/* tensor:isSize(longStorage) -> true if the tensor has exactly these sizes. */
+static int torch_Tensor_(isSize)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *size = luaT_checkudata(L, 2, "torch.LongStorage");
+  lua_pushboolean(L, THTensor_(isSize)(tensor, size));
+  return 1;
+}
+
+/* tensor:isSameSizeAs(other) -> true if both tensors have identical sizes. */
+static int torch_Tensor_(isSameSizeAs)(lua_State *L)
+{
+  THTensor *tensor1 = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *tensor2 = luaT_checkudata(L, 2, torch_Tensor);
+  lua_pushboolean(L, THTensor_(isSameSizeAs)(tensor1, tensor2));
+  return 1;
+}
+
+/* tensor:isSetTo(other) -> true if both tensors view the same storage with
+   the same offset, sizes and strides. */
+static int torch_Tensor_(isSetTo)(lua_State *L)
+{
+  THTensor *tensor1 = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *tensor2 = luaT_checkudata(L, 2, torch_Tensor);
+  lua_pushboolean(L, THTensor_(isSetTo)(tensor1, tensor2));
+  return 1;
+}
+
+/* tensor:nElement() -> total number of elements. */
+static int torch_Tensor_(nElement)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  luaT_pushlong(L, THTensor_(nElement)(tensor));
+  return 1;
+}
+
+/* tensor:copy(src) -> tensor.  Accepts any torch.*Tensor source type and
+   dispatches to the matching element-converting TH copy routine. */
+static int torch_Tensor_(copy)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  void *src;
+  if( (src = luaT_toudata(L, 2, torch_Tensor)) )
+    THTensor_(copy)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
+    THTensor_(copyByte)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
+    THTensor_(copyChar)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
+    THTensor_(copyShort)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
+    THTensor_(copyInt)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
+    THTensor_(copyLong)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
+    THTensor_(copyFloat)(tensor, src);
+  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
+    THTensor_(copyDouble)(tensor, src);
+  else
+    luaL_typerror(L, 2, "torch.*Tensor");
+  lua_settop(L, 1);
+  return 1;
+}
+
+static int torch_Tensor_(__newindex__)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *idx = NULL;
+  THByteTensor *mask;
+
+  if(lua_isnumber(L, 2))
+  {
+    void *src;
+    long index = luaL_checklong(L,2)-1;
+    THArgCheck(tensor->nDimension > 0, 1, "empty tensor");
+    if (index < 0) index = tensor->size[0] + index + 1;
+
+    if (lua_isnumber(L,3)) {
+      real value = luaG_(checkreal)(L,3);
+      if (tensor->nDimension == 1) {
+        THArgCheck(index >= 0 && index < tensor->size[0], 2, "out of range");
+        THStorage_(set)(tensor->storage, tensor->storageOffset+index*tensor->stride[0], value);
+      } else {
+        tensor = THTensor_(newWithTensor)(tensor);
+        THTensor_(narrow)(tensor, NULL, 0, index, 1);
+        THTensor_(fill)(tensor, value);
+        THTensor_(free)(tensor);
+      }
+    } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copy)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyByte)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.CharTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyChar)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.ShortTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyShort)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.IntTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyInt)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.LongTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyLong)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.FloatTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyFloat)(tensor, src);
+      THTensor_(free)(tensor);
+    } else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(narrow)(tensor, NULL, 0, index, 1);
+      THTensor_(copyDouble)(tensor, src);
+      THTensor_(free)(tensor);
+    } else {
+      luaL_typerror(L, 3, "torch.*Tensor");
+    }
+    lua_pushboolean(L, 1);
+  }
+  else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
+  {
+    long index = THTensor_(storageOffset)(tensor);
+    real value = luaG_(checkreal)(L,3);
+    int dim;
+
+    THArgCheck(idx->size == tensor->nDimension, 2, "invalid size");
+
+    for(dim = 0; dim < idx->size; dim++)
+    {
+      long z = idx->data[dim]-1;
+      if (z < 0) z = tensor->size[dim] + z + 1;
+      THArgCheck((z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
+      index += z*tensor->stride[dim];
+    }
+
+    THStorage_(set)(tensor->storage, index, value);
+    lua_pushboolean(L, 1);
+  }
+  else if(lua_istable(L, 2))
+  {
+    int dim;
+    int cdim = 0;
+    int ndims;
+    int done = 0;
+    ndims = tensor->nDimension;
+    THArgCheck(lua_objlen(L, 2) <= ndims, 2, "too many indices provided");
+    tensor = THTensor_(newWithTensor)(tensor);
+    for(dim = 0; dim < ndims; dim++)
+    {
+      lua_rawgeti(L, 2, dim+1);
+      if(lua_isnumber(L, -1))
+      {
+        long z = lua_tonumber(L, -1)-1;
+        lua_pop(L, 1);
+        if (z < 0) z = tensor->size[cdim] + z + 1;
+        THArgCheck((z >= 0) && (z < tensor->size[cdim]), 2, "index out of bound");
+        if(tensor->nDimension == 1) {
+          real value = luaG_(checkreal)(L,3);
+          done = 1;
+          THStorage_(set)(tensor->storage, tensor->storageOffset+z*tensor->stride[0], value);
+        } else {
+          THTensor_(select)(tensor, NULL, cdim, z);
+        }
+      }
+      else if (lua_istable(L, -1))
+      {
+        long start = 0;
+        long end = tensor->size[cdim]-1;
+        lua_rawgeti(L, -1, 1);
+        if(lua_isnumber(L, -1)) {
+          start = lua_tonumber(L, -1)-1;
+          end = start;
+        }
+        lua_pop(L, 1);
+        if (start < 0) start = tensor->size[cdim] + start + 1;
+        THArgCheck((start >= 0) && (start < tensor->size[cdim]), 2, "start index out of bound");
+
+        lua_rawgeti(L, -1, 2);
+        if(lua_isnumber(L, -1)) {
+          end = lua_tonumber(L, -1)-1;
+        }
+        lua_pop(L, 2);
+        if (end < 0) end = tensor->size[cdim] + end + 1;
+        THArgCheck((end >= 0) && (end < tensor->size[cdim]), 2, "end index out of bound");
+
+        THArgCheck((end >= start), 2, "end index must be greater or equal to start index");
+
+        THTensor_(narrow)(tensor, NULL, cdim++, start, end-start+1);
+      }
+      else
+      {
+        break;
+      }
+    }
+    if(!done) {
+      /* doing a copy */
+      void *src;
+      if (lua_isnumber(L,3)) {
+        THTensor_(fill)(tensor, lua_tonumber(L,3));
+      } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) {
+        THTensor_(copy)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) {
+        THTensor_(copyByte)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.CharTensor")) ) {
+        THTensor_(copyChar)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.ShortTensor")) ) {
+        THTensor_(copyShort)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.IntTensor")) ) {
+        THTensor_(copyInt)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.LongTensor")) ) {
+        THTensor_(copyLong)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.FloatTensor")) ) {
+        THTensor_(copyFloat)(tensor, src);
+      } else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) {
+        THTensor_(copyDouble)(tensor, src);
+      } else {
+        luaL_typerror(L, 3, "torch.*Tensor");
+      }
+    }
+    THTensor_(free)(tensor);
+    lua_pushboolean(L, 1);
+  }
+  else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
+  {
+    THTensor *vals;
+    if (lua_isnumber(L, 3))
+    {
+      THTensor_(maskedFill)(tensor, mask, luaG_(checkreal)(L,3));
+    }
+    else if((vals = luaT_toudata(L, 3, torch_Tensor)))
+    {
+      THTensor_(maskedCopy)(tensor, mask, vals);
+    }
+    else
+    {
+      THError("number or " torch_Tensor " expected");
+    }
+  }
+  else
+    lua_pushboolean(L, 0);
+
+  return 1;
+}
+
+/* __index__ metamethod for torch.Tensor (t[k]).
+   On a handled key it pushes the result plus boolean true (2 returns);
+   on an unhandled key it pushes false (1 return) so luaT falls back to
+   the regular method lookup.  Supported keys: a number (select along
+   dim 0, Lua 1-based, negative counts from the end), a LongStorage of
+   full coordinates (scalar read), a table of indices/{start,end}
+   ranges (select/narrow), or a ByteTensor mask (maskedSelect). */
+static int torch_Tensor_(__index__)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THLongStorage *idx = NULL;
+  THByteTensor *mask;
+
+  if(lua_isnumber(L, 2))
+  {
+    /* t[i]: 1-based Lua index -> 0-based C index */
+    long index = luaL_checklong(L,2)-1;
+
+    THArgCheck(tensor->nDimension > 0, 1, "empty tensor");
+    if (index < 0) index = tensor->size[0] + index + 1;
+    THArgCheck(index >= 0 && index < tensor->size[0], 2, "out of range");
+
+    if(tensor->nDimension == 1)
+    {
+      /* 1-D tensor: return the scalar element directly */
+      luaG_(pushreal)(L, THStorage_(get)(tensor->storage, tensor->storageOffset+index*tensor->stride[0]));
+    }
+    else
+    {
+      /* n-D tensor: return a view selected along the first dimension */
+      tensor = THTensor_(newWithTensor)(tensor);
+      THTensor_(select)(tensor, NULL, 0, index);
+      luaT_pushudata(L, tensor, torch_Tensor);
+    }
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
+  {
+    /* t[LongStorage]: full coordinate vector, returns the scalar */
+    long index = THTensor_(storageOffset)(tensor);
+    int dim;
+
+    THArgCheck(idx->size == tensor->nDimension, 2, "invalid size");
+
+    /* accumulate the flat storage offset from the per-dim indices */
+    for(dim = 0; dim < idx->size; dim++)
+    {
+      long z = idx->data[dim]-1;
+      if (z < 0) z = tensor->size[dim] + z + 1;
+      THArgCheck((z >= 0) && (z < tensor->size[dim]), 2, "index out of bound");
+      index += z*tensor->stride[dim];
+    }
+    luaG_(pushreal)(L, (double)THStorage_(get)(THTensor_(storage)(tensor), index));
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if(lua_istable(L, 2))
+  {
+    /* t[{i, {start,end}, ...}]: mix of selects and narrows.
+       `done` is set when the selection collapses to a scalar. */
+    int dim;
+    int cdim = 0;
+    int ndims;
+    int done = 0;
+
+    ndims = tensor->nDimension;
+    THArgCheck(lua_objlen(L, 2) <= ndims, 2, "too many indices provided");
+    tensor = THTensor_(newWithTensor)(tensor);
+
+    for(dim = 0; dim < ndims; dim++)
+    {
+      lua_rawgeti(L, 2, dim+1);
+      if(lua_isnumber(L, -1))
+      {
+        long z = lua_tonumber(L, -1)-1;
+        lua_pop(L, 1);
+        if (z < 0) z = tensor->size[cdim] + z + 1;
+        THArgCheck((z >= 0) && (z < tensor->size[cdim]), 2, "index out of bound");
+        if(tensor->nDimension == 1) {
+          /* last remaining dimension: produce the scalar */
+          done = 1;
+          luaG_(pushreal)(L, THStorage_(get)(tensor->storage, tensor->storageOffset+z*tensor->stride[0]));
+        } else {
+          THTensor_(select)(tensor, NULL, cdim, z);
+        }
+      }
+      else if (lua_istable(L, -1))
+      {
+        /* {start[,end]} range: narrow the current dimension.
+           A one-element table means a single-index narrow. */
+        long start = 0;
+        long end = tensor->size[cdim]-1;
+        lua_rawgeti(L, -1, 1);
+        if(lua_isnumber(L, -1)) {
+          start = lua_tonumber(L, -1)-1;
+          end = start;
+        }
+        lua_pop(L, 1);
+        if (start < 0) start = tensor->size[cdim] + start + 1;
+        THArgCheck((start >= 0) && (start < tensor->size[cdim]), 2, "start index out of bound");
+
+        lua_rawgeti(L, -1, 2);
+        if(lua_isnumber(L, -1)) {
+          end = lua_tonumber(L, -1)-1;
+        }
+        lua_pop(L, 2);
+        if (end < 0) end = tensor->size[cdim] + end + 1;
+        THArgCheck((end >= 0) && (end < tensor->size[cdim]), 2, "end index out of bound");
+
+        THArgCheck((end >= start), 2, "end index must be greater or equal to start index");
+
+        THTensor_(narrow)(tensor, NULL, cdim++, start, end-start+1);
+      }
+      else
+      {
+        break;
+      }
+    }
+    if(!done) {
+      /* still a tensor view: hand ownership to Lua */
+      luaT_pushudata(L, tensor, torch_Tensor);
+    } else {
+      /* collapsed to a scalar: the temporary view is no longer needed */
+      THTensor_(free)(tensor);
+    }
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
+  {
+    /* t[mask]: gather masked elements into a fresh 1-D tensor */
+    THTensor *vals = THTensor_(new)();
+    THTensor_(maskedSelect)(vals, tensor, mask);
+    luaT_pushudata(L, vals, torch_Tensor);
+    lua_pushboolean(L, 1);
+    return 2;
+  }
+  else
+  {
+    /* not handled here; let the method table resolve the key */
+    lua_pushboolean(L, 0);
+    return 1;
+  }
+}
+
+/* tensor:retain(): increment the reference count of the underlying
+   THTensor.  Returns nothing to Lua. */
+static int torch_Tensor_(retain)(lua_State *L)
+{
+  THTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor_(retain)(self);
+  return 0;
+}
+
+/* tensor:free(): decrement the reference count of the underlying
+   THTensor (releasing it when the count reaches zero).  Returns
+   nothing to Lua. */
+static int torch_Tensor_(free)(lua_State *L)
+{
+  THTensor *self = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor_(free)(self);
+  return 0;
+}
+
+/* helpful functions */
+/* Parse size (and optionally stride) arguments starting at stack slot
+   `index`.  Two accepted forms:
+     - a LongStorage size, optionally followed by a LongStorage stride
+       of the same length (both are retained before being returned);
+     - a flat list of numbers, capped at 8 dimensions.  When
+       `allowStride` is set the numbers alternate size,stride pairs;
+       otherwise they are all sizes.  Unspecified slots stay -1.
+   On return *size_ / *stride_ own one reference each (stride may be
+   NULL in the first form when no stride was given). */
+static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_)
+{
+  THLongStorage *size = NULL;
+  THLongStorage *stride = NULL;
+
+  if( (size = luaT_toudata(L, index, "torch.LongStorage")) )
+  {
+    if(!lua_isnoneornil(L, index+1))
+    {
+      if( (stride = luaT_toudata(L, index+1, "torch.LongStorage")) )
+        THArgCheck(stride->size == size->size, index+1, "provided stride and size are inconsistent");
+      else
+        THArgCheck(0, index+1, "torch.LongStorage expected");
+    }
+    /* retain: caller receives an owning reference */
+    THLongStorage_retain(size);
+    if(stride)
+      THLongStorage_retain(stride);
+  }
+  else
+  {
+    int i;
+
+    /* numeric form: at most 8 dimensions; -1 marks "not provided" */
+    size = THLongStorage_newWithSize(8);
+    stride = THLongStorage_newWithSize(8);
+    THLongStorage_fill(size, -1);
+    THLongStorage_fill(stride, -1);
+
+    if(allowStride)
+    {
+      /* arguments alternate: size0, stride0, size1, stride1, ... */
+      for(i = 0; i < 8; i++)
+      {
+        if(lua_isnone(L, index+2*i))
+          break;
+        size->data[i] = luaL_checklong(L, index+2*i);
+
+        if(lua_isnone(L, index+2*i+1))
+          break;
+        stride->data[i] = luaL_checklong(L, index+2*i+1);
+      }
+    }
+    else
+    {
+      /* arguments are sizes only */
+      for(i = 0; i < 8; i++)
+      {
+        if(lua_isnone(L, index+i))
+          break;
+        size->data[i] = luaL_checklong(L, index+i);
+      }
+    }
+  }
+
+  *size_ = size;
+  *stride_ = stride;
+}
+
+/* Parse the polymorphic constructor/set arguments starting at stack
+   slot `index`.  Depending on the allow* flags the argument may be:
+   nothing, a tensor (share its storage/geometry), a storage (with
+   optional 1-based offset plus size/stride), or plain size numbers /
+   a LongStorage.  Outputs: *storage_ (NOT retained here -- NOTE
+   (review): callers appear to retain it themselves; confirm),
+   *storageOffset_ (0-based), and newly allocated *size_ / *stride_
+   where applicable.  Raises a Lua argument error on mismatch. */
+static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
+                                                         THStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_)
+{
+  THTensor *src = NULL;
+  THStorage *storage = NULL;
+
+  int arg1Type = lua_type(L, index);
+
+  if( allowNone && (arg1Type == LUA_TNONE) )
+  {
+    /* no argument: empty tensor */
+    *storage_ = NULL;
+    *storageOffset_ = 0;
+    *size_ = NULL;
+    *stride_ = NULL;
+    return;
+  }
+  else if( allowTensor && (arg1Type == LUA_TUSERDATA) && (src = luaT_toudata(L, index, torch_Tensor)) )
+  {
+    /* tensor argument: alias its storage and copy its geometry */
+    *storage_ = src->storage;
+    *storageOffset_ = src->storageOffset;
+    *size_ = THTensor_(newSizeOf)(src);
+    *stride_ = THTensor_(newStrideOf)(src);
+    return;
+  }
+  else if( allowStorage && (arg1Type == LUA_TUSERDATA) && (storage = luaT_toudata(L, index, torch_Storage)) )
+  {
+    *storage_ = storage;
+    if(lua_isnone(L, index+1))
+    {
+      /* bare storage: view it as a 1-D tensor covering everything */
+      *storageOffset_ = 0;
+      *size_ = THLongStorage_newWithSize1(storage->size);
+      *stride_ = THLongStorage_newWithSize1(1);
+    }
+    else
+    {
+      /* storage + offset (1-based in Lua) + size/stride spec */
+      *storageOffset_ = luaL_checklong(L, index+1)-1;
+      torch_Tensor_(c_readSizeStride)(L, index+2, allowStride, size_, stride_);
+    }
+    return;
+  }
+  else if( (arg1Type == LUA_TNUMBER) || (luaT_toudata(L, index, "torch.LongStorage")) )
+  {
+    /* size-only form: fresh storage will be allocated by the caller */
+    *storage_ = NULL;
+    *storageOffset_ = 0;
+    torch_Tensor_(c_readSizeStride)(L, index, 0, size_, stride_);
+
+    return;
+  }
+
+  /* nothing matched: raise an error describing what was acceptable */
+  *storage_ = NULL;
+  *storageOffset_ = 0;
+  if(allowTensor && allowStorage)
+      THArgCheck(0, index, "expecting number or " torch_Tensor " or " torch_Storage );
+  else if(allowTensor)
+      THArgCheck(0, index, "expecting number or " torch_Tensor );
+  else if(allowStorage)
+      THArgCheck(0, index, "expecting number or " torch_Storage );
+  else
+      THArgCheck(0, index, "expecting number");
+}
+
+/* tensor:apply(f): call f(element) for every element in place.  If f
+   returns a number the element is overwritten with it; if f returns
+   nil the element is left unchanged; anything else is an error.
+   Returns the tensor itself. */
+static int torch_Tensor_(apply)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  luaL_checktype(L, 2, LUA_TFUNCTION);
+  lua_settop(L, 2);
+
+  /* TH_TENSOR_APPLY iterates every element; the function result lands
+     at stack slot 3 on each call */
+  TH_TENSOR_APPLY(real, tensor,
+                  lua_pushvalue(L, 2);
+                  luaG_(pushreal)(L, *tensor_data);
+                  lua_call(L, 1, 1);
+                  if(lua_isnumber(L, 3))
+                  {
+                    *tensor_data = (real)lua_tonumber(L, 3);
+                    lua_pop(L, 1);
+                  }
+                  else if(lua_isnil(L, 3))
+                    lua_pop(L, 1);
+                  else
+                    THError("given function should return a number or nil"););
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* tensor:map(src, f): for each position call f(tensor_value,
+   src_value); a numeric return overwrites tensor's element, nil
+   leaves it unchanged, anything else raises.  Returns the tensor. */
+static int torch_Tensor_(map)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *src = luaT_checkudata(L, 2, torch_Tensor);
+  luaL_checktype(L, 3, LUA_TFUNCTION);
+  lua_settop(L, 3);
+
+  /* result of each call lands at stack slot 4 */
+  TH_TENSOR_APPLY2(real, tensor, real, src,
+                  lua_pushvalue(L, 3);
+                  luaG_(pushreal)(L, *tensor_data);
+                  luaG_(pushreal)(L, *src_data);
+                  lua_call(L, 2, 1);
+                  if(lua_isnumber(L, 4))
+                  {
+                    *tensor_data = (real)lua_tonumber(L, 4);
+                    lua_pop(L, 1);
+                  }
+                  else if(lua_isnil(L, 4))
+                    lua_pop(L, 1);
+                  else
+                    THError("given function should return a number or nil"););
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* tensor:map2(src1, src2, f): for each position call f(tensor_value,
+   src1_value, src2_value); a numeric return overwrites tensor's
+   element, nil leaves it unchanged, anything else raises.  Returns
+   the tensor. */
+static int torch_Tensor_(map2)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *src1 = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *src2 = luaT_checkudata(L, 3, torch_Tensor);
+  luaL_checktype(L, 4, LUA_TFUNCTION);
+  lua_settop(L, 4);
+
+  /* result of each call lands at stack slot 5 */
+  TH_TENSOR_APPLY3(real, tensor, real, src1, real, src2,
+                  lua_pushvalue(L, 4);
+                  luaG_(pushreal)(L, *tensor_data);
+                  luaG_(pushreal)(L, *src1_data);
+                  luaG_(pushreal)(L, *src2_data);
+                  lua_call(L, 3, 1);
+                  if(lua_isnumber(L, 5))
+                  {
+                    *tensor_data = (real)lua_tonumber(L, 5);
+                    lua_pop(L, 1);
+                  }
+                  else if(lua_isnil(L, 5))
+                    lua_pop(L, 1);
+                  else
+                    THError("given function should return a number or nil"););
+
+  lua_settop(L, 1);
+  return 1;
+}
+
+/* Factory used by luaT's metatable machinery (e.g. deserialization)
+   to create an empty tensor of this type. */
+static int torch_Tensor_(factory)(lua_State *L)
+{
+  luaT_pushudata(L, THTensor_(new)(), torch_Tensor);
+  return 1;
+}
+
+/* tensor:write(file): serialize the tensor geometry (nDimension,
+   sizes, strides, 1-based storage offset) followed by the storage
+   written via the file's writeObject method (nil when no storage). */
+static int torch_Tensor_(write)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  THFile_writeIntScalar(file, tensor->nDimension);
+  THFile_writeLongRaw(file, tensor->size, tensor->nDimension);
+  THFile_writeLongRaw(file, tensor->stride, tensor->nDimension);
+  THFile_writeLongScalar(file, tensor->storageOffset+1); /* to respect Lua convention */
+
+  lua_getfield(L, 2, "writeObject"); /* the method */
+  lua_pushvalue(L, 2); /* the file */
+  /* the storage */
+  if(tensor->storage)
+  {
+    /* retain before pushing: Lua takes ownership of this reference */
+    THStorage_(retain)(tensor->storage);
+    luaT_pushudata(L, tensor->storage, torch_Storage);
+  }
+  else
+    lua_pushnil(L);
+
+  lua_call(L, 2, 0); /* call the method */
+
+  return 0;
+}
+
+/* tensor:read(file): inverse of write().  Fills in the geometry and
+   reads the storage via the file's readObject method.
+   NOTE(review): size/stride are assigned without freeing previous
+   buffers -- assumes `tensor` is freshly produced by the factory
+   (size/stride NULL); confirm no caller reads into a live tensor. */
+static int torch_Tensor_(read)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THFile *file = luaT_checkudata(L, 2, "torch.File");
+
+  tensor->nDimension = THFile_readIntScalar(file);
+  tensor->size = THAlloc(sizeof(long)*tensor->nDimension);
+  tensor->stride = THAlloc(sizeof(long)*tensor->nDimension);
+  THFile_readLongRaw(file, tensor->size, tensor->nDimension);
+  THFile_readLongRaw(file, tensor->stride, tensor->nDimension);
+  tensor->storageOffset = THFile_readLongScalar(file);
+  tensor->storageOffset--;  /* to respect Lua convention */
+
+  lua_getfield(L, 2, "readObject"); /* the method */
+  lua_pushvalue(L, 2); /* the file */
+  lua_call(L, 1, 1); /* call the method */
+
+  /* the storage stays on the Lua stack; retain our own reference */
+  tensor->storage = luaT_toudata(L, -1, torch_Storage);
+  if(tensor->storage)
+    THStorage_(retain)(tensor->storage);
+
+  return 0;
+}
+
+/* Method table for torch.Tensor.  Fix: "contiguous" was registered
+   twice with the same function pointer; the redundant second entry is
+   removed (no behavioral change, the later entry simply overwrote the
+   earlier identical one). */
+static const struct luaL_Reg torch_Tensor_(_) [] = {
+  {"retain", torch_Tensor_(retain)},
+  {"free", torch_Tensor_(free)},
+  {"contiguous", torch_Tensor_(contiguous)},
+  {"size", torch_Tensor_(size)},
+  {"elementSize", torch_Tensor_(elementSize)},
+  {"__len__", torch_Tensor_(size)},
+  {"stride", torch_Tensor_(stride)},
+  {"dim", torch_Tensor_(nDimension)},
+  {"nDimension", torch_Tensor_(nDimension)},
+  {"set", torch_Tensor_(set)},
+  {"storage", torch_Tensor_(storage)},
+  {"storageOffset", torch_Tensor_(storageOffset)},
+  {"clone", torch_Tensor_(clone)},
+  {"resizeAs", torch_Tensor_(resizeAs)},
+  {"resize", torch_Tensor_(resize)},
+  {"narrow", torch_Tensor_(narrow)},
+  {"sub", torch_Tensor_(sub)},
+  {"select", torch_Tensor_(select)},
+  {"index", torch_Tensor_(indexSelect)},
+  {"indexCopy", torch_Tensor_(indexCopy)},
+  {"indexAdd", torch_Tensor_(indexAdd)},
+  {"indexFill", torch_Tensor_(indexFill)},
+  {"maskedSelect", torch_Tensor_(maskedSelect)},
+  {"maskedCopy", torch_Tensor_(maskedCopy)},
+  {"maskedFill", torch_Tensor_(maskedFill)},
+  {"transpose", torch_Tensor_(transpose)},
+  {"t", torch_Tensor_(t)},
+  {"unfold", torch_Tensor_(unfold)},
+  {"isContiguous", torch_Tensor_(isContiguous)},
+  {"isSameSizeAs", torch_Tensor_(isSameSizeAs)},
+  {"isSetTo", torch_Tensor_(isSetTo)},
+  {"isSize", torch_Tensor_(isSize)},
+  {"nElement", torch_Tensor_(nElement)},
+  {"copy", torch_Tensor_(copy)},
+  {"apply", torch_Tensor_(apply)},
+  {"map", torch_Tensor_(map)},
+  {"map2", torch_Tensor_(map2)},
+  {"read", torch_Tensor_(read)},
+  {"write", torch_Tensor_(write)},
+  {"__index__", torch_Tensor_(__index__)},
+  {"__newindex__", torch_Tensor_(__newindex__)},
+  {NULL, NULL}
+};
+
+/* Register the torch.Tensor metatable for this real type: constructor,
+   destructor, deserialization factory, and the method table above. */
+void torch_Tensor_(init)(lua_State *L)
+{
+  luaT_newmetatable(L, torch_Tensor, NULL,
+                    torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory));
+  luaT_setfuncs(L, torch_Tensor_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
new file mode 100644
index 0000000..22506c0
--- /dev/null
+++ b/generic/TensorOperator.c
@@ -0,0 +1,191 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TensorOperator.c"
+#else
+
+/* __add__ metamethod: tensor+tensor (element-wise), or tensor+number /
+   number+tensor (scalar add).  Always allocates and returns a fresh
+   result tensor; raises if neither operand is a tensor. */
+static int torch_TensorOperator_(__add__)(lua_State *L)
+{
+  THTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THTensor *r;
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two " torch_Tensor "s or one " torch_Tensor " and one number");
+  else
+  {
+    /* push first so Lua owns r even if a later call raises */
+    r = THTensor_(new)();
+    luaT_pushudata(L, r, torch_Tensor);
+    
+    if(!tensor1 && tensor2)
+    {
+      /* number + tensor */
+      THTensor_(resizeAs)(r, tensor2);
+      THTensor_(copy)(r, tensor2);
+      THTensor_(add)(r, r, luaL_checknumber(L, 1));
+    }
+    else if(tensor1 && !tensor2)
+    {
+      /* tensor + number */
+      THTensor_(resizeAs)(r, tensor1);
+      THTensor_(copy)(r, tensor1);
+      THTensor_(add)(r, r, luaL_checknumber(L, 2));
+    }
+    else
+    {
+      /* tensor + tensor */
+      THTensor_(resizeAs)(r, tensor1);
+      THTensor_(copy)(r, tensor1);
+      THTensor_(cadd)(r, r, 1, tensor2);
+    }
+  }
+  return 1;
+}
+
+/* __sub__ metamethod: tensor-tensor, tensor-number, or number-tensor.
+   Always allocates and returns a fresh result tensor; raises if
+   neither operand is a tensor. */
+static int torch_TensorOperator_(__sub__)(lua_State *L)
+{
+  THTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THTensor *r;
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two " torch_Tensor "s or one " torch_Tensor " and one number");
+  else
+  {
+    r = THTensor_(new)();
+    luaT_pushudata(L, r, torch_Tensor);
+    
+    if(!tensor1 && tensor2)
+    {
+      /* number - tensor: fill with the scalar, then subtract */
+      THTensor_(resizeAs)(r, tensor2);
+      THTensor_(fill)(r, luaL_checknumber(L, 1));
+      THTensor_(cadd)(r, r, -1, tensor2);
+    }
+    else if(tensor1 && !tensor2)
+    {
+      /* tensor - number */
+      THTensor_(resizeAs)(r, tensor1);
+      THTensor_(copy)(r, tensor1);
+      THTensor_(add)(r, r, -(real)luaL_checknumber(L, 2));
+    }
+    else
+    {
+      /* tensor - tensor */
+      THTensor_(resizeAs)(r, tensor1);
+      THTensor_(copy)(r, tensor1);
+      THTensor_(cadd)(r, r, -1, tensor2);
+    }
+  }
+  return 1;
+}
+
+/* __unm__ metamethod: unary minus, returns a fresh tensor equal to
+   the element-wise negation of the operand. */
+static int torch_TensorOperator_(__unm__)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *neg = THTensor_(new)();
+
+  /* push first so Lua owns the result, then fill it in */
+  luaT_pushudata(L, neg, torch_Tensor);
+  THTensor_(resizeAs)(neg, tensor);
+  THTensor_(copy)(neg, tensor);
+  THTensor_(mul)(neg, neg, -1);
+
+  return 1;
+}
+
+/* __mul__ metamethod: scalar*tensor / tensor*scalar (element-wise
+   scale), 1D*1D (dot product, returns a number), 2D*1D (matrix-vector)
+   and 2D*2D (matrix-matrix).  Other rank combinations raise. */
+static int torch_TensorOperator_(__mul__)(lua_State *L)
+{
+  THTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+  THTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+  THTensor *r;
+
+  if(!tensor1 && !tensor2)
+    luaL_error(L, "expecting two " torch_Tensor "s or one " torch_Tensor " and one number");
+  else
+  {
+    r = THTensor_(new)();
+    luaT_pushudata(L, r, torch_Tensor);
+    
+    if(!tensor1 && tensor2)
+    {
+      /* number * tensor */
+      THTensor_(resizeAs)(r, tensor2);
+      THTensor_(copy)(r, tensor2);
+      THTensor_(mul)(r, r, luaL_checknumber(L, 1));
+    }
+    else if(tensor1 && !tensor2)
+    {
+      /* tensor * number */
+      THTensor_(resizeAs)(r, tensor1);
+      THTensor_(copy)(r, tensor1);
+      THTensor_(mul)(r, r, luaL_checknumber(L, 2));
+    }
+    else
+    {
+      int dimt = tensor1->nDimension;
+      int dims = tensor2->nDimension;
+      
+      if(dimt == 1 && dims == 1)
+        /* r remains empty on the stack; the number on top is returned
+           and r is collected by the GC */
+        lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */
+      else if(dimt == 2 && dims == 1)
+      {
+        /* matrix * vector */
+        THTensor_(resize1d)(r, tensor1->size[0]);
+        THTensor_(zero)(r);
+        THTensor_(addmv)(r, 1, r, 1, tensor1, tensor2);
+      }
+      else if(dimt == 2 && dims == 2)
+      {
+        /* matrix * matrix */
+        THTensor_(resize2d)(r, tensor1->size[0], tensor2->size[1]);
+        THTensor_(zero)(r);
+        THTensor_(addmm)(r, 1, r, 1, tensor1, tensor2);
+      }
+      else
+        luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension); 
+    }
+  }
+  return 1;
+}
+
+/* __div__ metamethod: tensor/number only.  Implemented as a multiply
+   by the reciprocal (note: for integer `real` types this matches the
+   historical behavior, which differs from true integer division). */
+static int torch_TensorOperator_(__div__)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *r;
+
+  THArgCheck(lua_isnumber(L,2), 2, "number expected");
+
+  r = THTensor_(new)();
+  luaT_pushudata(L, r, torch_Tensor);
+
+  THTensor_(resizeAs)(r, tensor);
+  THTensor_(copy)(r, tensor);
+  THTensor_(mul)(r, r, 1/lua_tonumber(L, 2));
+
+  return 1;
+}
+
+/* __mod__ metamethod: element-wise remainder of tensor by a number,
+   returned as a fresh tensor. */
+static int torch_TensorOperator_(__mod__)(lua_State *L)
+{
+  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *r;
+
+  THArgCheck(lua_isnumber(L,2), 2, "number expected");
+
+  r = THTensor_(new)();
+  luaT_pushudata(L, r, torch_Tensor);
+
+  THTensor_(resizeAs)(r, tensor);
+  THTensor_(copy)(r, tensor);
+  THTensor_(remainder)(r, r, lua_tonumber(L, 2));
+
+  return 1;
+}
+
+/* Arithmetic metamethods registered onto the torch.Tensor metatable. */
+static const struct luaL_Reg torch_TensorOperator_(_) [] = {
+  {"__add__", torch_TensorOperator_(__add__)},
+  {"__sub__", torch_TensorOperator_(__sub__)},
+  {"__unm__", torch_TensorOperator_(__unm__)},
+  {"__mul__", torch_TensorOperator_(__mul__)},
+  {"__div__", torch_TensorOperator_(__div__)},
+  {"__mod__", torch_TensorOperator_(__mod__)},
+  {NULL, NULL}
+};
+
+/* Attach the operator metamethods to the (already created) tensor
+   metatable for this real type. */
+void torch_TensorOperator_(init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_setfuncs(L, torch_TensorOperator_(_), 0);
+  lua_pop(L, 1);
+}
+
+#endif
diff --git a/generic/luaG.h b/generic/luaG.h
new file mode 100644
index 0000000..fc1a3cc
--- /dev/null
+++ b/generic/luaG.h
@@ -0,0 +1,37 @@
+#if !defined(real) || !defined(TH_GENERIC_FILE)
+#error "luaG.h must not be included outside of a generic file."
+#endif
+
+#ifndef luaG_
+#define luaG_(NAME) TH_CONCAT_3(luaG_,Real,NAME)
+#endif
+
+/* Push an accreal onto the Lua stack using the representation that
+   best fits the current `real` type: a float on Lua < 5.3 or for
+   floating-point reals, otherwise a Lua 5.3 integer. */
+static void luaG_(pushreal)(lua_State *L, accreal n) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
+	lua_pushnumber(L, (lua_Number)n);
+#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+	lua_pushinteger(L, (lua_Integer)n);
+#else
+	#error "unhandled real type in luaG_pushreal"
+#endif
+}
+
+/* Check the argument at `idx` and return it as a `real`, using
+   luaL_checknumber for floating-point reals (or Lua < 5.3) and
+   luaL_checkinteger for integral reals on Lua 5.3+. */
+static real luaG_(checkreal)(lua_State *L, int idx) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
+	return (lua_Number)luaL_checknumber(L, idx);
+#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+	return (lua_Integer)luaL_checkinteger(L, idx);
+#else
+	#error "unhandled real type in luaG_checkreal"
+#endif
+}
+
+static real luaG_(optreal)(lua_State *L, int idx, real n) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503
+	return (lua_Number)luaL_optnumber(L, idx, (lua_Number)n);
+#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+	return (lua_Integer)luaL_optinteger(L, idx, (lua_Integer)n);
+#else
+	#error "unhandled real type in luaG_checkreal"
+#endif
+}
diff --git a/init.c b/init.c
new file mode 100644
index 0000000..08eedba
--- /dev/null
+++ b/init.c
@@ -0,0 +1,88 @@
+#include "general.h"
+#include "utils.h"
+
+extern void torch_utils_init(lua_State *L);
+extern void torch_random_init(lua_State *L);
+extern void torch_File_init(lua_State *L);
+extern void torch_DiskFile_init(lua_State *L);
+extern void torch_MemoryFile_init(lua_State *L);
+extern void torch_PipeFile_init(lua_State *L);
+extern void torch_Timer_init(lua_State *L);
+
+extern void torch_ByteStorage_init(lua_State *L);
+extern void torch_CharStorage_init(lua_State *L);
+extern void torch_ShortStorage_init(lua_State *L);
+extern void torch_IntStorage_init(lua_State *L);
+extern void torch_LongStorage_init(lua_State *L);
+extern void torch_FloatStorage_init(lua_State *L);
+extern void torch_DoubleStorage_init(lua_State *L);
+
+extern void torch_ByteTensor_init(lua_State *L);
+extern void torch_CharTensor_init(lua_State *L);
+extern void torch_ShortTensor_init(lua_State *L);
+extern void torch_IntTensor_init(lua_State *L);
+extern void torch_LongTensor_init(lua_State *L);
+extern void torch_FloatTensor_init(lua_State *L);
+extern void torch_DoubleTensor_init(lua_State *L);
+
+extern void torch_ByteTensorOperator_init(lua_State *L);
+extern void torch_CharTensorOperator_init(lua_State *L);
+extern void torch_ShortTensorOperator_init(lua_State *L);
+extern void torch_IntTensorOperator_init(lua_State *L);
+extern void torch_LongTensorOperator_init(lua_State *L);
+extern void torch_FloatTensorOperator_init(lua_State *L);
+extern void torch_DoubleTensorOperator_init(lua_State *L);
+
+extern void torch_TensorMath_init(lua_State *L);
+
+LUA_EXTERNC DLL_EXPORT int luaopen_libtorch(lua_State *L);
+
+/* Entry point called by `require "libtorch"`.  Creates the global
+   `torch` table and registers every storage/tensor type, the operator
+   metamethods, the file classes, the math bindings and the RNG.
+   Returns the torch table. */
+int luaopen_libtorch(lua_State *L)
+{
+
+  /* the torch namespace table is both returned and set as a global */
+  lua_newtable(L);
+  lua_pushvalue(L, -1);
+  lua_setglobal(L, "torch");
+
+  torch_utils_init(L);
+
+  torch_File_init(L);
+
+  torch_ByteStorage_init(L);
+  torch_CharStorage_init(L);
+  torch_ShortStorage_init(L);
+  torch_IntStorage_init(L);
+  torch_LongStorage_init(L);
+  torch_FloatStorage_init(L);
+  torch_DoubleStorage_init(L);
+
+  /* tensors must come after their storages */
+  torch_ByteTensor_init(L);
+  torch_CharTensor_init(L);
+  torch_ShortTensor_init(L);
+  torch_IntTensor_init(L);
+  torch_LongTensor_init(L);
+  torch_FloatTensor_init(L);
+  torch_DoubleTensor_init(L);
+
+  /* operator metamethods attach to the metatables created above */
+  torch_ByteTensorOperator_init(L);
+  torch_CharTensorOperator_init(L);
+  torch_ShortTensorOperator_init(L);
+  torch_IntTensorOperator_init(L);
+  torch_LongTensorOperator_init(L);
+  torch_FloatTensorOperator_init(L);
+  torch_DoubleTensorOperator_init(L);
+
+  torch_Timer_init(L);
+  torch_DiskFile_init(L);
+  torch_PipeFile_init(L);
+  torch_MemoryFile_init(L);
+
+  torch_TensorMath_init(L);
+
+  torch_random_init(L);
+
+  // Create 'torch.Allocator' type.
+  luaT_newmetatable(L, "torch.Allocator", NULL, NULL, NULL, NULL);
+
+  return 1;
+}
diff --git a/init.lua b/init.lua
new file mode 100644
index 0000000..d2eccc1
--- /dev/null
+++ b/init.lua
@@ -0,0 +1,189 @@
+-- We are using paths.require to appease mkl
+
+-- Make this work with LuaJIT in Lua 5.2 compatibility mode, which
+-- renames string.gfind (already deprecated in 5.1)
+if not string.gfind then
+   string.gfind = string.gmatch
+end
+if not table.unpack then
+   table.unpack = unpack
+end
+
+require "paths"
+paths.require "libtorch"
+
+-- Keep track of all thread local variables torch.
+-- if a Lua VM is passed to another thread thread local
+-- variables need to be updated.
+-- Refresh per-thread state after a Lua VM has been handed to a
+-- different OS thread: reinstall error handlers and re-apply the heap
+-- tracking flag.
+function torch.updatethreadlocals()
+   torch.updateerrorhandlers()
+   -- `or false` normalizes a never-set (nil) flag to false
+   torch.setheaptracking(torch._heaptracking or false)
+end
+
+--- package stuff
+--- package stuff
+-- Return the directory containing `name`'s Lua files, found by probing
+-- package.path.  With no argument, returns torch's own directory.
+-- Returns nil when the package cannot be located.
+function torch.packageLuaPath(name)
+   if not name then
+      -- default: locate the torch package itself, strip the filename
+      local ret = string.match(torch.packageLuaPath('torch'), '(.*)/')
+      if not ret then --windows?
+         ret = string.match(torch.packageLuaPath('torch'), '(.*)\\')
+      end
+      return ret
+   end
+   -- try each package.path template with `name` substituted for '?'
+   for path in string.gmatch(package.path, "[^;]+") do
+      path = string.gsub(path, "%?", name)
+      local f = io.open(path)
+      if f then
+         f:close()
+         local ret = string.match(path, "(.*)/")
+         if not ret then --windows?
+            ret = string.match(path, "(.*)\\")
+         end
+         return ret
+      end
+   end
+end
+
+-- Global include(): run `file` relative to the caller's directory
+-- (depth offsets the stack level passed to paths.dofile).
+local function include(file, depth)
+   paths.dofile(file, 3 + (depth or 0))
+end
+rawset(_G, 'include', include)
+
+-- Run `file` from the installed Lua directory of `package`.
+function torch.include(package, file)
+   local dir = torch.packageLuaPath(package)
+   dofile(dir .. '/' .. file)
+end
+
+-- Declare a new torch class.  Accepted signatures:
+--   torch.class(name)
+--   torch.class(name, parentName)
+--   torch.class(name, module)
+--   torch.class(name, parentName, module)
+-- Returns the new metatable and, when a parent was given, the parent
+-- metatable.  Instances are created by calling the class name; __init
+-- is invoked when defined.
+function torch.class(...)
+   local tname, parenttname, module
+   -- dispatch on argument count and types
+   if select('#', ...) == 3
+      and type(select(1, ...)) == 'string'
+      and type(select(2, ...)) == 'string'
+      and type(select(3, ...)) == 'table'
+   then
+      tname = select(1, ...)
+      parenttname = select(2, ...)
+      module = select(3, ...)
+   elseif select('#', ...) == 2
+      and type(select(1, ...)) == 'string'
+      and type(select(2, ...)) == 'string'
+   then
+      tname = select(1, ...)
+      parenttname = select(2, ...)
+   elseif select('#', ...) == 2
+      and type(select(1, ...)) == 'string'
+      and type(select(2, ...)) == 'table'
+   then
+      tname = select(1, ...)
+      module = select(2, ...)
+   elseif select('#', ...) == 1
+      and type(select(1, ...)) == 'string'
+   then
+      tname = select(1, ...)
+   else
+      error('<class name> [<parent class name>] [<module table>] expected')
+   end
+
+   -- called when the class name is invoked: builds and initializes
+   local function constructor(...)
+      local self = {}
+      torch.setmetatable(self, tname)
+      if self.__init then
+         self:__init(...)
+      end
+      return self
+   end
+
+   -- used by deserialization: builds an instance without __init
+   local function factory()
+      local self = {}
+      torch.setmetatable(self, tname)
+      return self
+   end
+
+   local mt = torch.newmetatable(tname, parenttname, constructor, nil, factory, module)
+   local mpt
+   if parenttname then
+      mpt = torch.getmetatable(parenttname)
+   end
+   return mt, mpt
+end
+
+-- Set the default tensor type (e.g. 'torch.FloatTensor'): rebinds
+-- torch.Tensor and torch.Storage to the matching constructor tables.
+-- Raises when `typename` does not name a torch type.
+-- Fix: cache the getconstructortable result instead of looking the
+-- same type up twice.
+function torch.setdefaulttensortype(typename)
+   assert(type(typename) == 'string', 'string expected')
+   local ctor = torch.getconstructortable(typename)
+   if ctor then
+      torch.Tensor = ctor
+      torch.Storage = torch.getconstructortable(torch.typename(torch.Tensor(1):storage()))
+   else
+      error(string.format("<%s> is not a string describing a torch object", typename))
+   end
+end
+
+-- Extended type(): the torch class name for torch objects, otherwise
+-- the plain Lua type.
+function torch.type(obj)
+   -- torch.typename returns nil for non-torch values
+   return torch.typename(obj) or type(obj)
+end
+
+--[[ See if a given object is an instance of the provided torch class. ]]
+function torch.isTypeOf(obj, typeSpec)
+   -- typeSpec can be provided as either a string, pattern, or the constructor.
+   -- If the constructor is used, we look in the __typename field of the
+   -- metatable to find a string to compare to.
+   if type(typeSpec) ~= 'string' then
+      typeSpec = getmetatable(typeSpec).__typename
+	  assert(type(typeSpec) == 'string',
+             "type must be provided as [regexp] string, or factory")
+   end
+
+   local mt = getmetatable(obj)
+   while mt do
+      if type(mt) == 'table' and mt.__typename then
+         local match = mt.__typename:match(typeSpec)
+         -- Require full match for non-pattern specs
+         if match and (match ~= typeSpec or match == mt.__typename) then
+            return true
+         end
+      end
+      mt = getmetatable(mt)
+   end
+   return false
+end
+
+torch.setdefaulttensortype('torch.DoubleTensor')
+
+require('torch.Tensor')
+require('torch.File')
+require('torch.CmdLine')
+require('torch.FFI')
+require('torch.Tester')
+require('torch.TestSuite')
+require('torch.test')
+
+-- Convert a Tensor or Storage into a plain (nested) Lua table; raises
+-- for any other argument.
+function torch.totable(obj)
+   if not (torch.isTensor(obj) or torch.isStorage(obj)) then
+      error("obj must be a Storage or a Tensor")
+   end
+   return obj:totable()
+end
+
+-- True when obj is any torch tensor variant (Byte, Float, Cuda, ...),
+-- judged by its torch type name matching 'torch.*Tensor'.
+function torch.isTensor(obj)
+   local tname = torch.typename(obj)
+   return tname ~= nil and tname:find('torch.*Tensor') ~= nil
+end
+
+-- True when obj is any torch storage variant, judged by its torch
+-- type name matching 'torch.*Storage'.
+function torch.isStorage(obj)
+   local tname = torch.typename(obj)
+   return tname ~= nil and tname:find('torch.*Storage') ~= nil
+end
+-- alias for convenience
+torch.Tensor.isTensor = torch.isTensor
+
+return torch
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..7d4f3c4
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1,7 @@
+SET(TH_INSTALL_BIN_SUBDIR "${Torch_INSTALL_BIN_SUBDIR}")
+SET(TH_INSTALL_LIB_SUBDIR "${Torch_INSTALL_LIB_SUBDIR}")
+SET(TH_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}")
+SET(TH_INSTALL_CMAKE_SUBDIR "${Torch_INSTALL_CMAKE_SUBDIR}")
+
+ADD_SUBDIRECTORY(TH)
+ADD_SUBDIRECTORY(luaT)
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
new file mode 100644
index 0000000..551ea50
--- /dev/null
+++ b/lib/TH/CMakeLists.txt
@@ -0,0 +1,370 @@
+cmake_minimum_required(VERSION 2.6)
+
+# avoid some cmake warnings
+IF(POLICY CMP0026)
+ CMAKE_POLICY(SET CMP0026 OLD)
+ENDIF()
+
+SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# Can be compiled standalone
+IF(NOT TH_INSTALL_BIN_SUBDIR
+    OR NOT TH_INSTALL_LIB_SUBDIR
+    OR NOT TH_INSTALL_INCLUDE_SUBDIR
+    OR NOT TH_INSTALL_CMAKE_SUBDIR)
+
+  SET(TH_INSTALL_BIN_SUBDIR "bin" CACHE PATH "TH install binary subdirectory")
+  SET(TH_INSTALL_LIB_SUBDIR "lib" CACHE PATH "TH install library subdirectory")
+  SET(TH_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "TH install include subdirectory")
+  SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory")
+ENDIF()
+
+# flags
+
+IF(MSVC)
+  # respect the standard
+  ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ENDIF(MSVC)
+
+# OpenMP support?
+SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
+IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
+  EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
+  STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
+  MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
+  IF (DARWIN_VERSION GREATER 9)
+    SET(APPLE_OPENMP_SUCKS 1)
+  ENDIF (DARWIN_VERSION GREATER 9)
+  EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
+    OUTPUT_VARIABLE GCC_VERSION)
+  IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
+    MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
+    MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
+    SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
+  ENDIF ()
+ENDIF ()
+
+IF (WITH_OPENMP)
+  FIND_PACKAGE(OpenMP)
+  IF(OPENMP_FOUND)
+    MESSAGE(STATUS "Compiling with OpenMP support")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+  ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+# ARM specific flags
+FIND_PACKAGE(ARM)
+IF (NEON_FOUND)
+  MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
+  SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
+ENDIF (NEON_FOUND)
+IF (CORTEXA8_FOUND)
+  MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
+  SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
+ENDIF (CORTEXA8_FOUND)
+IF (CORTEXA9_FOUND)
+  MESSAGE(STATUS "Cortex-A9 Found with compiler flag : -mcpu=cortex-a9")
+  SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}")
+ENDIF (CORTEXA9_FOUND)
+
+IF(UNIX)
+  INCLUDE(CheckFunctionExists)
+  SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
+  CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
+  IF(HAVE_MMAP)
+    ADD_DEFINITIONS(-DHAVE_MMAP=1)
+  ENDIF(HAVE_MMAP)
+  ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64)
+  CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN)
+  IF(HAVE_SHM_OPEN)
+    ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1)
+  ENDIF(HAVE_SHM_OPEN)
+  CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK)
+  IF(HAVE_SHM_UNLINK)
+    ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1)
+  ENDIF(HAVE_SHM_UNLINK)
+  CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
+  IF(HAVE_MALLOC_USABLE_SIZE)
+    ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1)
+  ENDIF(HAVE_MALLOC_USABLE_SIZE)
+ENDIF(UNIX)
+
+FIND_PACKAGE(SSE)
+IF(C_SSE2_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE2_FOUND)
+IF(C_SSE3_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE3_FOUND)
+
+IF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+  SET(simd generic/simd/convolve.c)
+  SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "-std=c99")
+ENDIF(C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND)
+
+IF(C_SSE4_1_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE4_1_FOUND)
+IF(C_SSE4_2_FOUND)
+  SET(CMAKE_C_FLAGS "${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}")
+ENDIF(C_SSE4_2_FOUND)
+
+IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
+  SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -std=c99")
+  SET(simd ${simd} generic/simd/convolve5x5_sse.c)
+ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND)
+
+IF(C_AVX_FOUND)
+  SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}")
+  SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -mavx -std=c99")
+  SET(simd ${simd} generic/simd/convolve5x5_avx.c)
+ENDIF(C_AVX_FOUND)
+
+SET(hdr
+  THGeneral.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h
+  THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h)
+
+SET(src
+  THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
+  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c)
+
+SET(src ${src} ${hdr} ${simd})
+ADD_LIBRARY(TH SHARED ${src})
+if(BUILD_STATIC)
+  ADD_LIBRARY(TH_static STATIC ${src})
+endif()
+
+CHECK_C_SOURCE_RUNS("
+#include <stdatomic.h>
+int main()
+{
+  int a;
+  int oa;
+  atomic_store(&a, 1);
+  atomic_fetch_add(&a, 1);
+  oa = atomic_load(&a);
+  if(!atomic_compare_exchange_strong(&a, &oa, 3))
+    return -1;
+  return 0;
+}
+" HAS_C11_ATOMICS)
+
+IF(NOT HAS_C11_ATOMICS)
+  CHECK_C_SOURCE_RUNS("
+#include <intrin.h>
+int main()
+{
+  long a;
+  _InterlockedExchange(&a, 1);
+  _InterlockedExchangeAdd(&a, 1);
+  if(_InterlockedCompareExchange(&a, 3, 2) != 2)
+    return -1;
+  return 0;
+}
+" HAS_MSC_ATOMICS)
+
+  CHECK_C_SOURCE_RUNS("
+int main()
+{
+  int a;
+  __sync_lock_test_and_set(&a, 1);
+  __sync_fetch_and_add(&a, 1);
+  if(!__sync_bool_compare_and_swap(&a, 2, 3))
+    return -1;
+  return 0;
+}
+" HAS_GCC_ATOMICS)
+ENDIF()
+
+IF(HAS_C11_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1)
+  MESSAGE(STATUS "Atomics: using C11 intrinsics")
+ELSEIF(HAS_MSC_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_MSC_ATOMICS=1)
+  MESSAGE(STATUS "Atomics: using MSVC intrinsics")
+ELSEIF(HAS_GCC_ATOMICS)
+  ADD_DEFINITIONS(-DUSE_GCC_ATOMICS=1)
+    MESSAGE(STATUS "Atomics: using GCC intrinsics")
+ELSE()
+  SET(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+  FIND_PACKAGE(Threads)
+  IF(THREADS_FOUND)
+    ADD_DEFINITIONS(-DUSE_PTHREAD_ATOMICS=1)
+    TARGET_LINK_LIBRARIES(TH ${CMAKE_THREAD_LIBS_INIT})
+    MESSAGE(STATUS "Atomics: using pthread")
+  ENDIF()
+ENDIF()
+
+FIND_PACKAGE(BLAS)
+IF(BLAS_FOUND)
+  SET(USE_BLAS 1)
+  TARGET_LINK_LIBRARIES(TH ${BLAS_LIBRARIES})
+ENDIF(BLAS_FOUND)
+
+FIND_PACKAGE(LAPACK)
+IF(LAPACK_FOUND)
+  SET(USE_LAPACK 1)
+  TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES})
+ENDIF(LAPACK_FOUND)
+
+IF(BLAS_IS_ACCELERATE)
+  MESSAGE(STATUS "BLAS FOUND IS ACCELERATE: Fix for sdot")
+ENDIF()
+
+IF (UNIX AND NOT APPLE)
+   INCLUDE(CheckLibraryExists)
+   # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830
+   CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT)
+   IF(NEED_LIBRT)
+     TARGET_LINK_LIBRARIES(TH rt)
+   ENDIF(NEED_LIBRT)
+ENDIF(UNIX AND NOT APPLE)
+
+IF(NOT MSVC)
+  TARGET_LINK_LIBRARIES(TH m)
+ENDIF(NOT MSVC)
+
+SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+FOREACH(KEYWORD "inline" "__inline__" "__inline")
+  IF(NOT DEFINED C_INLINE)
+
+    SET(CMAKE_REQUIRED_FLAGS "-Dinline=${KEYWORD} ${CMAKE_C_FLAGS}")
+    CHECK_C_SOURCE_RUNS("
+       static inline int static_foo()
+       {
+         return 0;
+       }
+
+       int main(int argc, char *argv[])
+       {
+         static_foo();
+         return 0;
+       }" C_HAS_${KEYWORD})
+
+    IF(C_HAS_${KEYWORD})
+      SET(C_INLINE TRUE)
+# Right now i put it in THGeneral.h -- debatable
+#      ADD_DEFINITIONS("-Dinline=${KEYWORD}")
+      SET(TH_INLINE ${KEYWORD})
+      MESSAGE(STATUS "C inline is supported (${KEYWORD})")
+    ENDIF(C_HAS_${KEYWORD})
+  ENDIF(NOT DEFINED C_INLINE)
+ENDFOREACH(KEYWORD)
+SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+IF(NOT DEFINED C_INLINE)
+  MESSAGE(STATUS "C inline seems not supported")
+# Right now i put it in THGeneral.h -- debatable
+#  ADD_DEFINITIONS("-Dinline=")
+SET(TH_INLINE "")
+ENDIF(NOT DEFINED C_INLINE)
+
+# Is __thread supported?
+INCLUDE (CheckCSourceCompiles)
+CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD)
+IF(NOT DEFINED C_HAS_THREAD)
+  MESSAGE(STATUS "Warning: __thread is not supported, generating thread-unsafe code")
+ENDIF(NOT DEFINED C_HAS_THREAD)
+IF(C_HAS_THREAD)
+  SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTH_HAVE_THREAD")
+ENDIF(C_HAS_THREAD)
+
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}")
+CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h")
+
+INSTALL(TARGETS TH
+  EXPORT TH-exports
+  RUNTIME DESTINATION "${TH_INSTALL_BIN_SUBDIR}"
+  LIBRARY DESTINATION "${TH_INSTALL_LIB_SUBDIR}"
+  ARCHIVE DESTINATION "${TH_INSTALL_LIB_SUBDIR}")
+
+INSTALL(FILES
+  TH.h
+  THAllocator.h
+  THMath.h
+  THBlas.h
+  THDiskFile.h
+  THFile.h
+  THFilePrivate.h
+  ${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h
+  THGenerateAllTypes.h
+  THGenerateFloatTypes.h
+  THGenerateIntTypes.h
+  THLapack.h
+  THLogAdd.h
+  THMemoryFile.h
+  THRandom.h
+  THStorage.h
+  THTensor.h
+  THTensorApply.h
+  THTensorDimApply.h
+  THTensorMacros.h
+  THVector.h
+  THAtomic.h
+  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH")
+
+INSTALL(FILES
+  generic/THBlas.c
+  generic/THBlas.h
+  generic/THLapack.c
+  generic/THLapack.h
+  generic/THStorage.c
+  generic/THStorage.h
+  generic/THStorageCopy.c
+  generic/THStorageCopy.h
+  generic/THTensor.c
+  generic/THTensor.h
+  generic/THTensorConv.c
+  generic/THTensorConv.h
+  generic/THTensorCopy.c
+  generic/THTensorCopy.h
+  generic/THTensorLapack.c
+  generic/THTensorLapack.h
+  generic/THTensorMath.c
+  generic/THTensorMath.h
+  generic/THTensorRandom.c
+  generic/THTensorRandom.h
+  generic/THVector.c
+  DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/generic")
+
+
+IF (WIN32 AND NOT CYGWIN)
+  SET(BLAS_INSTALL_LIBRARIES "OFF"
+    CACHE BOOL "Copy the required BLAS DLLs into the TH install dirs")
+ENDIF (WIN32 AND NOT CYGWIN)
+
+# Install every DLL whose name starts with the given library's basename
+# (e.g. all "openblas*.dll" next to openblas.lib) into the TH bin dir.
+# Used on Windows to ship BLAS runtime dependencies alongside TH.
+# NOTE(review): CMake macros do not scope variables, so `libpath` set here
+# deliberately leaks to the caller and is reused by subsequent invocations.
+MACRO(Install_Required_Library ln)
+    get_filename_component(libpath ${ln} PATH)
+    get_filename_component(libname ${ln} NAME_WE)
+    file(GLOB libdlls "${libpath}/${libname}*.dll")
+    install(PROGRAMS ${libdlls}
+      DESTINATION "${TH_INSTALL_BIN_SUBDIR}")
+ENDMACRO(Install_Required_Library libname)
+
+IF (BLAS_FOUND AND BLAS_INSTALL_LIBRARIES)
+  IF (BLAS_goto2_LIBRARY)
+    Install_Required_Library(${BLAS_goto2_LIBRARY})
+    Install_Required_Library("${libpath}/libgfortran")
+    Install_Required_Library("${libpath}/libquadmath")
+    Install_Required_Library("${libpath}/libgcc")
+  ENDIF()
+  IF (BLAS_openblas_LIBRARY)
+    Install_Required_Library(${BLAS_openblas_LIBRARY})
+    Install_Required_Library("${libpath}/libquadmath")
+    Install_Required_Library("${libpath}/libgfortran")
+    Install_Required_Library("${libpath}/libquadmath")
+    Install_Required_Library("${libpath}/libgcc")
+  ENDIF()
+ENDIF()
+
+# Create THConfig.cmake
+GET_TARGET_PROPERTY(TH_OUTPUT_NAME TH LOCATION)
+GET_FILENAME_COMPONENT(TH_OUTPUT_NAME ${TH_OUTPUT_NAME} NAME)
+SET(TH_LIBRARIES "${CMAKE_INSTALL_PREFIX}/${TH_INSTALL_LIB_SUBDIR}/${TH_OUTPUT_NAME}")
+SET(TH_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${TH_INSTALL_INCLUDE_SUBDIR}/TH")
+CONFIGURE_FILE(THConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/THConfig.cmake")
+INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/THConfig.cmake"
+  DESTINATION "${TH_INSTALL_CMAKE_SUBDIR}")
diff --git a/lib/TH/TH.h b/lib/TH/TH.h
new file mode 100644
index 0000000..cdf331d
--- /dev/null
+++ b/lib/TH/TH.h
@@ -0,0 +1,24 @@
+#ifndef TH_INC
+#define TH_INC
+
+#include "THGeneral.h"
+
+#include "THBlas.h"
+#ifdef USE_LAPACK
+#include "THLapack.h"
+#endif
+
+#include "THAtomic.h"
+#include "THVector.h"
+#include "THLogAdd.h"
+#include "THRandom.h"
+#include "THStorage.h"
+#include "THTensor.h"
+#include "THTensorApply.h"
+#include "THTensorDimApply.h"
+
+#include "THFile.h"
+#include "THDiskFile.h"
+#include "THMemoryFile.h"
+
+#endif
diff --git a/lib/TH/THAllocator.c b/lib/TH/THAllocator.c
new file mode 100644
index 0000000..6992544
--- /dev/null
+++ b/lib/TH/THAllocator.c
@@ -0,0 +1,311 @@
+#include "THAllocator.h"
+
+/* stuff for mapped files */
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#if HAVE_MMAP
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+/* end of stuff for mapped files */
+
+/* Default allocator: thin wrappers around THAlloc/THRealloc/THFree.
+   The ctx argument is unused; it exists only to satisfy the THAllocator
+   function-pointer signatures. */
+static void *THDefaultAllocator_alloc(void* ctx, long size) {
+  return THAlloc(size);
+}
+
+static void *THDefaultAllocator_realloc(void* ctx, void* ptr, long size) {
+  return THRealloc(ptr, size);
+}
+
+static void THDefaultAllocator_free(void* ctx, void* ptr) {
+  THFree(ptr);
+}
+
+/* Exported vtable used wherever no custom allocator is supplied. */
+THAllocator THDefaultAllocator = {
+  &THDefaultAllocator_alloc,
+  &THDefaultAllocator_realloc,
+  &THDefaultAllocator_free
+};
+
+#if defined(_WIN32) || defined(HAVE_MMAP)
+
+/* Per-mapping state for the file-map allocator. */
+struct THMapAllocatorContext_ {
+  char *filename; /* file name */
+  int shared; /* is shared or not */
+  long size; /* mapped size */
+};
+
+/* Create a mapping context; makes a private copy of filename.
+   `shared` is 0 (private read-only map) or one of the TH_ALLOCATOR_MAPPED_*
+   constants.  size stays 0 until THMapAllocator_alloc records the actual
+   mapped size.  NOTE(review): no NULL checks here — assumes THAlloc raises
+   a THError on failure; confirm against THGeneral.c. */
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared)
+{
+  THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext));
+
+  ctx->filename = THAlloc(strlen(filename)+1);
+  strcpy(ctx->filename, filename);
+  ctx->shared = shared;
+  ctx->size = 0;
+
+  return ctx;
+}
+
+/* Size (bytes) of the mapping made with this context; 0 before alloc. */
+long THMapAllocatorContext_size(THMapAllocatorContext *ctx)
+{
+  return ctx->size;
+}
+
+/* Release the context and its owned filename copy. */
+void THMapAllocatorContext_free(THMapAllocatorContext *ctx)
+{
+  THFree(ctx->filename);
+  THFree(ctx);
+}
+
+/* Map the file described by ctx into memory and return the base address.
+   If size > 0 and the file is smaller, a shared mapping stretches the file
+   to `size`; a private mapping raises an error.  If size == 0, the whole
+   file is mapped.  The actual mapped size is recorded in ctx->size.
+   All failure paths raise via THError (which does not return). */
+static void *THMapAllocator_alloc(void* ctx_, long size)
+{
+  THMapAllocatorContext *ctx = ctx_;
+  void *data = NULL;
+
+#ifdef _WIN32
+  {
+    HANDLE hfile;
+    HANDLE hmfile;
+    DWORD size_hi, size_lo;
+    size_t hfilesz;
+
+    /* open file */
+    /* FILE_FLAG_RANDOM_ACCESS ? */
+    if(ctx->shared)
+    {
+      hfile = CreateFileA(ctx->filename, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0);
+      if (hfile == INVALID_HANDLE_VALUE)
+        THError("could not open file <%s> in read-write mode", ctx->filename);
+    }
+    else
+    {
+      hfile = CreateFileA(ctx->filename, GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
+      if (hfile == INVALID_HANDLE_VALUE)
+        THError("could not open file <%s> in read-only mode", ctx->filename);
+    }
+
+    /* assemble the 64-bit file size from the two 32-bit halves */
+    size_lo = GetFileSize(hfile, &size_hi);
+    if(sizeof(size_t) > 4)
+    {
+      hfilesz = ((size_t)size_hi) << 32;
+      hfilesz |= size_lo;
+    }
+    else
+      hfilesz = (size_t)(size_lo);
+
+    if(size > 0)
+    {
+      if(size > hfilesz)
+      {
+        if(ctx->shared)
+        {
+#if SIZEOF_SIZE_T > 4
+          size_hi = (DWORD)((size) >> 32);
+          size_lo = (DWORD)((size) & 0xFFFFFFFF);
+#else
+          size_hi = 0;
+          size_lo = (DWORD)(size);
+#endif
+          /* grow the file to the requested size before mapping */
+          if((SetFilePointer(hfile, size_lo, &size_hi, FILE_BEGIN)) == INVALID_SET_FILE_POINTER)
+          {
+            CloseHandle(hfile);
+            THError("unable to stretch file <%s> to the right size", ctx->filename);
+          }
+          if(SetEndOfFile(hfile) == 0)
+          {
+            CloseHandle(hfile);
+            THError("unable to write to file <%s>", ctx->filename);
+          }
+        }
+        else
+        {
+          CloseHandle(hfile);
+          THError("file <%s> size is smaller than the required mapping size <%ld>", ctx->filename, size);
+        }
+      }
+    }
+    else
+      size = hfilesz;
+
+    ctx->size = size; /* if we are here, it must be the right size */
+
+#if SIZEOF_SIZE_T > 4
+    size_hi = (DWORD)((ctx->size) >> 32);
+    size_lo = (DWORD)((ctx->size) & 0xFFFFFFFF);
+#else
+    size_hi = 0;
+    size_lo = (DWORD)(ctx->size);
+#endif
+
+    /* get map handle */
+    if(ctx->shared)
+    {
+      if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, size_hi, size_lo, NULL)) == NULL )
+        THError("could not create a map on file <%s>", ctx->filename);
+    }
+    else
+    {
+      if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_WRITECOPY, size_hi, size_lo, NULL)) == NULL )
+        THError("could not create a map on file <%s>", ctx->filename);
+    }
+
+    /* map the stuff */
+    if(ctx->shared)
+      data = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
+    else
+      data = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0);
+
+    /* the view remains valid after the handles are closed */
+    CloseHandle(hfile);
+    CloseHandle(hmfile);
+  }
+#else /* _WIN32 */
+  {
+    /* open file */
+    int fd;
+    long fdsz;
+
+    if(ctx->shared == TH_ALLOCATOR_MAPPED_SHARED)
+    {
+      if((fd = open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
+        THError("unable to open file <%s> in read-write mode", ctx->filename);
+    }
+    else if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+    {
+#ifdef HAVE_SHM_OPEN
+      if((fd = shm_open(ctx->filename, O_RDWR | O_CREAT, (mode_t)0600)) == -1)
+        THError("unable to open file <%s> in read-write mode", ctx->filename);
+#else
+      /* BUGFIX: the format string references <%s> but no argument was
+         passed, which is undefined behavior in a varargs call. */
+      THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform", ctx->filename);
+#endif
+    }
+    else
+    {
+      if((fd = open(ctx->filename, O_RDONLY)) == -1)
+        THError("unable to open file <%s> in read-only mode", ctx->filename);
+    }
+    if((fdsz = lseek(fd, 0, SEEK_END)) == -1)
+    {
+      close(fd);
+      THError("unable to seek at end of file <%s>", ctx->filename);
+    }
+    if(size > 0)
+    {
+      if(size > fdsz)
+      {
+        if(ctx->shared)
+        {
+          /* if it is shared mem, let's put it in correct size */
+          if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+          {
+            if(ftruncate(fd, size) == -1)
+              THError("unable to resize shared memory file <%s> to the right size", ctx->filename);
+          }
+          /* stretch a regular file by seeking past the end and writing one byte */
+          if((fdsz = lseek(fd, size-1, SEEK_SET)) == -1)
+          {
+            close(fd);
+            THError("unable to stretch file <%s> to the right size", ctx->filename);
+          }
+          if((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */
+          {
+            close(fd);
+            THError("unable to write to file <%s>", ctx->filename);
+          }
+        }
+        else
+        {
+          close(fd);
+          THError("file <%s> size is smaller than the required mapping size <%ld>", ctx->filename, size);
+        }
+      }
+    }
+    else
+      size = fdsz;
+
+    ctx->size = size; /* if we are here, it must be the right size */
+
+    /* map it */
+    if(ctx->shared)
+      data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    else
+      data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+
+    /* the mapping remains valid after the descriptor is closed */
+    if(close(fd) == -1)
+      THError("Error closing file <%s>", ctx->filename);
+
+    if(data == MAP_FAILED)
+    {
+      data = NULL; /* let's be sure it is NULL */
+      /* BUGFIX: ctx->size is long, so the conversion must be %ld (was %d,
+         a type-mismatched format specifier and thus undefined behavior). */
+      THError("$ Torch: unable to mmap memory: you tried to mmap %ldGB.", ctx->size/1073741824);
+    }
+  }
+#endif
+
+  return data;
+}
+
+/* Mapped memory cannot be resized in place; always raises. */
+static void *THMapAllocator_realloc(void* ctx, void* ptr, long size) {
+  THError("cannot realloc mapped data");
+  return NULL;
+}
+
+/* Unmap `data` and destroy the context.  For SHAREDMEM mappings the backing
+   shared-memory object is also unlinked (removed from the namespace). */
+static void THMapAllocator_free(void* ctx_, void* data) {
+  THMapAllocatorContext *ctx = ctx_;
+
+#ifdef _WIN32
+  if(!UnmapViewOfFile((LPINT)data))
+    THError("could not unmap the shared memory file")
+#else
+  if (munmap(data, ctx->size))
+    THError("could not unmap the shared memory file");
+  if (ctx->shared == TH_ALLOCATOR_MAPPED_SHAREDMEM)
+  {
+#ifdef HAVE_SHM_UNLINK
+    if (shm_unlink(ctx->filename) == -1)
+      THError("could not unlink the shared memory file %s", ctx->filename);
+#else
+    THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
+#endif
+  }
+#endif
+
+  THMapAllocatorContext_free(ctx);
+}
+
+#else
+
+/* Fallback stubs for platforms with neither Win32 file mapping nor mmap:
+   every entry point raises a THError. */
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared) {
+  THError("file mapping not supported on your system");
+  return NULL;
+}
+
+/* BUGFIX: this stub was missing even though THAllocator.h declares
+   THMapAllocatorContext_size, causing a link error on unsupported
+   platforms. */
+long THMapAllocatorContext_size(THMapAllocatorContext *ctx) {
+  THError("file mapping not supported on your system");
+  return 0;
+}
+
+void THMapAllocatorContext_free(THMapAllocatorContext *ctx) {
+  THError("file mapping not supported on your system");
+}
+
+static void *THMapAllocator_alloc(void* ctx_, long size) {
+  THError("file mapping not supported on your system");
+  return NULL;
+}
+
+static void *THMapAllocator_realloc(void* ctx, void* ptr, long size) {
+  THError("file mapping not supported on your system");
+  return NULL;
+}
+
+static void THMapAllocator_free(void* ctx, void* data) {
+  THError("file mapping not supported on your system");
+}
+
+#endif
+
+/* Exported vtable for file-backed allocation. */
+THAllocator THMapAllocator = {
+  &THMapAllocator_alloc,
+  &THMapAllocator_realloc,
+  &THMapAllocator_free
+};
diff --git a/lib/TH/THAllocator.h b/lib/TH/THAllocator.h
new file mode 100644
index 0000000..dbc75a8
--- /dev/null
+++ b/lib/TH/THAllocator.h
@@ -0,0 +1,31 @@
+#ifndef TH_ALLOCATOR_INC
+#define TH_ALLOCATOR_INC
+
+#include "THGeneral.h"
+
+#define TH_ALLOCATOR_MAPPED_SHARED 1
+#define TH_ALLOCATOR_MAPPED_SHAREDMEM 2
+
+/* Custom allocator
+ */
+typedef struct THAllocator {
+  void* (*malloc)(void*, long);
+  void* (*realloc)(void*, void*, long);
+  void (*free)(void*, void*);
+} THAllocator;
+
+/* default malloc/free allocator. malloc and realloc raise an error (using
+ * THError) on allocation failure.
+ */
+extern THAllocator THDefaultAllocator;
+
+/* file map allocator
+ */
+typedef struct THMapAllocatorContext_  THMapAllocatorContext;
+THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int shared);
+long THMapAllocatorContext_size(THMapAllocatorContext *ctx);
+void THMapAllocatorContext_free(THMapAllocatorContext *ctx);
+
+extern THAllocator THMapAllocator;
+
+#endif
diff --git a/lib/TH/THAtomic.c b/lib/TH/THAtomic.c
new file mode 100644
index 0000000..e04dcb3
--- /dev/null
+++ b/lib/TH/THAtomic.c
@@ -0,0 +1,177 @@
+#include "THAtomic.h"
+
+/*
+  Note: I thank Leon Bottou for his useful comments.
+  Ronan.
+*/
+
+#if defined(USE_C11_ATOMICS)
+#include <stdatomic.h>
+#endif
+
+#if defined(USE_MSC_ATOMICS)
+#include <intrin.h>
+#endif
+
+#if !defined(USE_MSC_ATOMICS) && !defined(USE_GCC_ATOMICS) && defined(USE_PTHREAD_ATOMICS)
+#include <pthread.h>
+static pthread_mutex_t ptm = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+/* Atomically store newvalue into *a.
+   NOTE(review): the C11 path passes a plain (non-_Atomic) int* to
+   atomic_store — nonconforming strictly speaking, relies on gcc/clang
+   accepting it; confirm against the compilers this configure path enables.
+   The final fallback spins on THAtomicCompareAndSwap (which itself may be
+   the unsafe single-threaded version). */
+void THAtomicSet(int volatile *a, int newvalue)
+{
+#if defined(USE_C11_ATOMICS)
+  atomic_store(a, newvalue);
+#elif defined(USE_MSC_ATOMICS)
+  _InterlockedExchange((long*)a, newvalue);
+#elif defined(USE_GCC_ATOMICS)
+  __sync_lock_test_and_set(a, newvalue);
+#else
+  int oldvalue;
+  do {
+    oldvalue = *a;
+  } while (!THAtomicCompareAndSwap(a, oldvalue, newvalue));
+#endif
+}
+
+/* Atomically read and return *a.  The non-C11 backends emulate an atomic
+   load with a CAS of the value against itself, retrying until it succeeds. */
+int THAtomicGet(int volatile *a)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_load(a);
+#else
+  int value;
+  do {
+    value = *a;
+  } while (!THAtomicCompareAndSwap(a, value, value));
+  return value;
+#endif
+}
+
+/* Atomically perform *a += value; returns the PREVIOUS value of *a
+   (fetch-and-add semantics on every backend).
+   NOTE(review): the MSVC path casts int* to long* — valid on Windows where
+   long is 32-bit, but an aliasing/size hazard elsewhere; this path is only
+   selected when the MSVC intrinsics probe succeeded. */
+int THAtomicAdd(int volatile *a, int value)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_fetch_add(a, value);
+#elif defined(USE_MSC_ATOMICS)
+  return _InterlockedExchangeAdd((long*)a, value);
+#elif defined(USE_GCC_ATOMICS)
+  return __sync_fetch_and_add(a, value);
+#else
+  int oldvalue;
+  do {
+    oldvalue = *a;
+  } while (!THAtomicCompareAndSwap(a, oldvalue, (oldvalue + value)));
+  return oldvalue;
+#endif
+}
+
+/* Atomically increment a reference count. */
+void THAtomicIncrementRef(int volatile *a)
+{
+  THAtomicAdd(a, 1);
+}
+
+/* Atomically decrement a reference count; returns 1 iff the count reached 0
+   (THAtomicAdd returns the previous value, so previous == 1 means now 0). */
+int THAtomicDecrementRef(int volatile *a)
+{
+  return (THAtomicAdd(a, -1) == 1);
+}
+
+/* Atomic compare-and-swap: if *a == oldvalue, set *a = newvalue and return 1,
+   else return 0.  The pthread backend serializes through a single global
+   mutex (ptm); the last fallback is NOT thread safe, as the #warning states. */
+int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_compare_exchange_strong(a, &oldvalue, newvalue);
+#elif defined(USE_MSC_ATOMICS)
+  return (_InterlockedCompareExchange((long*)a, (long)newvalue, (long)oldvalue) == (long)oldvalue);
+#elif defined(USE_GCC_ATOMICS)
+  return __sync_bool_compare_and_swap(a, oldvalue, newvalue);
+#elif defined(USE_PTHREAD_ATOMICS)
+  int ret = 0;
+  pthread_mutex_lock(&ptm);
+  if(*a == oldvalue) {
+    *a = newvalue;
+    ret = 1;
+  }
+  pthread_mutex_unlock(&ptm);
+  return ret;
+#else
+#warning THAtomic is not thread safe
+  if(*a == oldvalue) {
+    *a = newvalue;
+    return 1;
+  }
+  else
+    return 0;
+#endif
+}
+
+/* long-typed variants of the atomic operations above; same backend
+   selection and same semantics, operating on long instead of int. */
+
+/* Atomically store newvalue into *a. */
+void THAtomicSetLong(long volatile *a, long newvalue)
+{
+#if defined(USE_C11_ATOMICS)
+  atomic_store(a, newvalue);
+#elif defined(USE_MSC_ATOMICS)
+  _InterlockedExchange(a, newvalue);
+#elif defined(USE_GCC_ATOMICS)
+  __sync_lock_test_and_set(a, newvalue);
+#else
+  long oldvalue;
+  do {
+    oldvalue = *a;
+  } while (!THAtomicCompareAndSwapLong(a, oldvalue, newvalue));
+#endif
+}
+
+/* Atomically read and return *a (CAS-against-self outside C11). */
+long THAtomicGetLong(long volatile *a)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_load(a);
+#else
+  long value;
+  do {
+    value = *a;
+  } while (!THAtomicCompareAndSwapLong(a, value, value));
+  return value;
+#endif
+}
+
+/* Atomically perform *a += value; returns the PREVIOUS value of *a. */
+long THAtomicAddLong(long volatile *a, long value)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_fetch_add(a, value);
+#elif defined(USE_MSC_ATOMICS)
+  return _InterlockedExchangeAdd(a, value);
+#elif defined(USE_GCC_ATOMICS)
+  return __sync_fetch_and_add(a, value);
+#else
+  long oldvalue;
+  do {
+    oldvalue = *a;
+  } while (!THAtomicCompareAndSwapLong(a, oldvalue, (oldvalue + value)));
+  return oldvalue;
+#endif
+}
+
+/* Atomic compare-and-swap on a long: if *a == oldvalue, set *a = newvalue
+   and return 1, else return 0.  Shares the global mutex ptm with the int
+   version in the pthread backend; last fallback is not thread safe. */
+long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue)
+{
+#if defined(USE_C11_ATOMICS)
+  return atomic_compare_exchange_strong(a, &oldvalue, newvalue);
+#elif defined(USE_MSC_ATOMICS)
+  return (_InterlockedCompareExchange(a, newvalue, oldvalue) == oldvalue);
+#elif defined(USE_GCC_ATOMICS)
+  return __sync_bool_compare_and_swap(a, oldvalue, newvalue);
+#elif defined(USE_PTHREAD_ATOMICS)
+  long ret = 0;
+  pthread_mutex_lock(&ptm);
+  if(*a == oldvalue) {
+    *a = newvalue;
+    ret = 1;
+  }
+  pthread_mutex_unlock(&ptm);
+  return ret;
+#else
+#warning THAtomic is not thread safe
+  if(*a == oldvalue) {
+    *a = newvalue;
+    return 1;
+  }
+  else
+    return 0;
+#endif
+}
diff --git a/lib/TH/THAtomic.h b/lib/TH/THAtomic.h
new file mode 100644
index 0000000..3a37c31
--- /dev/null
+++ b/lib/TH/THAtomic.h
@@ -0,0 +1,89 @@
+#ifndef TH_ATOMIC_INC
+#define TH_ATOMIC_INC
+
+#include "THGeneral.h"
+
+/******************************************************************************
+ * Atomic operations for TH
+ *  Five backends are integrated:
+ *  - C11 atomic operations
+ *  - MSVC intrinsics
+ *  - GCC intrinsics
+ *  - Pthread if none of the above is available
+ *  - Unsafe mode in none of the above is available
+ ******************************************************************************/
+
+
+/******************************************************************************
+ * all-purpose functions
+ ******************************************************************************/
+
+/*
+ * *a = newvalue
+*/
+TH_API void THAtomicSet(int volatile *a, int newvalue);
+
+/*
+ * return *a
+*/
+TH_API int THAtomicGet(int volatile *a);
+
+/*
+ * *a += value,
+ * return previous *a
+*/
+TH_API int THAtomicAdd(int volatile *a, int value);
+
+/*
+ * check if (*a == oldvalue)
+ * if true: set *a to newvalue, return 1
+ * if false: return 0
+*/
+TH_API int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue);
+
+
+/******************************************************************************
+ * refcounting functions
+ ******************************************************************************/
+
+/*
+ * *a++
+*/
+TH_API void THAtomicIncrementRef(int volatile *a);
+
+/*
+ * *a--,
+ * return 1 if *a == 0 after the operation, 0 otherwise
+*/
+TH_API int THAtomicDecrementRef(int volatile *a);
+
+
+
+/******************************************************************************
+ * functions for long type
+ ******************************************************************************/
+
+/*
+ * *a = newvalue
+*/
+TH_API void THAtomicSetLong(long volatile *a, long newvalue);
+
+/*
+ * return *a
+*/
+TH_API long THAtomicGetLong(long volatile *a);
+
+/*
+ * *a += value,
+ * return previous *a
+*/
+TH_API long THAtomicAddLong(long volatile *a, long value);
+
+/*
+ * check if (*a == oldvalue)
+ * if true: set *a to newvalue, return 1
+ * if false: return 0
+*/
+TH_API long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue);
+
+#endif
diff --git a/lib/TH/THBlas.c b/lib/TH/THBlas.c
new file mode 100644
index 0000000..35618b2
--- /dev/null
+++ b/lib/TH/THBlas.c
@@ -0,0 +1,4 @@
+#include "THBlas.h"
+
+#include "generic/THBlas.c"
+#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THBlas.h b/lib/TH/THBlas.h
new file mode 100644
index 0000000..5fef0fe
--- /dev/null
+++ b/lib/TH/THBlas.h
@@ -0,0 +1,11 @@
+#ifndef TH_BLAS_INC
+#define TH_BLAS_INC
+
+#include "THGeneral.h"
+
+#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME)
+
+#include "generic/THBlas.h"
+#include "THGenerateAllTypes.h"
+
+#endif
diff --git a/lib/TH/THConfig.cmake.in b/lib/TH/THConfig.cmake.in
new file mode 100644
index 0000000..306cd87
--- /dev/null
+++ b/lib/TH/THConfig.cmake.in
@@ -0,0 +1,9 @@
+# Find the TH includes and library
+#
+# TH_INCLUDE_DIR -- where to find the includes
+# TH_LIBRARIES -- list of libraries to link against
+# TH_FOUND -- set to 1 if found
+
+SET(TH_FOUND 1)
+SET(TH_INCLUDE_DIR "@TH_INCLUDE_DIR@")
+SET(TH_LIBRARIES "@TH_LIBRARIES@")
diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c
new file mode 100644
index 0000000..dff9710
--- /dev/null
+++ b/lib/TH/THDiskFile.c
@@ -0,0 +1,771 @@
+#include "THGeneral.h"
+#include "THDiskFile.h"
+#include "THFilePrivate.h"
+
/* Concrete THFile implementation backed by a stdio stream.
   Also reused by the pipe-file variant (THPipeFile_new), which shares
   all methods except open/free. */
typedef struct THDiskFile__
{
    THFile file;          /* base struct; must stay first so THFile* casts work */

    FILE *handle;         /* underlying stdio stream; NULL once closed */
    char *name;           /* file path (or pipe command line); owned, freed in _free */
    int isNativeEncoding; /* 1 if on-disk byte order matches the CPU's */
    int longSize;         /* on-disk bytes per long: 0 = native, 4 or 8 */

} THDiskFile;
+
+static int THDiskFile_isOpened(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)self;
+  return (dfself->handle != NULL);
+}
+
+const char *THDiskFile_name(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)self;
+  return dfself->name;
+}
+
/* workaround mac osx lion ***insane*** fread bug */
/* On OS X Lion, fread could return short counts for very large requests.
   This wrapper loops, reading at most ~2GB worth of elements per call,
   until nitems elements are read or the stream hits EOF/error.
   Elsewhere it is just an alias for fread. */
#ifdef __APPLE__
size_t fread__(void *ptr, size_t size, size_t nitems, FILE *stream)
{
  size_t nread = 0;
  while(!feof(stream) && !ferror(stream) && (nread < nitems))
    nread += fread((char*)ptr+nread*size, size, THMin(2147483648/size, nitems-nread), stream);
  return nread;
}
#else
#define fread__ fread
#endif
+
/* Generates a THDiskFile_read<TYPEC> / THDiskFile_write<TYPEC> pair for a
 * scalar TYPE.  Binary mode does bulk fread__/fwrite and byte-reverses each
 * element when the file's encoding differs from the CPU's
 * (isNativeEncoding == 0).  ASCII mode runs ASCII_READ_ELEM /
 * ASCII_WRITE_ELEM once per element; with auto-spacing on, elements are
 * space-separated, the batch ends with a newline, and on read a single
 * trailing newline is consumed if present.  A short read/write sets
 * file.hasError and raises THError unless isQuiet. */
#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM) \
  static size_t THDiskFile_read##TYPEC(THFile *self, TYPE *data, size_t n)  \
  {                                                                     \
    THDiskFile *dfself = (THDiskFile*)(self);                           \
    size_t nread = 0L;                                                    \
                                                                        \
    THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \
    THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); \
                                                                        \
    if(dfself->file.isBinary)                                           \
    {                                                                   \
      nread = fread__(data, sizeof(TYPE), n, dfself->handle);           \
      if(!dfself->isNativeEncoding && (sizeof(TYPE) > 1) && (nread > 0)) \
        THDiskFile_reverseMemory(data, data, sizeof(TYPE), nread);      \
    }                                                                   \
    else                                                                \
    {                                                                   \
      size_t i;                                                           \
      for(i = 0; i < n; i++)                                            \
      {                                                                 \
        ASCII_READ_ELEM; /* increment here result and break if wrong */ \
      }                                                                 \
      if(dfself->file.isAutoSpacing && (n > 0))                         \
      {                                                                 \
        int c = fgetc(dfself->handle);                                  \
        if( (c != '\n') && (c != EOF) )                                 \
          ungetc(c, dfself->handle);                                    \
      }                                                                 \
    }                                                                   \
                                                                        \
    if(nread != n)                                                      \
    {                                                                   \
      dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */ \
      if(!dfself->file.isQuiet)                                         \
        THError("read error: read %d blocks instead of %d", nread, n);  \
    }                                                                   \
                                                                        \
    return nread;                                                       \
  }                                                                     \
                                                                        \
  static size_t THDiskFile_write##TYPEC(THFile *self, TYPE *data, size_t n) \
  {                                                                     \
    THDiskFile *dfself = (THDiskFile*)(self);                           \
    size_t nwrite = 0L;                                                   \
                                                                        \
    THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \
    THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); \
                                                                        \
    if(dfself->file.isBinary)                                           \
    {                                                                   \
      if(dfself->isNativeEncoding)                                      \
      {                                                                 \
        nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle);         \
      }                                                                 \
      else                                                              \
      {                                                                 \
        if(sizeof(TYPE) > 1)                                            \
        {                                                               \
          char *buffer = THAlloc(sizeof(TYPE)*n);                       \
          THDiskFile_reverseMemory(buffer, data, sizeof(TYPE), n);      \
          nwrite = fwrite(buffer, sizeof(TYPE), n, dfself->handle);     \
          THFree(buffer);                                               \
        }                                                               \
        else                                                            \
          nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle);       \
      }                                                                 \
    }                                                                   \
    else                                                                \
    {                                                                   \
      size_t i;                                                           \
      for(i = 0; i < n; i++)                                            \
      {                                                                 \
        ASCII_WRITE_ELEM;                                               \
        if( dfself->file.isAutoSpacing && (i < n-1) )                   \
          fprintf(dfself->handle, " ");                                 \
      }                                                                 \
      if(dfself->file.isAutoSpacing && (n > 0))                         \
        fprintf(dfself->handle, "\n");                                  \
    }                                                                   \
                                                                        \
    if(nwrite != n)                                                     \
    {                                                                   \
      dfself->file.hasError = 1;                                        \
      if(!dfself->file.isQuiet)                                         \
        THError("write error: wrote %d blocks instead of %d", nwrite, n); \
    }                                                                   \
                                                                        \
    return nwrite;                                                      \
}
+
/* Parse a mode string: "r" (read), "w" (write) or "rw" (both).
 * Sets the two output flags accordingly and returns 1; any other
 * string leaves both flags 0 and returns 0. */
static int THDiskFile_mode(const char *mode, int *isReadable, int *isWritable)
{
  size_t len = strlen(mode);

  *isReadable = 0;
  *isWritable = 0;

  if(len == 1 && mode[0] == 'r')
  {
    *isReadable = 1;
    return 1;
  }
  if(len == 1 && mode[0] == 'w')
  {
    *isWritable = 1;
    return 1;
  }
  if(len == 2 && mode[0] == 'r' && mode[1] == 'w')
  {
    *isReadable = 1;
    *isWritable = 1;
    return 1;
  }
  return 0;
}
+
+static void THDiskFile_synchronize(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  fflush(dfself->handle);
+}
+
/* Seek to an absolute byte offset from the start of the file (SEEK_SET).
 * On failure the error flag is set and THError is raised unless quiet. */
static void THDiskFile_seek(THFile *self, size_t position)
{
  THDiskFile *dfself = (THDiskFile*)(self);

  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");

#ifdef _WIN32
  /* Windows CRT lacks fseeko; fall back to the long-based fseek. */
  THArgCheck(position <= (size_t)LONG_MAX, 2, "position must be smaller than LONG_MAX");
  if(fseek(dfself->handle, (long)position, SEEK_SET) < 0)
#else
  /* fseeko takes off_t, allowing offsets beyond 2GB on 32-bit platforms. */
  THArgCheck(position <= (size_t)LLONG_MAX, 2, "position must be smaller than LLONG_MAX");
  if(fseeko(dfself->handle, (off_t)position, SEEK_SET) < 0)
#endif
  {
    dfself->file.hasError = 1;
    if(!dfself->file.isQuiet)
      THError("unable to seek to position %zu", position);
  }
}
+
+static void THDiskFile_seekEnd(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+
+  if(fseek(dfself->handle, 0L, SEEK_END) < 0)
+  {
+    dfself->file.hasError = 1;
+    if(!dfself->file.isQuiet)
+      THError("unable to seek at end of file");
+  }
+}
+
+static size_t THDiskFile_position(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+
+  long offset = ftell(dfself->handle);
+  if (offset > -1)
+      return (size_t)offset;
+  else if(!dfself->file.isQuiet)
+      THError("unable to obtain disk file offset (maybe a long overflow occured)");
+
+  return 0;
+}
+
+static void THDiskFile_close(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  fclose(dfself->handle);
+  dfself->handle = NULL;
+}
+
+/* Little and Big Endian */
+
/* Byte-swap numBlocks fixed-size blocks from src into dst (endianness
 * conversion).  src and dst may be the same buffer (in-place swap) or
 * disjoint; partially overlapping buffers are not supported.
 *
 * Fix: with an odd blockSize and dst != src, the middle byte of each block
 * was previously never written to dst.  It is now copied explicitly.
 * (All callers in this file use sizes 2, 4 or 8, so they were unaffected;
 * this makes the helper safe for any block size.) */
static void THDiskFile_reverseMemory(void *dst, const void *src, size_t blockSize, size_t numBlocks)
{
  if(blockSize > 1)
  {
    size_t halfBlockSize = blockSize/2;
    char *charSrc = (char*)src;
    char *charDst = (char*)dst;
    size_t b, i;
    for(b = 0; b < numBlocks; b++)
    {
      for(i = 0; i < halfBlockSize; i++)
      {
        char z = charSrc[i];
        charDst[i] = charSrc[blockSize-1-i];
        charDst[blockSize-1-i] = z;
      }
      if((blockSize & 1) && (charDst != charSrc))
        charDst[halfBlockSize] = charSrc[halfBlockSize]; /* odd size: middle byte */
      charSrc += blockSize;
      charDst += blockSize;
    }
  }
}
+
/* Runtime endianness probe: returns 1 on little-endian CPUs, 0 otherwise
 * (determined by inspecting the lowest-addressed byte of an int). */
int THDiskFile_isLittleEndianCPU(void)
{
  int probe = 7;
  unsigned char firstByte = *(unsigned char *)&probe;
  return firstByte != 0;
}
+
/* Complement of THDiskFile_isLittleEndianCPU. */
int THDiskFile_isBigEndianCPU(void)
{
  return THDiskFile_isLittleEndianCPU() ? 0 : 1;
}
+
+void THDiskFile_nativeEndianEncoding(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  dfself->isNativeEncoding = 1;
+}
+
+void THDiskFile_littleEndianEncoding(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  dfself->isNativeEncoding = THDiskFile_isLittleEndianCPU();
+}
+
+void THDiskFile_bigEndianEncoding(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  dfself->isNativeEncoding = !THDiskFile_isLittleEndianCPU();
+}
+
+/* End of Little and Big Endian Stuff */
+
+void THDiskFile_longSize(THFile *self, int size)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified");
+  dfself->longSize = size;
+}
+
+void THDiskFile_noBuffer(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  if (setvbuf(dfself->handle, NULL, _IONBF, 0)) {
+    THError("error: cannot disable buffer");
+  }
+}
+
+static void THDiskFile_free(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  if(dfself->handle)
+    fclose(dfself->handle);
+  THFree(dfself->name);
+  THFree(dfself);
+}
+
/* READ_WRITE_METHODS(int, Bool, */
/*                    int value = 0; int ret = fscanf(file->handle, "%d", &value); array[i] = (value ? 1 : 0); if(ret <= 0) break; else result++, */
/*                    int value = (array[i] ? 1 : 0); nElemWritten = fprintf(file->handle, "%d", value), */
/*                    true) */

/* Note that we do a trick */
/* Byte/Char: the ASCII "element" does one bulk fread/fwrite for the whole
   batch and then breaks out of the per-element loop — raw bytes have no
   textual representation, so ASCII and binary behave the same. */
READ_WRITE_METHODS(unsigned char, Byte,
                   nread = fread(data, 1, n, dfself->handle); break,
                   nwrite = fwrite(data, 1, n, dfself->handle); break)

READ_WRITE_METHODS(char, Char,
                   nread = fread(data, 1, n, dfself->handle); break,
                   nwrite = fwrite(data, 1, n, dfself->handle); break)

READ_WRITE_METHODS(short, Short,
                   int ret = fscanf(dfself->handle, "%hd", &data[i]); if(ret <= 0) break; else nread++,
                   int ret = fprintf(dfself->handle, "%hd", data[i]); if(ret <= 0) break; else nwrite++)

READ_WRITE_METHODS(int, Int,
                   int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++,
                   int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++)

/* Long is implemented by hand below because of the longSize handling. */
/*READ_WRITE_METHODS(long, Long,
                   int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++,
                   int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++)*/

/* Float/Double are printed with enough digits (%.9g / %.17g) to round-trip
   IEEE-754 single and double precision exactly through text. */
READ_WRITE_METHODS(float, Float,
                   int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++,
                   int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++)

READ_WRITE_METHODS(double, Double,
                   int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++,
                   int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++)
+
+
/* For Long we need to rewrite everything, because of the special management of longSize */
/* Read n longs into data.  Binary mode honors dfself->longSize:
 *   0/native — read sizeof(long) bytes per element;
 *   4        — read 32-bit values, widen them in place;
 *   8        — read 64-bit values, keep the significant 32-bit half
 *              when long is 4 bytes.
 * Non-native byte order is corrected with THDiskFile_reverseMemory.
 * ASCII mode scans with %ld.  A short read sets file.hasError and raises
 * THError unless isQuiet.  Returns the number of elements read. */
static size_t THDiskFile_readLong(THFile *self, long *data, size_t n)
{
  THDiskFile *dfself = (THDiskFile*)(self);
  size_t nread = 0L;

  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
  THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file");

  if(dfself->file.isBinary)
  {
    if(dfself->longSize == 0 || dfself->longSize == sizeof(long))
    {
      nread = fread__(data, sizeof(long), n, dfself->handle);
      if(!dfself->isNativeEncoding && (sizeof(long) > 1) && (nread > 0))
        THDiskFile_reverseMemory(data, data, sizeof(long), nread);
    } else if(dfself->longSize == 4)
    {
      int i;
      nread = fread__(data, 4, n, dfself->handle);
      if(!dfself->isNativeEncoding && (nread > 0))
        THDiskFile_reverseMemory(data, data, 4, nread);
      /* widen in place back-to-front so unconverted 4-byte values are not
         overwritten before they are read */
      for(i = nread-1; i >= 0; i--)
        data[i] = ((int *)data)[i];
    }
    else /* if(dfself->longSize == 8) */
    {
      /* pick the meaningful 32-bit half of each 64-bit element; which half
         depends on the CPU's endianness */
      int i, big_endian = !THDiskFile_isLittleEndianCPU();
      /* NOTE(review): THAlloc's result is used unchecked here — presumably
         it aborts/raises on OOM; confirm against THGeneral. */
      long *buffer = THAlloc(8*n);
      nread = fread__(buffer, 8, n, dfself->handle);
      for(i = nread-1; i >= 0; i--)
        data[i] = buffer[2*i + big_endian];
      THFree(buffer);
      if(!dfself->isNativeEncoding && (nread > 0))
        THDiskFile_reverseMemory(data, data, 4, nread);
     }
  }
  else
  {
    size_t i;
    for(i = 0; i < n; i++)
    {
      int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++;
    }
    if(dfself->file.isAutoSpacing && (n > 0))
    {
      /* consume one trailing newline, if present */
      int c = fgetc(dfself->handle);
      if( (c != '\n') && (c != EOF) )
        ungetc(c, dfself->handle);
    }
  }

  if(nread != n)
  {
    /* NOTE(review): nread/n are size_t but formatted with %d; verify
       THError's handling or switch to %zu as done elsewhere in this file */
    dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */
    if(!dfself->file.isQuiet)
      THError("read error: read %d blocks instead of %d", nread, n);
  }

  return nread;
}
+
+static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  size_t nwrite = 0L;
+
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file");
+
+  if(dfself->file.isBinary)
+  {
+    if(dfself->longSize == 0 || dfself->longSize == sizeof(long))
+    {
+      if(dfself->isNativeEncoding)
+      {
+        nwrite = fwrite(data, sizeof(long), n, dfself->handle);
+      }
+      else
+      {
+        char *buffer = THAlloc(sizeof(long)*n);
+        THDiskFile_reverseMemory(buffer, data, sizeof(long), n);
+        nwrite = fwrite(buffer, sizeof(long), n, dfself->handle);
+        THFree(buffer);
+      }
+    } else if(dfself->longSize == 4)
+    {
+      int i;
+      int *buffer = THAlloc(4*n);
+      for(i = 0; i < n; i++)
+        buffer[i] = data[i];
+      if(!dfself->isNativeEncoding)
+        THDiskFile_reverseMemory(buffer, buffer, 4, n);
+      nwrite = fwrite(buffer, 4, n, dfself->handle);
+      THFree(buffer);
+    }
+    else /* if(dfself->longSize == 8) */
+    {
+      int i, big_endian = !THDiskFile_isLittleEndianCPU();
+      long *buffer = THAlloc(8*n);
+      for(i = 0; i < n; i++)
+      {
+        buffer[2*i + !big_endian] = 0;
+        buffer[2*i + big_endian] = data[i];
+      }
+      if(!dfself->isNativeEncoding)
+        THDiskFile_reverseMemory(buffer, buffer, 8, n);
+      nwrite = fwrite(buffer, 8, n, dfself->handle);
+      THFree(buffer);
+    }
+  }
+  else
+  {
+    size_t i;
+    for(i = 0; i < n; i++)
+    {
+      int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++;
+      if( dfself->file.isAutoSpacing && (i < n-1) )
+        fprintf(dfself->handle, " ");
+    }
+    if(dfself->file.isAutoSpacing && (n > 0))
+      fprintf(dfself->handle, "\n");
+  }
+
+  if(nwrite != n)
+  {
+    dfself->file.hasError = 1;
+    if(!dfself->file.isQuiet)
+      THError("write error: wrote %d blocks instead of %d", nwrite, n);
+  }
+
+  return nwrite;
+}
+
/* Read a string using a Lua-style format: "*a" reads everything remaining
 * in the file, "*l" reads one line without its trailing newline.
 * On success *str_ points to a THAlloc'd buffer that the CALLER must free,
 * and the number of bytes read is returned.  If nothing could be read
 * (immediate EOF), the error flag is set, *str_ is NULL and 0 is returned. */
static size_t THDiskFile_readString(THFile *self, const char *format, char **str_)
{
  THDiskFile *dfself = (THDiskFile*)(self);
  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
  THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file");
  THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'");

/* note: the string won't survive long, as it is copied into lua */
/* so 1024 is not that big... */
#define TBRS_BSZ 1024L

  if(format[1] == 'a')
  {
    /* "*a": grow the buffer by TBRS_BSZ and keep reading until EOF. */
    char *p = THAlloc(TBRS_BSZ);
    size_t total = TBRS_BSZ;
    size_t pos = 0;

    for (;;)
    {
      if(total-pos == 0) /* we need more space! */
      {
        total += TBRS_BSZ;
        p = THRealloc(p, total);
      }
      pos += fread(p+pos, 1, total-pos, dfself->handle);
      if (pos < total) /* eof? */
      {
        if(pos == 0)
        {
          THFree(p);
          dfself->file.hasError = 1;
          if(!dfself->file.isQuiet)
            THError("read error: read 0 blocks instead of 1");

          *str_ = NULL;
          return 0;
        }
        *str_ = p;
        return pos;
      }
    }
  }
  else
  {
    /* "*l": fgets chunks into a growing buffer until a newline or EOF. */
    char *p = THAlloc(TBRS_BSZ);
    size_t total = TBRS_BSZ;
    size_t pos = 0;
    size_t size;

    for (;;)
    {
      if(total-pos <= 1) /* we can only write '\0' in there! */
      {
        total += TBRS_BSZ;
        p = THRealloc(p, total);
      }
      if (fgets(p+pos, total-pos, dfself->handle) == NULL) /* eof? */
      {
        if(pos == 0)
        {
          THFree(p);
          dfself->file.hasError = 1;
          if(!dfself->file.isQuiet)
            THError("read error: read 0 blocks instead of 1");

          *str_ = NULL;
          return 0;
        }
        *str_ = p;
        return pos;
      }
      size = strlen(p+pos);
      if (size == 0 || (p+pos)[size-1] != '\n')
      {
        /* line continues past this chunk (or fgets stopped early): keep going */
        pos += size;
      }
      else
      {
        pos += size-1; /* do not include `eol' */
        *str_ = p;
        return pos;
      }
    }
  }

  /* unreachable: both branches return from inside their loops */
  *str_ = NULL;
  return 0;
}
+
+
+static size_t THDiskFile_writeString(THFile *self, const char *str, size_t size)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  size_t nwrite;
+
+  THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file");
+  THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file");
+
+  nwrite = fwrite(str, 1, size, dfself->handle);
+  if(nwrite != size)
+  {
+    dfself->file.hasError = 1;
+    if(!dfself->file.isQuiet)
+      THError("write error: wrote %zu blocks instead of %zu", nwrite, size);
+  }
+
+  return nwrite;
+}
+
/* Open a disk file.  mode is "r", "w" or "rw"; with isQuiet set, open
 * failure returns NULL instead of raising THError.  New files default to
 * native encoding, ASCII mode with auto-spacing, longSize 0. */
THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet)
{
  /* dispatch table shared by every disk file instance */
  static struct THFileVTable vtable = {
    THDiskFile_isOpened,

    THDiskFile_readByte,
    THDiskFile_readChar,
    THDiskFile_readShort,
    THDiskFile_readInt,
    THDiskFile_readLong,
    THDiskFile_readFloat,
    THDiskFile_readDouble,
    THDiskFile_readString,

    THDiskFile_writeByte,
    THDiskFile_writeChar,
    THDiskFile_writeShort,
    THDiskFile_writeInt,
    THDiskFile_writeLong,
    THDiskFile_writeFloat,
    THDiskFile_writeDouble,
    THDiskFile_writeString,

    THDiskFile_synchronize,
    THDiskFile_seek,
    THDiskFile_seekEnd,
    THDiskFile_position,
    THDiskFile_close,
    THDiskFile_free
  };

  int isReadable;
  int isWritable;
  FILE *handle;
  THDiskFile *self;

  THArgCheck(THDiskFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'");

  if( isReadable && isWritable )
  {
    /* "rw": open an existing file for update without truncating it; if it
       does not exist, create it empty first, then reopen for update. */
    handle = fopen(name, "r+b");
    if(!handle)
    {
      handle = fopen(name, "wb");
      if(handle)
      {
        fclose(handle);
        handle = fopen(name, "r+b");
      }
    }
  }
  else
    handle = fopen(name, (isReadable ? "rb" : "wb"));

  if(!handle)
  {
    if(isQuiet)
      return 0;
    else
      THError("cannot open <%s> in mode %c%c", name, (isReadable ? 'r' : ' '), (isWritable ? 'w' : ' '));
  }

  self = THAlloc(sizeof(THDiskFile));

  self->handle = handle;
  self->name = THAlloc(strlen(name)+1);
  strcpy(self->name, name);
  self->isNativeEncoding = 1;
  self->longSize = 0;

  self->file.vtable = &vtable;
  self->file.isQuiet = isQuiet;
  self->file.isReadable = isReadable;
  self->file.isWritable = isWritable;
  self->file.isBinary = 0;
  self->file.isAutoSpacing = 1;
  self->file.hasError = 0;

  return (THFile*)self;
}
+
+/* PipeFile */
+
/* Parse a pipe mode string: exactly "r" or "w" (pipes are one-way, so
 * "rw" is rejected).  Sets the matching flag and returns 1; anything
 * else leaves both flags 0 and returns 0. */
static int THPipeFile_mode(const char *mode, int *isReadable, int *isWritable)
{
  *isReadable = 0;
  *isWritable = 0;
  if(mode[0] != '\0' && mode[1] == '\0')
  {
    switch(mode[0])
    {
      case 'r':
        *isReadable = 1;
        return 1;
      case 'w':
        *isWritable = 1;
        return 1;
      default:
        break;
    }
  }
  return 0;
}
+
+static void THPipeFile_free(THFile *self)
+{
+  THDiskFile *dfself = (THDiskFile*)(self);
+  if(dfself->handle)
+    pclose(dfself->handle);
+  THFree(dfself->name);
+  THFree(dfself);
+}
+
/* Open a pipe to a command via popen.  mode is "r" or "w"; with isQuiet
 * set, failure returns NULL instead of raising THError.  Reuses the
 * THDiskFile struct and all disk-file methods except free (pclose). */
THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet)
{
  /* identical to the disk-file vtable except for the free slot */
  static struct THFileVTable vtable = {
    THDiskFile_isOpened,

    THDiskFile_readByte,
    THDiskFile_readChar,
    THDiskFile_readShort,
    THDiskFile_readInt,
    THDiskFile_readLong,
    THDiskFile_readFloat,
    THDiskFile_readDouble,
    THDiskFile_readString,

    THDiskFile_writeByte,
    THDiskFile_writeChar,
    THDiskFile_writeShort,
    THDiskFile_writeInt,
    THDiskFile_writeLong,
    THDiskFile_writeFloat,
    THDiskFile_writeDouble,
    THDiskFile_writeString,

    THDiskFile_synchronize,
    THDiskFile_seek,
    THDiskFile_seekEnd,
    THDiskFile_position,
    THDiskFile_close,
    THPipeFile_free
  };

  int isReadable;
  int isWritable;
  FILE *handle;
  THDiskFile *self;

  THArgCheck(THPipeFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w'");

  /* NOTE(review): binary mode matters on Windows, hence "rb"/"wb"; MSVC's
     function is _popen — confirm the build maps popen to it. */
#ifdef _WIN32
  handle = popen(name, (isReadable ? "rb" : "wb"));
#else
  handle = popen(name, (isReadable ? "r" : "w"));
#endif

  if(!handle)
  {
    if(isQuiet)
      return 0;
    else
      THError("cannot open <%s> in mode %c%c.  This might be because eg the executable doesn't exist, but it could also be because you are out of memory.", name, (isReadable ? 'r' : ' '), (isWritable ? 'w' : ' '));
  }

  self = THAlloc(sizeof(THDiskFile));

  self->handle = handle;
  self->name = THAlloc(strlen(name)+1);
  strcpy(self->name, name);
  self->isNativeEncoding = 1;
  self->longSize = 0;

  self->file.vtable = &vtable;
  self->file.isQuiet = isQuiet;
  self->file.isReadable = isReadable;
  self->file.isWritable = isWritable;
  self->file.isBinary = 0;
  self->file.isAutoSpacing = 1;
  self->file.hasError = 0;

  return (THFile*)self;
}
diff --git a/lib/TH/THDiskFile.h b/lib/TH/THDiskFile.h
new file mode 100644
index 0000000..bc5c001
--- /dev/null
+++ b/lib/TH/THDiskFile.h
@@ -0,0 +1,19 @@
+#ifndef TH_DISK_FILE_INC
+#define TH_DISK_FILE_INC
+
+#include "THFile.h"
+
+TH_API THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet);
+TH_API THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet);
+
+TH_API const char *THDiskFile_name(THFile *self);
+
+TH_API int THDiskFile_isLittleEndianCPU(void);
+TH_API int THDiskFile_isBigEndianCPU(void);
+TH_API void THDiskFile_nativeEndianEncoding(THFile *self);
+TH_API void THDiskFile_littleEndianEncoding(THFile *self);
+TH_API void THDiskFile_bigEndianEncoding(THFile *self);
+TH_API void THDiskFile_longSize(THFile *self, int size);
+TH_API void THDiskFile_noBuffer(THFile *self);
+
+#endif
diff --git a/lib/TH/THFile.c b/lib/TH/THFile.c
new file mode 100644
index 0000000..c8913af
--- /dev/null
+++ b/lib/TH/THFile.c
@@ -0,0 +1,154 @@
+#include "THFile.h"
+#include "THFilePrivate.h"
+
/* Generates the public raw batch I/O entry points
 * THFile_read<TYPEC>Raw / THFile_write<TYPEC>Raw, which simply dispatch
 * through the concrete implementation's vtable. */
#define IMPLEMENT_THFILE_RW(TYPEC, TYPE)                          \
  size_t THFile_read##TYPEC##Raw(THFile *self, TYPE *data, size_t n)  \
  {                                                               \
    return (*self->vtable->read##TYPEC)(self, data, n);           \
  }                                                               \
                                                                  \
  size_t THFile_write##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \
  {                                                               \
    return (*self->vtable->write##TYPEC)(self, data, n);          \
  }

IMPLEMENT_THFILE_RW(Byte, unsigned char)
IMPLEMENT_THFILE_RW(Char, char)
IMPLEMENT_THFILE_RW(Short, short)
IMPLEMENT_THFILE_RW(Int, int)
IMPLEMENT_THFILE_RW(Long, long)
IMPLEMENT_THFILE_RW(Float, float)
IMPLEMENT_THFILE_RW(Double, double)
+
+size_t THFile_readStringRaw(THFile *self, const char *format, char **str_)
+{
+  return self->vtable->readString(self, format, str_);
+}
+
+size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size)
+{
+  return self->vtable->writeString(self, str, size);
+}
+
/* Thin public wrappers: each forwards to the implementation's vtable. */

/* Flush any buffered data to the underlying medium. */
void THFile_synchronize(THFile *self)
{
  self->vtable->synchronize(self);
}

/* Seek to an absolute byte offset from the start. */
void THFile_seek(THFile *self, size_t position)
{
  self->vtable->seek(self, position);
}

/* Seek to end-of-file. */
void THFile_seekEnd(THFile *self)
{
  self->vtable->seekEnd(self);
}

/* Current byte offset. */
size_t THFile_position(THFile *self)
{
  return self->vtable->position(self);
}

/* Close the file; the object stays allocated until THFile_free. */
void THFile_close(THFile *self)
{
  self->vtable->close(self);
}

/* Destroy the file object (closing it first if still open). */
void THFile_free(THFile *self)
{
  self->vtable->free(self);
}

/* 1 if the file has not been closed yet. */
int THFile_isOpened(THFile *self)
{
  return self->vtable->isOpened(self);
}
+
/* Generates trivial getters THFile_<FLAG>() for the boolean status flags
 * stored directly on the base THFile struct. */
#define IMPLEMENT_THFILE_FLAGS(FLAG) \
  int THFile_##FLAG(THFile *self)    \
  {                                  \
    return self->FLAG;               \
  }

IMPLEMENT_THFILE_FLAGS(isQuiet)
IMPLEMENT_THFILE_FLAGS(isReadable)
IMPLEMENT_THFILE_FLAGS(isWritable)
IMPLEMENT_THFILE_FLAGS(isBinary)
IMPLEMENT_THFILE_FLAGS(isAutoSpacing)
IMPLEMENT_THFILE_FLAGS(hasError)
+
/* Flag setters; each flips one mode bit on the base THFile struct. */

/* Switch to binary (raw bytes) element encoding. */
void THFile_binary(THFile *self)
{
  self->isBinary = 1;
}

/* Switch to ASCII (textual) element encoding. */
void THFile_ascii(THFile *self)
{
  self->isBinary = 0;
}

/* In ASCII mode, separate elements with spaces / terminate with newline. */
void THFile_autoSpacing(THFile *self)
{
  self->isAutoSpacing = 1;
}

/* In ASCII mode, emit elements with no separators. */
void THFile_noAutoSpacing(THFile *self)
{
  self->isAutoSpacing = 0;
}

/* Suppress THError on I/O failure (only hasError is set). */
void THFile_quiet(THFile *self)
{
  self->isQuiet = 1;
}

/* Raise THError on I/O failure (the default). */
void THFile_pedantic(THFile *self)
{
  self->isQuiet = 0;
}

/* Reset the sticky error flag after a failed operation. */
void THFile_clearError(THFile *self)
{
  self->hasError = 0;
}
+
/* Generates THFile_read<TYPEC>Scalar / THFile_write<TYPEC>Scalar:
 * single-element convenience wrappers over the Raw batch calls.
 * NOTE(review): the Raw return value is discarded, so a failed read can
 * leave `scalar` uninitialized — callers rely on hasError/THError; confirm. */
#define IMPLEMENT_THFILE_SCALAR(TYPEC, TYPE)                  \
  TYPE THFile_read##TYPEC##Scalar(THFile *self)               \
  {                                                           \
    TYPE scalar;                                              \
    THFile_read##TYPEC##Raw(self, &scalar, 1);                \
    return scalar;                                            \
  }                                                           \
                                                              \
  void THFile_write##TYPEC##Scalar(THFile *self, TYPE scalar) \
  {                                                           \
    THFile_write##TYPEC##Raw(self, &scalar, 1);               \
  }

IMPLEMENT_THFILE_SCALAR(Byte, unsigned char)
IMPLEMENT_THFILE_SCALAR(Char, char)
IMPLEMENT_THFILE_SCALAR(Short, short)
IMPLEMENT_THFILE_SCALAR(Int, int)
IMPLEMENT_THFILE_SCALAR(Long, long)
IMPLEMENT_THFILE_SCALAR(Float, float)
IMPLEMENT_THFILE_SCALAR(Double, double)
+
/* Generates THFile_read<TYPEC> / THFile_write<TYPEC>: reads or writes an
 * entire TH<TYPEC>Storage (storage->size elements via storage->data)
 * through the Raw batch calls.  Returns the element count transferred. */
#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE)                           \
  size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage)    \
  {                                                                     \
    return THFile_read##TYPEC##Raw(self, storage->data, storage->size); \
  }                                                                     \
                                                                        \
  size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage)   \
  {                                                                     \
    return THFile_write##TYPEC##Raw(self, storage->data, storage->size); \
  }

IMPLEMENT_THFILE_STORAGE(Byte, unsigned char)
IMPLEMENT_THFILE_STORAGE(Char, char)
IMPLEMENT_THFILE_STORAGE(Short, short)
IMPLEMENT_THFILE_STORAGE(Int, int)
IMPLEMENT_THFILE_STORAGE(Long, long)
IMPLEMENT_THFILE_STORAGE(Float, float)
IMPLEMENT_THFILE_STORAGE(Double, double)
diff --git a/lib/TH/THFile.h b/lib/TH/THFile.h
new file mode 100644
index 0000000..64dd2da
--- /dev/null
+++ b/lib/TH/THFile.h
@@ -0,0 +1,84 @@
+#ifndef TH_FILE_INC
+#define TH_FILE_INC
+
+#include "THStorage.h"
+
+typedef struct THFile__ THFile;
+
+TH_API int THFile_isOpened(THFile *self);
+TH_API int THFile_isQuiet(THFile *self);
+TH_API int THFile_isReadable(THFile *self);
+TH_API int THFile_isWritable(THFile *self);
+TH_API int THFile_isBinary(THFile *self);
+TH_API int THFile_isAutoSpacing(THFile *self);
+TH_API int THFile_hasError(THFile *self);
+
+TH_API void THFile_binary(THFile *self);
+TH_API void THFile_ascii(THFile *self);
+TH_API void THFile_autoSpacing(THFile *self);
+TH_API void THFile_noAutoSpacing(THFile *self);
+TH_API void THFile_quiet(THFile *self);
+TH_API void THFile_pedantic(THFile *self);
+TH_API void THFile_clearError(THFile *self);
+
+/* scalar */
+TH_API unsigned char THFile_readByteScalar(THFile *self);
+TH_API char THFile_readCharScalar(THFile *self);
+TH_API short THFile_readShortScalar(THFile *self);
+TH_API int THFile_readIntScalar(THFile *self);
+TH_API long THFile_readLongScalar(THFile *self);
+TH_API float THFile_readFloatScalar(THFile *self);
+TH_API double THFile_readDoubleScalar(THFile *self);
+
+TH_API void THFile_writeByteScalar(THFile *self, unsigned char scalar);
+TH_API void THFile_writeCharScalar(THFile *self, char scalar);
+TH_API void THFile_writeShortScalar(THFile *self, short scalar);
+TH_API void THFile_writeIntScalar(THFile *self, int scalar);
+TH_API void THFile_writeLongScalar(THFile *self, long scalar);
+TH_API void THFile_writeFloatScalar(THFile *self, float scalar);
+TH_API void THFile_writeDoubleScalar(THFile *self, double scalar);
+
+/* storage */
+TH_API size_t THFile_readByte(THFile *self, THByteStorage *storage);
+TH_API size_t THFile_readChar(THFile *self, THCharStorage *storage);
+TH_API size_t THFile_readShort(THFile *self, THShortStorage *storage);
+TH_API size_t THFile_readInt(THFile *self, THIntStorage *storage);
+TH_API size_t THFile_readLong(THFile *self, THLongStorage *storage);
+TH_API size_t THFile_readFloat(THFile *self, THFloatStorage *storage);
+TH_API size_t THFile_readDouble(THFile *self, THDoubleStorage *storage);
+
+TH_API size_t THFile_writeByte(THFile *self, THByteStorage *storage);
+TH_API size_t THFile_writeChar(THFile *self, THCharStorage *storage);
+TH_API size_t THFile_writeShort(THFile *self, THShortStorage *storage);
+TH_API size_t THFile_writeInt(THFile *self, THIntStorage *storage);
+TH_API size_t THFile_writeLong(THFile *self, THLongStorage *storage);
+TH_API size_t THFile_writeFloat(THFile *self, THFloatStorage *storage);
+TH_API size_t THFile_writeDouble(THFile *self, THDoubleStorage *storage);
+
+/* raw */
+TH_API size_t THFile_readByteRaw(THFile *self, unsigned char *data, size_t n);
+TH_API size_t THFile_readCharRaw(THFile *self, char *data, size_t n);
+TH_API size_t THFile_readShortRaw(THFile *self, short *data, size_t n);
+TH_API size_t THFile_readIntRaw(THFile *self, int *data, size_t n);
+TH_API size_t THFile_readLongRaw(THFile *self, long *data, size_t n);
+TH_API size_t THFile_readFloatRaw(THFile *self, float *data, size_t n);
+TH_API size_t THFile_readDoubleRaw(THFile *self, double *data, size_t n);
+TH_API size_t THFile_readStringRaw(THFile *self, const char *format, char **str_); /* you must deallocate str_ */
+
+TH_API size_t THFile_writeByteRaw(THFile *self, unsigned char *data, size_t n);
+TH_API size_t THFile_writeCharRaw(THFile *self, char *data, size_t n);
+TH_API size_t THFile_writeShortRaw(THFile *self, short *data, size_t n);
+TH_API size_t THFile_writeIntRaw(THFile *self, int *data, size_t n);
+TH_API size_t THFile_writeLongRaw(THFile *self, long *data, size_t n);
+TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n);
+TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n);
+TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size);
+
+TH_API void THFile_synchronize(THFile *self);
+TH_API void THFile_seek(THFile *self, size_t position);
+TH_API void THFile_seekEnd(THFile *self);
+TH_API size_t THFile_position(THFile *self);
+TH_API void THFile_close(THFile *self);
+TH_API void THFile_free(THFile *self);
+
+#endif
diff --git a/lib/TH/THFilePrivate.h b/lib/TH/THFilePrivate.h
new file mode 100644
index 0000000..d268041
--- /dev/null
+++ b/lib/TH/THFilePrivate.h
@@ -0,0 +1,43 @@
+/* Base object shared by all TH file implementations; concrete files
+ * (disk, pipe, memory) embed it as their first member and dispatch
+ * through `vtable`.  Flags are toggled via the THFile_* accessors
+ * declared in THFile.h. */
+struct THFile__
+{
+    struct THFileVTable *vtable;
+
+    int isQuiet;       /* toggled by THFile_quiet / THFile_pedantic */
+    int isReadable;
+    int isWritable;
+    int isBinary;      /* toggled by THFile_binary / THFile_ascii */
+    int isAutoSpacing; /* toggled by THFile_autoSpacing / THFile_noAutoSpacing */
+    int hasError;      /* sticky; cleared by THFile_clearError */
+};
+
+/* virtual table definition */
+
+/* Dispatch table implemented by each concrete file type.  The typed
+ * read/write entries transfer n elements and return a size_t count
+ * (presumably the number of elements actually transferred -- confirm
+ * against the concrete implementations). */
+struct THFileVTable
+{
+    int (*isOpened)(THFile *self);
+
+    size_t (*readByte)(THFile *self, unsigned char *data, size_t n);
+    size_t (*readChar)(THFile *self, char *data, size_t n);
+    size_t (*readShort)(THFile *self, short *data, size_t n);
+    size_t (*readInt)(THFile *self, int *data, size_t n);
+    size_t (*readLong)(THFile *self, long *data, size_t n);
+    size_t (*readFloat)(THFile *self, float *data, size_t n);
+    size_t (*readDouble)(THFile *self, double *data, size_t n);
+    size_t (*readString)(THFile *self, const char *format, char **str_);
+
+    size_t (*writeByte)(THFile *self, unsigned char *data, size_t n);
+    size_t (*writeChar)(THFile *self, char *data, size_t n);
+    size_t (*writeShort)(THFile *self, short *data, size_t n);
+    size_t (*writeInt)(THFile *self, int *data, size_t n);
+    size_t (*writeLong)(THFile *self, long *data, size_t n);
+    size_t (*writeFloat)(THFile *self, float *data, size_t n);
+    size_t (*writeDouble)(THFile *self, double *data, size_t n);
+    size_t (*writeString)(THFile *self, const char *str, size_t size);
+
+    void (*synchronize)(THFile *self);
+    void (*seek)(THFile *self, size_t position);
+    void (*seekEnd)(THFile *self);
+    size_t (*position)(THFile *self);
+    void (*close)(THFile *self);
+    void (*free)(THFile *self);
+};
diff --git a/lib/TH/THGeneral.c b/lib/TH/THGeneral.c
new file mode 100644
index 0000000..4bd4c67
--- /dev/null
+++ b/lib/TH/THGeneral.c
@@ -0,0 +1,274 @@
+#include "THGeneral.h"
+#include "THAtomic.h"
+
+#ifndef TH_HAVE_THREAD
+#define __thread
+#endif
+
+#if (defined(__unix) || defined(_WIN32))
+  #if defined(__FreeBSD__)
+    #include <malloc_np.h>
+  #else
+    #include <malloc.h>
+  #endif
+#elif defined(__APPLE__)
+#include <malloc/malloc.h>
+#endif
+
+/* Torch Error Handling */
+static void defaultTorchErrorHandlerFunction(const char *msg, void *data)
+{
+  printf("$ Error: %s\n", msg);
+  exit(-1);
+}
+
+static __thread void (*torchErrorHandlerFunction)(const char *msg, void *data) = defaultTorchErrorHandlerFunction;
+static __thread void *torchErrorHandlerData;
+
+void _THError(const char *file, const int line, const char *fmt, ...)
+{
+  char msg[2048];
+  va_list args;
+
+  /* vasprintf not standard */
+  /* vsnprintf: how to handle if does not exists? */
+  va_start(args, fmt);
+  int n = vsnprintf(msg, 2048, fmt, args);
+  va_end(args);
+
+  if(n < 2048) {
+    snprintf(msg + n, 2048 - n, " at %s:%d", file, line);
+  }
+
+  (*torchErrorHandlerFunction)(msg, torchErrorHandlerData);
+}
+
+/* Report a failed assertion through _THError.  `exp` is the
+ * stringified failing expression (from the THAssert* macros); fmt and
+ * the varargs supply an optional extra message. */
+void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...) {
+  char msg[1024];
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(msg, 1024, fmt, args);
+  va_end(args);
+  _THError(file, line, "Assertion `%s' failed. %s", exp, msg);
+}
+
+void THSetErrorHandler( void (*torchErrorHandlerFunction_)(const char *msg, void *data), void *data )
+{
+  if(torchErrorHandlerFunction_)
+    torchErrorHandlerFunction = torchErrorHandlerFunction_;
+  else
+    torchErrorHandlerFunction = defaultTorchErrorHandlerFunction;
+  torchErrorHandlerData = data;
+}
+
+/* Torch Arg Checking Handling */
+static void defaultTorchArgErrorHandlerFunction(int argNumber, const char *msg, void *data)
+{
+  if(msg)
+    printf("$ Invalid argument %d: %s\n", argNumber, msg);
+  else
+    printf("$ Invalid argument %d\n", argNumber);
+  exit(-1);
+}
+
+static __thread void (*torchArgErrorHandlerFunction)(int argNumber, const char *msg, void *data) = defaultTorchArgErrorHandlerFunction;
+static __thread void *torchArgErrorHandlerData;
+
+/* Verify an argument-check condition; on failure, format the message,
+ * append " at file:line" and dispatch to the (thread-local) argument
+ * error handler. */
+void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...)
+{
+  if(!condition) {
+    char msg[2048];
+    va_list args;
+
+    /* vasprintf not standard */
+    /* vsnprintf: how to handle if does not exists? */
+    va_start(args, fmt);
+    int n = vsnprintf(msg, 2048, fmt, args);
+    va_end(args);
+
+    /* vsnprintf returns < 0 on an encoding error; indexing msg with a
+     * negative n would be undefined behaviour */
+    if(n < 0) {
+      snprintf(msg, 2048, "invalid format string in argument-check message");
+      n = (int)strlen(msg);
+    }
+    if(n < 2048) {
+      snprintf(msg + n, 2048 - n, " at %s:%d", file, line);
+    }
+
+    (*torchArgErrorHandlerFunction)(argNumber, msg, torchArgErrorHandlerData);
+  }
+}
+
+void THSetArgErrorHandler( void (*torchArgErrorHandlerFunction_)(int argNumber, const char *msg, void *data), void *data )
+{
+  if(torchArgErrorHandlerFunction_)
+    torchArgErrorHandlerFunction = torchArgErrorHandlerFunction_;
+  else
+    torchArgErrorHandlerFunction = defaultTorchArgErrorHandlerFunction;
+  torchArgErrorHandlerData = data;
+}
+
+static __thread void (*torchGCFunction)(void *data) = NULL;
+static __thread void *torchGCData;
+static long heapSize = 0;
+static __thread long heapDelta = 0;
+static const long heapMaxDelta = 1e6; // limit to +/- 1MB before updating heapSize
+static __thread long heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
+static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
+static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%
+
+/* Optional hook for integrating with a garbage-collected frontend.
+ *
+ * If torch is running with a garbage-collected frontend (e.g. Lua),
+ * the GC isn't aware of TH-allocated memory so may not know when it
+ * needs to run. These hooks trigger the GC to run in two cases:
+ *
+ * (1) When a memory allocation (malloc, realloc, ...) fails
+ * (2) When the total TH-allocated memory hits a dynamically-adjusted
+ *     soft maximum.
+ */
+/* Register the frontend GC hook (and its callback data) invoked on
+ * allocation failure or when the tracked heap crosses its soft max. */
+void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data )
+{
+  torchGCData = data;
+  torchGCFunction = torchGCFunction_;
+}
+
+/* Best-effort allocator-reported size of the block behind ptr
+ * (malloc_usable_size / malloc_size / _msize); returns 0 where the
+ * platform offers no such query.  The reported size may exceed the
+ * requested size, so the heap counters are approximate. */
+static long getAllocSize(void *ptr) {
+#if defined(__unix) && defined(HAVE_MALLOC_USABLE_SIZE)
+  return malloc_usable_size(ptr);
+#elif defined(__APPLE__)
+  return malloc_size(ptr);
+#elif defined(_WIN32)
+  if(ptr) { return _msize(ptr); } else { return 0; }
+#else
+  return 0;
+#endif
+}
+
+/* Fold this thread's pending heapDelta into the global heapSize and
+ * return the resulting total.  THAtomicAddLong is expected to return
+ * the pre-add value, hence the extra "+ heapDelta" to get the post-add
+ * size.  (void) rather than () -- in C, () declares an unprototyped
+ * function. */
+static long applyHeapDelta(void) {
+  long newHeapSize = THAtomicAddLong(&heapSize, heapDelta) + heapDelta;
+  heapDelta = 0;
+  return newHeapSize;
+}
+
+/* (1) if the torch-allocated heap size exceeds the soft max, run GC
+ * (2) if post-GC heap size exceeds 80% of the soft max, increase the
+ *     soft max by 40%
+ */
+static void maybeTriggerGC(long curHeapSize) {
+  if (torchGCFunction && curHeapSize > heapSoftmax) {
+    torchGCFunction(torchGCData);
+
+    // ensure heapSize is accurate before updating heapSoftmax
+    long newHeapSize = applyHeapDelta();
+
+    // still above 80% of the soft max even after collecting: raise the
+    // soft max by 40% so we do not thrash the GC near the threshold
+    if (newHeapSize > heapSoftmax * heapSoftmaxGrowthThresh) {
+      heapSoftmax = heapSoftmax * heapSoftmaxGrowthFactor;
+    }
+  }
+}
+
+// Record a tracked-heap change of `size` bytes (negative on free).
+// Deltas accumulate in the per-thread heapDelta and are folded into
+// the shared heapSize only once |heapDelta| reaches heapMaxDelta, to
+// limit atomic contention; GC is only considered on growth (size > 0).
+void THHeapUpdate(long size) {
+  heapDelta += size;
+
+  // batch updates to global heapSize to minimize thread contention
+  if (labs(heapDelta) < heapMaxDelta) {
+    return;
+  }
+
+  long newHeapSize = applyHeapDelta();
+
+  if (size > 0) {
+    maybeTriggerGC(newHeapSize);
+  }
+}
+
+/* Raw allocation plus heap accounting.  Blocks over 5120 bytes are
+ * 64-byte aligned via posix_memalign where available (presumably for
+ * cache-line / SIMD alignment); smaller blocks use plain malloc.
+ * Returns NULL on failure -- callers handle the GC-retry policy. */
+static void* THAllocInternal(long size)
+{
+  void *ptr;
+
+  if (size > 5120)
+  {
+#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN))
+    if (posix_memalign(&ptr, 64, size) != 0)
+      ptr = NULL;
+/*
+#elif defined(_WIN32)
+    ptr = _aligned_malloc(size, 64);
+*/
+#else
+    ptr = malloc(size);
+#endif
+  }
+  else
+  {
+    ptr = malloc(size);
+  }
+
+  THHeapUpdate(getAllocSize(ptr));
+  return ptr;
+}
+
+/* Allocate `size` bytes with heap accounting.  size < 0 raises an
+ * error; size == 0 returns NULL.  On failure the client GC hook (if
+ * any) is run and the allocation retried once before giving up. */
+void* THAlloc(long size)
+{
+  void *ptr;
+
+  if(size < 0)
+    THError("$ Torch: invalid memory size -- maybe an overflow?");
+
+  if(size == 0)
+    return NULL;
+
+  ptr = THAllocInternal(size);
+
+  /* let the GC release memory, then retry once */
+  if(!ptr && torchGCFunction) {
+    torchGCFunction(torchGCData);
+    ptr = THAllocInternal(size);
+  }
+
+  /* size is long: %ld, not %d (passing long for %d is undefined
+     behaviour on LP64 platforms) */
+  if(!ptr)
+    THError("$ Torch: not enough memory: you tried to allocate %ldGB. Buy new RAM!", size/1073741824);
+
+  return ptr;
+}
+
+/* Resize ptr to `size` bytes with heap accounting.  NULL ptr behaves
+ * like THAlloc; size == 0 frees and returns NULL; size < 0 raises an
+ * error.  On failure the GC hook (if any) runs and realloc is retried
+ * once; the original block remains valid until realloc succeeds. */
+void* THRealloc(void *ptr, long size)
+{
+  if(!ptr)
+    return(THAlloc(size));
+
+  if(size == 0)
+  {
+    THFree(ptr);
+    return NULL;
+  }
+
+  if(size < 0)
+    THError("$ Torch: invalid memory size -- maybe an overflow?");
+
+  THHeapUpdate(-getAllocSize(ptr));
+  void *newptr = realloc(ptr, size);
+
+  if(!newptr && torchGCFunction) {
+    torchGCFunction(torchGCData);
+    newptr = realloc(ptr, size);
+  }
+  /* count whichever block survived (realloc leaves ptr intact on failure) */
+  THHeapUpdate(getAllocSize(newptr ? newptr : ptr));
+
+  /* size is long: %ld, not %d (undefined behaviour on LP64 platforms) */
+  if(!newptr)
+    THError("$ Torch: not enough memory: you tried to reallocate %ldGB. Buy new RAM!", size/1073741824);
+
+  return newptr;
+}
+
+/* Free ptr and subtract its allocator-reported size from the tracked
+ * heap.  Safe on NULL: free(NULL) is a no-op and getAllocSize reports
+ * 0 for NULL on the supported platforms. */
+void THFree(void *ptr)
+{
+  THHeapUpdate(-getAllocSize(ptr));
+  free(ptr);
+}
+
+/* log(1 + x), accurate for small x.  MSVC/MinGW lack C99 log1p, so
+ * there we use the compensated formula log(y) - ((y-1)-x)/y with
+ * y = 1 + x, which cancels the rounding error of forming y under IEEE
+ * arithmetic; everywhere else this defers to the library log1p. */
+double THLog1p(const double x)
+{
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+  volatile double y = 1 + x;
+  return log(y) - ((y-1)-x)/y ;  /* cancels errors with IEEE arithmetic */
+#else
+  return log1p(x);
+#endif
+}
diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in
new file mode 100644
index 0000000..5c19da2
--- /dev/null
+++ b/lib/TH/THGeneral.h.in
@@ -0,0 +1,116 @@
+#ifndef TH_GENERAL_INC
+#define TH_GENERAL_INC
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+#include <time.h>
+#include <string.h>
+
+#cmakedefine USE_BLAS
+#cmakedefine USE_LAPACK
+#cmakedefine BLAS_IS_ACCELERATE
+#cmakedefine BLAS_F2C
+
+#ifdef __cplusplus
+# define TH_EXTERNC extern "C"
+#else
+# define TH_EXTERNC extern
+#endif
+
+#ifdef _WIN32
+# ifdef TH_EXPORTS
+#  define TH_API TH_EXTERNC __declspec(dllexport)
+# else
+#  define TH_API TH_EXTERNC __declspec(dllimport)
+# endif
+#else
+# define TH_API TH_EXTERNC
+#endif
+
+#define TH_INLINE @TH_INLINE@
+
+#ifndef __cplusplus
+#define inline @TH_INLINE@
+#endif
+
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
+TH_API double THLog1p(const double x);
+TH_API void _THError(const char *file, const int line, const char *fmt, ...);
+TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...);
+TH_API void THSetErrorHandler( void (*torchErrorHandlerFunction)(const char *msg, void *data), void *data );
+TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...);
+TH_API void THSetArgErrorHandler( void (*torchArgErrorHandlerFunction)(int argNumber, const char *msg, void *data), void *data );
+TH_API void* THAlloc(long size);
+TH_API void* THRealloc(void *ptr, long size);
+TH_API void THFree(void *ptr);
+TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data );
+// this hook should only be called by custom allocator functions
+TH_API void THHeapUpdate(long size);
+
+#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__)
+
+#define THCleanup(...) __VA_ARGS__
+
+#define THArgCheck(...)                                               \
+do {                                                                  \
+  _THArgCheck(__FILE__, __LINE__, __VA_ARGS__);                       \
+} while(0)
+
+#define THArgCheckWithCleanup(condition, cleanup, ...)                \
+do if (!(condition)) {                                                \
+  cleanup                                                             \
+  _THArgCheck(__FILE__, __LINE__, 0, __VA_ARGS__);                    \
+} while(0)
+
+#define THAssert(exp)                                                 \
+do {                                                                  \
+  if (!(exp)) {                                                       \
+    _THAssertionFailed(__FILE__, __LINE__, #exp, "");                 \
+  }                                                                   \
+} while(0)
+
+#define THAssertMsg(exp, ...)                                         \
+do {                                                                  \
+  if (!(exp)) {                                                       \
+    _THAssertionFailed(__FILE__, __LINE__, #exp, __VA_ARGS__);        \
+  }                                                                   \
+} while(0)
+
+#define TH_CONCAT_STRING_2(x,y) TH_CONCAT_STRING_2_EXPAND(x,y)
+#define TH_CONCAT_STRING_2_EXPAND(x,y) #x #y
+
+#define TH_CONCAT_STRING_3(x,y,z) TH_CONCAT_STRING_3_EXPAND(x,y,z)
+#define TH_CONCAT_STRING_3_EXPAND(x,y,z) #x #y #z
+
+#define TH_CONCAT_STRING_4(x,y,z,w) TH_CONCAT_STRING_4_EXPAND(x,y,z,w)
+#define TH_CONCAT_STRING_4_EXPAND(x,y,z,w) #x #y #z #w
+
+#define TH_CONCAT_2(x,y) TH_CONCAT_2_EXPAND(x,y)
+#define TH_CONCAT_2_EXPAND(x,y) x ## y
+
+#define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z)
+#define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z
+
+#define TH_CONCAT_4_EXPAND(x,y,z,w) x ## y ## z ## w
+#define TH_CONCAT_4(x,y,z,w) TH_CONCAT_4_EXPAND(x,y,z,w)
+
+#define THMin(X, Y)  ((X) < (Y) ? (X) : (Y))
+#define THMax(X, Y)  ((X) > (Y) ? (X) : (Y))
+
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+# define log1p(x) THLog1p(x)
+#define snprintf _snprintf
+#define popen _popen
+#define pclose _pclose
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
+#endif
diff --git a/lib/TH/THGenerateAllTypes.h b/lib/TH/THGenerateAllTypes.h
new file mode 100644
index 0000000..539629b
--- /dev/null
+++ b/lib/TH/THGenerateAllTypes.h
@@ -0,0 +1,97 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
+#endif
+
+#define real unsigned char
+#define accreal long
+#define Real Byte
+#define THInf UCHAR_MAX
+#define TH_REAL_IS_BYTE
+#line 1 TH_GENERIC_FILE
+/*#line 1 "THByteStorage.h"*/
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_BYTE
+
+#define real char
+#define accreal long
+#define Real Char
+#define THInf CHAR_MAX
+#define TH_REAL_IS_CHAR
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_CHAR
+
+#define real short
+#define accreal long
+#define Real Short
+#define THInf SHRT_MAX
+#define TH_REAL_IS_SHORT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_SHORT
+
+#define real int
+#define accreal long
+#define Real Int
+#define THInf INT_MAX
+#define TH_REAL_IS_INT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_INT
+
+#define real long
+#define accreal long
+#define Real Long
+#define THInf LONG_MAX
+#define TH_REAL_IS_LONG
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_LONG
+
+#define real float
+#define accreal double
+#define Real Float
+#define THInf FLT_MAX
+#define TH_REAL_IS_FLOAT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_FLOAT
+
+#define real double
+#define accreal double
+#define Real Double
+#define THInf DBL_MAX
+#define TH_REAL_IS_DOUBLE
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_DOUBLE
+
+#undef TH_GENERIC_FILE
diff --git a/lib/TH/THGenerateFloatTypes.h b/lib/TH/THGenerateFloatTypes.h
new file mode 100644
index 0000000..b6bdd07
--- /dev/null
+++ b/lib/TH/THGenerateFloatTypes.h
@@ -0,0 +1,31 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h"
+#endif
+
+#define real float
+#define accreal double
+#define Real Float
+#define THInf FLT_MAX
+#define TH_REAL_IS_FLOAT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef accreal
+#undef real
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_FLOAT
+
+#define real double
+#define accreal double
+#define Real Double
+#define THInf DBL_MAX
+#define TH_REAL_IS_DOUBLE
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef accreal
+#undef real
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_DOUBLE
+
+#undef TH_GENERIC_FILE
diff --git a/lib/TH/THGenerateIntTypes.h b/lib/TH/THGenerateIntTypes.h
new file mode 100644
index 0000000..a561d89
--- /dev/null
+++ b/lib/TH/THGenerateIntTypes.h
@@ -0,0 +1,70 @@
+#ifndef TH_GENERIC_FILE
+#error "You must define TH_GENERIC_FILE before including THGenerateIntTypes.h"
+#endif
+
+#define real unsigned char
+#define accreal long
+#define Real Byte
+#define THInf UCHAR_MAX
+#define TH_REAL_IS_BYTE
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_BYTE
+
+#define real char
+#define accreal long
+#define Real Char
+#define THInf CHAR_MAX
+#define TH_REAL_IS_CHAR
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_CHAR
+
+#define real short
+#define accreal long
+#define Real Short
+#define THInf SHRT_MAX
+#define TH_REAL_IS_SHORT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_SHORT
+
+#define real int
+#define accreal long
+#define Real Int
+#define THInf INT_MAX
+#define TH_REAL_IS_INT
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_INT
+
+#define real long
+#define accreal long
+#define Real Long
+#define THInf LONG_MAX
+#define TH_REAL_IS_LONG
+#line 1 TH_GENERIC_FILE
+#include TH_GENERIC_FILE
+#undef real
+#undef accreal
+#undef Real
+#undef THInf
+#undef TH_REAL_IS_LONG
+
+#undef TH_GENERIC_FILE
diff --git a/lib/TH/THLapack.c b/lib/TH/THLapack.c
new file mode 100644
index 0000000..bd4dc71
--- /dev/null
+++ b/lib/TH/THLapack.c
@@ -0,0 +1,4 @@
+#include "THLapack.h"
+
+#include "generic/THLapack.c"
+#include "THGenerateFloatTypes.h"
diff --git a/lib/TH/THLapack.h b/lib/TH/THLapack.h
new file mode 100644
index 0000000..614d15f
--- /dev/null
+++ b/lib/TH/THLapack.h
@@ -0,0 +1,27 @@
+#ifndef TH_LAPACK_INC
+#define TH_LAPACK_INC
+
+#include "THGeneral.h"
+
+#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME)
+
+/* Check a LAPACK `info` result: info < 0 reports an illegal argument
+ * (argument number -info), info > 0 a routine-specific failure via
+ * fmt.  The last line deliberately carries no trailing backslash: the
+ * stray continuation in the original silently glued the macro to
+ * whatever line followed it. */
+#define THLapackCheck(fmt, func, info , ...)                        \
+if (info < 0) {                                                     \
+  THError("Lapack Error in %s : Illegal Argument %d", func, -info); \
+} else if(info > 0) {                                               \
+  THError(fmt, func, info, ##__VA_ARGS__);                          \
+}
+
+#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...)    \
+if (info < 0) {                                                     \
+  cleanup                                                           \
+  THError("Lapack Error in %s : Illegal Argument %d", func, -info); \
+} else if(info > 0) {                                               \
+  cleanup                                                           \
+  THError(fmt, func, info, ##__VA_ARGS__);                          \
+}
+
+#include "generic/THLapack.h"
+#include "THGenerateAllTypes.h"
+
+#endif
diff --git a/lib/TH/THLogAdd.c b/lib/TH/THLogAdd.c
new file mode 100644
index 0000000..a503d7d
--- /dev/null
+++ b/lib/TH/THLogAdd.c
@@ -0,0 +1,88 @@
+#include "THLogAdd.h"
+
+#include <float.h>
+
+#ifdef USE_DOUBLE
+#define MINUS_LOG_THRESHOLD -39.14
+#else
+#define MINUS_LOG_THRESHOLD -18.42
+#endif
+
+const double THLog2Pi=1.83787706640934548355;  /* log(2*pi) */
+const double THLogZero=-DBL_MAX;               /* stand-in for log(0) */
+const double THLogOne=0;                       /* log(1) */
+
+/* Return log(exp(log_a) + exp(log_b)) without leaving log space,
+ * computed as max + log1p(exp(min - max)) for numerical stability.
+ * If the difference is below MINUS_LOG_THRESHOLD the smaller term is
+ * negligible and the larger argument is returned unchanged. */
+double THLogAdd(double log_a, double log_b)
+{
+  double minusdif;
+
+  /* ensure log_a >= log_b */
+  if (log_a < log_b)
+  {
+    double tmp = log_a;
+    log_a = log_b;
+    log_b = tmp;
+  }
+
+  minusdif = log_b - log_a;
+#ifdef DEBUG
+  if (isnan(minusdif))
+    THError("THLogAdd: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a);
+#endif
+  if (minusdif < MINUS_LOG_THRESHOLD)
+    return log_a;
+  else
+    return log_a + log1p(exp(minusdif));
+}
+
+/* Return log(exp(log_a) - exp(log_b)) in log space; requires
+ * log_a >= log_b (raises an error otherwise).  Equal inputs yield
+ * THLogZero; a negligible log_b (difference below
+ * MINUS_LOG_THRESHOLD) returns log_a unchanged. */
+double THLogSub(double log_a, double log_b)
+{
+  double minusdif;
+
+  if (log_a < log_b)
+    THError("LogSub: log_a (%f) should be greater than log_b (%f)", log_a, log_b);
+
+  minusdif = log_b - log_a;
+#ifdef DEBUG
+  if (isnan(minusdif))
+    THError("LogSub: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a);
+#endif
+  if (log_a == log_b)
+    return THLogZero;
+  else if (minusdif < MINUS_LOG_THRESHOLD)
+    return log_a;
+  else
+    return log_a + log1p(-exp(minusdif));
+}
+
+/* Credits to Leon Bottou */
+/* Fast approximation of exp(-x) for x >= 0.  The polynomial
+ * A0 + A1 x + A2 x^2 + A3 x^3 + A4 x^4 is the 4th-order Taylor series
+ * of exp(x/8) (A1 = 1/8, A2 = (1/8)^2/2, ...); squaring it three
+ * times yields exp(x), and the reciprocal gives exp(-x).  For x >= 13
+ * the true value is below the approximation's accuracy and 0 is
+ * returned.  Set EXACT_EXPONENTIAL to 1 to use the libm exp instead. */
+double THExpMinusApprox(double x)
+{
+#define EXACT_EXPONENTIAL 0
+#if EXACT_EXPONENTIAL
+  return exp(-x);
+#else
+  /* fast approximation of exp(-x) for x positive */
+# define A0   (1.0)
+# define A1   (0.125)
+# define A2   (0.0078125)
+# define A3   (0.00032552083)
+# define A4   (1.0172526e-5)
+  if (x < 13.0)
+  {
+/*    assert(x>=0); */
+    double y;
+    y = A0+x*(A1+x*(A2+x*(A3+x*A4)));
+    y *= y;
+    y *= y;
+    y *= y;
+    y = 1/y;
+    return y;
+  }
+  return 0;
+# undef A0
+# undef A1
+# undef A2
+# undef A3
+# undef A4
+#endif
+}
diff --git a/lib/TH/THLogAdd.h b/lib/TH/THLogAdd.h
new file mode 100644
index 0000000..9319b8f
--- /dev/null
+++ b/lib/TH/THLogAdd.h
@@ -0,0 +1,14 @@
+#ifndef TH_LOG_ADD_INC
+#define TH_LOG_ADD_INC
+
+#include "THGeneral.h"
+
+TH_API const double THLog2Pi;
+TH_API const double THLogZero;
+TH_API const double THLogOne;
+
+TH_API double THLogAdd(double log_a, double log_b);
+TH_API double THLogSub(double log_a, double log_b);
+TH_API double THExpMinusApprox(const double x);
+
+#endif
diff --git a/lib/TH/THMath.h b/lib/TH/THMath.h
new file mode 100644
index 0000000..b96083f
--- /dev/null
+++ b/lib/TH/THMath.h
@@ -0,0 +1,21 @@
+#ifndef _THMATH_H
+#define _THMATH_H
+
+/* Logistic sigmoid: 1 / (1 + exp(-value)), mapping R onto (0, 1). */
+static inline double TH_sigmoid(double value) {
+  return 1.0 / (1.0 + exp(-value));
+}
+
+/* Fractional part of x, truncated toward zero: TH_frac(-1.5) == -0.5. */
+static inline double TH_frac(double x) {
+  return x - trunc(x);
+}
+
+/* Reciprocal square root: 1 / sqrt(x). */
+static inline double TH_rsqrt(double x) {
+  return 1.0 / sqrt(x);
+}
+
+/* Linear interpolation: weight 0 yields a, weight 1 yields b. */
+static inline double TH_lerp(double a, double b, double weight) {
+  return a + weight * (b-a);
+}
+
+#endif // _THMATH_H
+
diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c
new file mode 100644
index 0000000..d39b841
--- /dev/null
+++ b/lib/TH/THMemoryFile.c
@@ -0,0 +1,678 @@
+#include "THMemoryFile.h"
+#include "THFilePrivate.h"
+
+/* A THFile backed by an in-memory THCharStorage rather than an OS file.
+   The THFile header must be the first member so a THMemoryFile* can be
+   passed wherever a THFile* is expected. */
+typedef struct THMemoryFile__
+{
+    THFile file;             /* base "class": vtable plus mode/error flags */
+    THCharStorage *storage;  /* backing buffer; NULL once the file is closed */
+    size_t size;             /* logical size of the file contents */
+    size_t position;         /* current read/write offset */
+	int longSize;            /* on-disk width of a long: 0 = native, 4 or 8 */
+
+} THMemoryFile;
+
+/* Returns non-zero while the file still has a backing storage (not closed). */
+static int THMemoryFile_isOpened(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  return (mfself->storage != NULL);
+}
+
+/* Skips any leading separators (space, newline, ':', ';') in str_, then
+   scans for the next separator, saves it in *c_, overwrites it in place
+   with '\0', and returns a pointer to that position so the caller can
+   restore the byte afterwards.  Returns NULL if the string ends before a
+   second separator is found (the terminator is then left untouched). */
+static char *THMemoryFile_strnextspace(char *str_, char *c_)
+{
+  char c;
+
+  while( (c = *str_) )
+  {
+    if( (c != ' ') && (c != '\n') && (c != ':') && (c != ';') )
+      break;
+    str_++;
+  }
+
+  while( (c = *str_) )
+  {
+    if( (c == ' ') || (c == '\n') || (c == ':') || (c == ';') )
+    {
+      *c_ = c;
+      *str_ = '\0';
+      return(str_);
+    }
+    str_++;
+  }
+  return NULL;
+}
+
+/* Ensures the backing storage can hold at least `size` bytes plus a
+   trailing '\0'.  If the current capacity already suffices, the logical
+   size is extended to `size` immediately; otherwise the storage is
+   reallocated, growing by at least 50% to amortize repeated appends.
+   NOTE(review): on the realloc path self->size is NOT updated here --
+   callers bump it after writing; confirm this asymmetry is intended. */
+static void THMemoryFile_grow(THMemoryFile *self, size_t size)
+{
+  size_t missingSpace;
+
+  if(size <= self->size)
+    return;
+  else
+  {
+    if(size < self->storage->size) /* note the "<" and not "<=" */
+    {
+      self->size = size;
+      self->storage->data[self->size] = '\0';
+      return;
+    }
+  }
+
+  missingSpace = size-self->storage->size+1; /* +1 for the '\0' */
+  THCharStorage_resize(self->storage, (self->storage->size/2 > missingSpace ?
+                                       self->storage->size + (self->storage->size/2)
+                                       : self->storage->size + missingSpace));
+}
+
+/* Parses a mode string into readable/writable flags.  Accepts exactly
+   "r", "w" or "rw"; returns 1 on success, 0 on anything else. */
+static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable)
+{
+  *isReadable = 0;
+  *isWritable = 0;
+  if(strlen(mode) == 1)
+  {
+    if(*mode == 'r')
+    {
+      *isReadable = 1;
+      return 1;
+    }
+    else if(*mode == 'w')
+    {
+      *isWritable = 1;
+      return 1;
+    }
+  }
+  else if(strlen(mode) == 2)
+  {
+    if(mode[0] == 'r' && mode[1] == 'w')
+    {
+      *isReadable = 1;
+      *isWritable = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/********************************************************/
+
+/* Generates THMemoryFile_read<TYPEC> / THMemoryFile_write<TYPEC> pairs.
+   Binary mode is a bounded memmove of sizeof(TYPE)*n bytes; ASCII mode
+   handles one element per iteration via the ASCII_READ_ELEM /
+   ASCII_WRITE_ELEM statement fragments, which must provide `ret` and
+   `nByteRead` (read side) or set `nByteWritten` (write side).  A short
+   read sets hasError and raises THError unless the file is quiet; the
+   write side grows the storage on demand and always reports n written.
+   NOTE(review): the INSIDE_SPACING parameter is never referenced by the
+   macro body -- apparently vestigial. */
+#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM, INSIDE_SPACING) \
+  static size_t THMemoryFile_read##TYPEC(THFile *self, TYPE *data, size_t n) \
+  {                                                                     \
+    THMemoryFile *mfself = (THMemoryFile*)self;                         \
+    size_t nread = 0;                                                    \
+                                                                        \
+    THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");     \
+    THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); \
+                                                                        \
+    if (n == 0)                                                         \
+        return 0;                                                       \
+                                                                        \
+    if(mfself->file.isBinary)                                           \
+    {                                                                   \
+      size_t nByte = sizeof(TYPE)*n;                                      \
+      size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); \
+      nread = nByteRemaining/sizeof(TYPE);                              \
+      memmove(data, mfself->storage->data+mfself->position, nread*sizeof(TYPE)); \
+      mfself->position += nread*sizeof(TYPE);                           \
+    }                                                                   \
+    else                                                                \
+    {                                                                   \
+      size_t i;                                                           \
+      for(i = 0; i < n; i++)                                            \
+      {                                                                 \
+        size_t nByteRead = 0;                                             \
+        char spaceChar = 0;                                             \
+        char *spacePtr = THMemoryFile_strnextspace(mfself->storage->data+mfself->position, &spaceChar); \
+        ASCII_READ_ELEM;                                                \
+        if(ret == EOF)                                                  \
+        {                                                               \
+          while(mfself->storage->data[mfself->position])                \
+            mfself->position++;                                         \
+        }                                                               \
+        else                                                            \
+          mfself->position += nByteRead;                                \
+        if(spacePtr)                                                    \
+          *spacePtr = spaceChar;                                        \
+      }                                                                 \
+      if(mfself->file.isAutoSpacing && (n > 0))                         \
+      {                                                                 \
+        if( (mfself->position < mfself->size) && (mfself->storage->data[mfself->position] == '\n') ) \
+          mfself->position++;                                           \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    if(nread != n)                                                      \
+    {                                                                   \
+      mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */ \
+      if(!mfself->file.isQuiet)                                         \
+        THError("read error: read %d blocks instead of %d", nread, n);  \
+    }                                                                   \
+                                                                        \
+    return nread;                                                       \
+  }                                                                     \
+                                                                        \
+  static size_t THMemoryFile_write##TYPEC(THFile *self, TYPE *data, size_t n) \
+  {                                                                     \
+    THMemoryFile *mfself = (THMemoryFile*)self;                         \
+                                                                        \
+    THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");     \
+    THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); \
+                                                                        \
+    if (n == 0)                                                         \
+        return 0;                                                       \
+                                                                        \
+    if(mfself->file.isBinary)                                           \
+    {                                                                   \
+      size_t nByte = sizeof(TYPE)*n;                                      \
+      THMemoryFile_grow(mfself, mfself->position+nByte);                \
+      memmove(mfself->storage->data+mfself->position, data, nByte);     \
+      mfself->position += nByte;                                        \
+      if(mfself->position > mfself->size)                               \
+      {                                                                 \
+        mfself->size = mfself->position;                                \
+        mfself->storage->data[mfself->size] = '\0';                     \
+      }                                                                 \
+    }                                                                   \
+    else                                                                \
+    {                                                                   \
+      size_t i;                                                           \
+      for(i = 0; i < n; i++)                                            \
+      {                                                                 \
+        ssize_t nByteWritten;                                           \
+        while (1)                                                       \
+        {                                                               \
+          ASCII_WRITE_ELEM;                                             \
+          if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) \
+          {                                                             \
+            mfself->position += nByteWritten;                           \
+            break;                                                      \
+          }                                                             \
+          THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); \
+        }                                                               \
+        if(mfself->file.isAutoSpacing)                                  \
+        {                                                               \
+          if(i < n-1)                                                   \
+          {                                                             \
+            THMemoryFile_grow(mfself, mfself->position+1);              \
+            sprintf(mfself->storage->data+mfself->position, " ");       \
+            mfself->position++;                                         \
+          }                                                             \
+          if(i == n-1)                                                  \
+          {                                                             \
+            THMemoryFile_grow(mfself, mfself->position+1);              \
+            sprintf(mfself->storage->data+mfself->position, "\n");      \
+          }                                                             \
+        }                                                               \
+      }                                                                 \
+      if(mfself->position > mfself->size)                               \
+      {                                                                 \
+        mfself->size = mfself->position;                                \
+        mfself->storage->data[mfself->size] = '\0';                     \
+      }                                                                 \
+    }                                                                   \
+                                                                        \
+    return n;                                                           \
+  }
+
+
+/* Sets the assumed on-disk width of a `long` for binary I/O.
+   0 means native sizeof(long); only 0, 4 and 8 are accepted. */
+void THMemoryFile_longSize(THFile *self, int size)
+{
+  THMemoryFile *dfself = (THMemoryFile*)(self);
+  THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified");
+  dfself->longSize = size;
+}
+
+/* Returns the backing storage, resized to exactly size+1 bytes so the
+   caller sees a '\0'-terminated buffer.  The file keeps its reference;
+   ownership is NOT transferred. */
+THCharStorage *THMemoryFile_storage(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+
+  THCharStorage_resize(mfself->storage, mfself->size+1);
+
+  return mfself->storage;
+}
+
+/* No-op for a memory file (nothing to flush); still validates the handle. */
+static void THMemoryFile_synchronize(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+}
+
+/* Moves the cursor to an absolute offset.  Seeking beyond the logical
+   size sets hasError and raises unless the file is quiet.
+   NOTE(review): position is size_t, so the `>= 0` check below is always
+   true and only documents intent. */
+static void THMemoryFile_seek(THFile *self, size_t position)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THArgCheck(position >= 0, 2, "position must be positive");
+
+  if(position <= mfself->size)
+    mfself->position = position;
+  else
+  {
+    mfself->file.hasError = 1;
+    if(!mfself->file.isQuiet)
+      THError("unable to seek at position %zu", position);
+  }
+}
+
+/* Moves the cursor to the end of the logical contents. */
+static void THMemoryFile_seekEnd(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+
+  mfself->position = mfself->size;
+}
+
+/* Returns the current cursor offset. */
+static size_t THMemoryFile_position(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  return mfself->position;
+}
+
+/* Releases the storage reference and marks the file closed (storage set
+   to NULL); closing twice is an argument error. */
+static void THMemoryFile_close(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THCharStorage_free(mfself->storage);
+  mfself->storage = NULL;
+}
+
+/* Destroys the file object, releasing the storage if still open. */
+static void THMemoryFile_free(THFile *self)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+
+  if(mfself->storage)
+    THCharStorage_free(mfself->storage);
+
+  THFree(mfself);
+}
+
+/* READ_WRITE_METHODS(bool, Bool, */
+/*                    int value = 0; int ret = sscanf(mfself->storage->data+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */
+/*                    int value = (data[i] ? 1 : 0); nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", value), */
+/*                    1) */
+
+/* Byte and Char bypass per-element formatting even in ASCII mode: the
+   fragments do one bulk memmove and set `i = n-1` so the element loop
+   terminates after a single pass. */
+READ_WRITE_METHODS(unsigned char, Byte,
+                   size_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position);  \
+                   if(spacePtr) *spacePtr = spaceChar; \
+                   nByteRead = ret; \
+                   nread = ret; \
+                   i = n-1; \
+                   memmove(data, mfself->storage->data+mfself->position, nByteRead),
+                   nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \
+                   i = n-1; \
+                   if(nByteWritten > -1)
+                     memmove(mfself->storage->data+mfself->position, data, nByteWritten),
+                   0)
+
+/* DEBUG: we should check if %n is count or not as a element (so ret might need to be ret-- on some systems) */
+/* Note that we do a trick for char */
+READ_WRITE_METHODS(char, Char,
+                   size_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position);  \
+                   if(spacePtr) *spacePtr = spaceChar; \
+                   nByteRead = ret; \
+                   nread = ret; \
+                   i = n-1; \
+                   memmove(data, mfself->storage->data+mfself->position, nByteRead),
+                   nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \
+                   i = n-1; \
+                   if(nByteWritten > -1)
+                     memmove(mfself->storage->data+mfself->position, data, nByteWritten),
+                   0)
+
+/* Numeric types parse with sscanf("%..%n") and format with snprintf. */
+READ_WRITE_METHODS(short, Short,
+                   int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+                   nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%hd", data[i]),
+                   1)
+
+READ_WRITE_METHODS(int, Int,
+                   int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+                   nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]),
+                   1)
+
+/*READ_WRITE_METHODS(long, Long,
+                   int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+                   nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]),
+                   1)*/
+
+/* %.9g / %.17g give round-trip precision for float / double. */
+READ_WRITE_METHODS(float, Float,
+                   int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+                   nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]),
+                   1)
+
+READ_WRITE_METHODS(double, Double,
+                   int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++,
+                   nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]),
+                   1)
+
+/* Defined in THDiskFile.c; needed to pick the significant half of an
+   8-byte long on platforms whose native long is 4 bytes. */
+int THDiskFile_isLittleEndianCPU(void);
+
+/* Reads n longs from the memory file.  In binary mode the on-disk element
+   width honours longSize (0 = native, 4 or 8 bytes) and values are
+   widened/narrowed per element when it differs from sizeof(long); in
+   ASCII mode elements are parsed with sscanf.  Returns the number of
+   elements actually read; a short read sets hasError and raises THError
+   unless the file is quiet. */
+static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+  size_t nread = 0L;
+
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file");
+
+  if (n == 0)
+    return 0;
+
+  if(mfself->file.isBinary)
+  {
+    if(mfself->longSize == 0 || mfself->longSize == sizeof(long))
+    {
+      size_t nByte = sizeof(long)*n;
+      size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position);
+      nread = nByteRemaining/sizeof(long);
+      memmove(data, mfself->storage->data+mfself->position, nread*sizeof(long));
+      mfself->position += nread*sizeof(long);
+    } else if(mfself->longSize == 4)
+    {
+      /* stored as 4-byte ints: widen each into a host long */
+      size_t i;
+      size_t nByte = 4*n;
+      size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position);
+      int *storage = (int *)(mfself->storage->data + mfself->position);
+      nread = nByteRemaining/4;
+      for(i = 0; i < nread; i++)
+        data[i] = storage[i];
+      mfself->position += nread*4;
+    }
+    else /* if(mfself->longSize == 8) */
+    {
+      /* stored as 8-byte values on a host with 4-byte longs: pick the
+         significant 4-byte half of each element */
+      size_t i;
+      int big_endian = !THDiskFile_isLittleEndianCPU();
+      size_t nByte = 8*n;
+      long *storage = (long *)(mfself->storage->data + mfself->position);
+      size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position);
+      nread = nByteRemaining/8;
+      for(i = 0; i < nread; i++)
+        data[i] = storage[2*i + big_endian];
+      /* BUGFIX: each element consumes 8 bytes of the buffer, not 4;
+         advancing by nread*4 left the cursor in the middle of the data. */
+      mfself->position += nread*8;
+    }
+  }
+  else
+  {
+    size_t i;
+    for(i = 0; i < n; i++)
+    {
+      size_t nByteRead = 0;
+      char spaceChar = 0;
+      char *spacePtr = THMemoryFile_strnextspace(mfself->storage->data+mfself->position, &spaceChar);
+      int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++;
+      if(ret == EOF)
+      {
+        while(mfself->storage->data[mfself->position])
+          mfself->position++;
+      }
+      else
+        mfself->position += nByteRead;
+      if(spacePtr)
+        *spacePtr = spaceChar;
+    }
+    if(mfself->file.isAutoSpacing && (n > 0))
+    {
+      if( (mfself->position < mfself->size) && (mfself->storage->data[mfself->position] == '\n') )
+        mfself->position++;
+    }
+  }
+
+  if(nread != n)
+  {
+    mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */
+    if(!mfself->file.isQuiet)
+      THError("read error: read %d blocks instead of %d", nread, n);
+  }
+
+  return nread;
+}
+
+/* Writes n longs to the memory file.  In binary mode the on-disk element
+   width honours longSize (0 = native, 4 or 8 bytes); in ASCII mode
+   elements are formatted with snprintf, growing the buffer on demand.
+   Always returns n. */
+static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file");
+
+  if (n == 0)
+    return 0;
+
+  if(mfself->file.isBinary)
+  {
+    if(mfself->longSize == 0 || mfself->longSize == sizeof(long))
+    {
+      size_t nByte = sizeof(long)*n;
+      THMemoryFile_grow(mfself, mfself->position+nByte);
+      memmove(mfself->storage->data+mfself->position, data, nByte);
+      mfself->position += nByte;
+    } else if(mfself->longSize == 4)
+    {
+      size_t i;
+      size_t nByte = 4*n;
+      int *storage;
+      /* BUGFIX: grow (which may realloc storage->data) must run BEFORE a
+         pointer into the buffer is taken, otherwise the pointer dangles
+         after a reallocation. */
+      THMemoryFile_grow(mfself, mfself->position+nByte);
+      storage = (int *)(mfself->storage->data + mfself->position);
+      for(i = 0; i < n; i++)
+        storage[i] = data[i];
+      mfself->position += nByte;
+    }
+    else /* if(mfself->longSize == 8) */
+    {
+      size_t i;
+      int big_endian = !THDiskFile_isLittleEndianCPU();
+      size_t nByte = 8*n;
+      long *storage;
+      /* BUGFIX: as above, grow before computing the element pointer. */
+      THMemoryFile_grow(mfself, mfself->position+nByte);
+      storage = (long *)(mfself->storage->data + mfself->position);
+      for(i = 0; i < n; i++)
+      {
+        /* zero the insignificant half, then store the value in the
+           significant half according to host endianness */
+        storage[2*i + !big_endian] = 0;
+        storage[2*i + big_endian] = data[i];
+      }
+      mfself->position += nByte;
+    }
+    if(mfself->position > mfself->size)
+    {
+      mfself->size = mfself->position;
+      mfself->storage->data[mfself->size] = '\0';
+    }
+  }
+  else
+  {
+    size_t i;
+    for(i = 0; i < n; i++)
+    {
+      ssize_t nByteWritten;
+      while (1)
+      {
+        nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]);
+        if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) )
+        {
+          mfself->position += nByteWritten;
+          break;
+        }
+        THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2);
+      }
+      if(mfself->file.isAutoSpacing)
+      {
+        if(i < n-1)
+        {
+          THMemoryFile_grow(mfself, mfself->position+1);
+          sprintf(mfself->storage->data+mfself->position, " ");
+          mfself->position++;
+        }
+        if(i == n-1)
+        {
+          THMemoryFile_grow(mfself, mfself->position+1);
+          sprintf(mfself->storage->data+mfself->position, "\n");
+          mfself->position++;
+        }
+      }
+    }
+    if(mfself->position > mfself->size)
+    {
+      mfself->size = mfself->position;
+      mfself->storage->data[mfself->size] = '\0';
+    }
+  }
+
+  return n;
+}
+
+/* Allocates a copy of `size` bytes of str.  The copy is NOT
+   NUL-terminated; callers track the length separately.  Ownership of the
+   returned buffer transfers to the caller (free with THFree). */
+static char* THMemoryFile_cloneString(const char *str, long size)
+{
+  char *cstr = THAlloc(size);
+  memcpy(cstr, str, size);
+  return cstr;
+}
+
+/* Reads either the remainder of the file ("*a") or one line ("*l",
+   newline consumed but not returned) into a freshly allocated buffer
+   stored in *str_.  Returns the number of bytes read; at EOF sets
+   hasError, stores NULL and returns 0. */
+static size_t THMemoryFile_readString(THFile *self, const char *format, char **str_)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file");
+  THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'");
+
+  if(mfself->position == mfself->size) /* eof ? */
+  {
+    mfself->file.hasError = 1;
+    if(!mfself->file.isQuiet)
+      THError("read error: read 0 blocks instead of 1");
+
+    *str_ = NULL;
+    return 0;
+  }
+
+  if(format[1] == 'a')
+  {
+    size_t str_size = mfself->size-mfself->position;
+
+    *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, str_size);
+    mfself->position = mfself->size;
+
+    return str_size;
+  }
+  else
+  {
+    char *p = mfself->storage->data+mfself->position;
+    int eolFound = 0;
+    size_t posEol;
+    size_t i;
+    for(i = 0; i < mfself->size-mfself->position; i++)
+    {
+      if(p[i] == '\n')
+      {
+        posEol = i;
+        eolFound = 1;
+        break;
+      }
+    }
+
+    if(eolFound)
+    {
+      *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, posEol);
+      mfself->position += posEol+1;
+      return posEol;
+    }
+    else /* well, we read all! */
+    {
+      size_t str_size = mfself->size-mfself->position;
+
+      *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, str_size);
+      mfself->position = mfself->size;
+
+      return str_size;
+    }
+  }
+
+  /* NOTE(review): unreachable -- both branches above return. */
+  *str_ = NULL;
+  return 0;
+}
+
+/* Appends `size` raw bytes to the file, growing the storage as needed,
+   and keeps the contents '\0'-terminated.  Returns size. */
+static size_t THMemoryFile_writeString(THFile *self, const char *str, size_t size)
+{
+  THMemoryFile *mfself = (THMemoryFile*)self;
+
+  THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file");
+  THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file");
+
+  THMemoryFile_grow(mfself, mfself->position+size);
+  memmove(mfself->storage->data+mfself->position, str, size);
+  mfself->position += size;
+  if(mfself->position > mfself->size)
+  {
+    mfself->size = mfself->position;
+    mfself->storage->data[mfself->size] = '\0';
+  }
+
+  return size;
+}
+
+/* Creates a memory file over an existing '\0'-terminated CharStorage
+   (a reference is retained; the caller keeps its own), or over a fresh
+   empty storage when storage is NULL.  mode is "r", "w" or "rw".
+   The file starts at position 0, ASCII mode, auto-spacing on. */
+THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode)
+{
+  /* shared vtable wiring the generic THFile interface to the
+     THMemoryFile_* implementations above */
+  static struct THFileVTable vtable = {
+    THMemoryFile_isOpened,
+
+    THMemoryFile_readByte,
+    THMemoryFile_readChar,
+    THMemoryFile_readShort,
+    THMemoryFile_readInt,
+    THMemoryFile_readLong,
+    THMemoryFile_readFloat,
+    THMemoryFile_readDouble,
+    THMemoryFile_readString,
+
+    THMemoryFile_writeByte,
+    THMemoryFile_writeChar,
+    THMemoryFile_writeShort,
+    THMemoryFile_writeInt,
+    THMemoryFile_writeLong,
+    THMemoryFile_writeFloat,
+    THMemoryFile_writeDouble,
+    THMemoryFile_writeString,
+
+    THMemoryFile_synchronize,
+    THMemoryFile_seek,
+    THMemoryFile_seekEnd,
+    THMemoryFile_position,
+    THMemoryFile_close,
+    THMemoryFile_free
+  };
+
+  THMemoryFile *mfself;
+  int isReadable;
+  int isWritable;
+
+  if(storage)
+  {
+    THArgCheck(storage->data[storage->size-1] == '\0', 1, "provided CharStorage must be terminated by 0");
+    THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'");
+    THCharStorage_retain(storage);
+  }
+  else
+  {
+    THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'");
+    storage = THCharStorage_newWithSize(1);
+    storage->data[0] = '\0';
+  }
+
+  mfself = THAlloc(sizeof(THMemoryFile));
+
+  mfself->storage = storage;
+  /* logical size excludes the trailing '\0' (storage is non-NULL here,
+     so the conditional's 0 arm is effectively dead) */
+  mfself->size = (storage ? storage->size-1 : 0);
+  mfself->position = 0;
+  mfself->longSize = 0;
+
+  mfself->file.vtable = &vtable;
+  mfself->file.isQuiet = 0;
+  mfself->file.isReadable = isReadable;
+  mfself->file.isWritable = isWritable;
+  mfself->file.isBinary = 0;
+  mfself->file.isAutoSpacing = 1;
+  mfself->file.hasError = 0;
+
+  return (THFile*)mfself;
+}
+
+/* Convenience constructor: a memory file over a fresh empty storage. */
+THFile *THMemoryFile_new(const char *mode)
+{
+  return THMemoryFile_newWithStorage(NULL, mode);
+}
diff --git a/lib/TH/THMemoryFile.h b/lib/TH/THMemoryFile.h
new file mode 100644
index 0000000..b54cdcc
--- /dev/null
+++ b/lib/TH/THMemoryFile.h
@@ -0,0 +1,13 @@
+#ifndef TH_MEMORY_FILE_INC
+#define TH_MEMORY_FILE_INC
+
+#include "THFile.h"
+#include "THStorage.h"
+
+TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode);
+TH_API THFile *THMemoryFile_new(const char *mode);
+
+TH_API THCharStorage *THMemoryFile_storage(THFile *self);
+TH_API void THMemoryFile_longSize(THFile *self, int size);
+
+#endif
diff --git a/lib/TH/THRandom.c b/lib/TH/THRandom.c
new file mode 100644
index 0000000..55ee943
--- /dev/null
+++ b/lib/TH/THRandom.c
@@ -0,0 +1,274 @@
+#include "THGeneral.h"
+#include "THRandom.h"
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+/* Code for the Mersenne Twister random generator.... */
+#define n _MERSENNE_STATE_N
+#define m _MERSENNE_STATE_M
+
+/* Creates (unseeded) new generator*/
+/* Allocates a zeroed generator; left=1 forces a state refresh on first
+   use, and seeded=0 marks it invalid until THRandom_manualSeed runs. */
+static THGenerator* THGenerator_newUnseeded()
+{
+  THGenerator *self = THAlloc(sizeof(THGenerator));
+  memset(self, 0, sizeof(THGenerator));
+  self->left = 1;
+  self->seeded = 0;
+  self->normal_is_valid = 0;
+  return self;
+}
+
+/* Creates new generator and makes sure it is seeded*/
+/* Seeds from the OS entropy source (or time() on Windows). */
+THGenerator* THGenerator_new()
+{
+  THGenerator *self = THGenerator_newUnseeded();
+  THRandom_seed(self);
+  return self;
+}
+
+/* Copies the full generator state from `from` into `self`; returns self. */
+THGenerator* THGenerator_copy(THGenerator *self, THGenerator *from)
+{
+    memcpy(self, from, sizeof(THGenerator));
+    return self;
+}
+
+/* Releases a generator allocated by THGenerator_new/newUnseeded. */
+void THGenerator_free(THGenerator *self)
+{
+  THFree(self);
+}
+
+/* Sanity-checks a (possibly deserialized) generator: must be seeded and
+   have left/next within the Mersenne state bounds.  Returns 1 if valid. */
+int THGenerator_isValid(THGenerator *_generator)
+{
+  if ((_generator->seeded == 1) &&
+    (_generator->left > 0 && _generator->left <= n) && (_generator->next <= n))
+    return 1;
+
+  return 0;
+}
+
+#ifndef _WIN32
+/* Reads one unsigned long of entropy from /dev/urandom.  Raises THError
+   (which does not return) on open or read failure. */
+static unsigned long readURandomLong()
+{
+  int randDev = open("/dev/urandom", O_RDONLY);
+  unsigned long randValue;
+  if (randDev < 0) {
+    THError("Unable to open /dev/urandom");
+  }
+  ssize_t readBytes = read(randDev, &randValue, sizeof(randValue));
+  /* BUGFIX: check the error case in signed arithmetic first -- with the
+     original `readBytes < sizeof(randValue)`, a failing read() returning
+     -1 was converted to a huge unsigned value, the comparison was false,
+     and the error went undetected (uninitialized randValue used). */
+  if (readBytes < 0 || (size_t)readBytes < sizeof(randValue)) {
+    close(randDev); /* don't leak the descriptor before raising */
+    THError("Unable to read from /dev/urandom");
+  }
+  close(randDev);
+  return randValue;
+}
+#endif // _WIN32
+
+/* Seeds the generator from the OS: /dev/urandom on POSIX, the current
+   time on Windows.  Returns the seed that was applied. */
+unsigned long THRandom_seed(THGenerator *_generator)
+{
+#ifdef _WIN32
+  unsigned long s = (unsigned long)time(0);
+#else
+  unsigned long s = readURandomLong();
+#endif
+  THRandom_manualSeed(_generator, s);
+  return s;
+}
+
+/* The next 4 methods are taken from http:www.math.keio.ac.jpmatumotoemt.html
+   Here is the copyright:
+   Some minor modifications have been made to adapt to "my" C... */
+
+/*
+   A C-program for MT19937, with initialization improved 2002/2/10.
+   Coded by Takuji Nishimura and Makoto Matsumoto.
+   This is a faster version by taking Shawn Cokus's optimization,
+   Matthe Bellew's simplification, Isaku Wada's double version.
+
+   Before using, initialize the state by using init_genrand(seed)
+   or init_by_array(init_key, key_length).
+
+   Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+     1. Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+
+     2. Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+
+     3. The names of its contributors may not be used to endorse or promote
+        products derived from this software without specific prior written
+        permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+   Any feedback is very welcome.
+   http://www.math.keio.ac.jp/matumoto/emt.html
+   email: matumoto at math.keio.ac.jp
+*/
+
+/* Macros for the Mersenne Twister random generator... */
+/* Period parameters */
+/* #define n 624 */
+/* #define m 397 */
+#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
+#define UMASK 0x80000000UL /* most significant w-r bits */
+#define LMASK 0x7fffffffUL /* least significant r bits */
+#define MIXBITS(u,v) ( ((u) & UMASK) | ((v) & LMASK) )
+#define TWIST(u,v) ((MIXBITS(u,v) >> 1) ^ ((v)&1UL ? MATRIX_A : 0UL))
+/*********************************************************** That's it. */
+
+/* Seeds the Mersenne Twister with an explicit value.  First resets the
+   whole generator (so e.g. cached Gaussian state is discarded), then
+   fills state[] with the MT19937 initialization recurrence. */
+void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_)
+{
+  int j;
+
+  /* This ensures reseeding resets all of the state (i.e. state for Gaussian numbers) */
+  THGenerator *blank = THGenerator_newUnseeded();
+  THGenerator_copy(_generator, blank);
+  THGenerator_free(blank);
+
+  _generator->the_initial_seed = the_seed_;
+  _generator->state[0] = _generator->the_initial_seed & 0xffffffffUL;
+  for(j = 1; j < n; j++)
+  {
+    _generator->state[j] = (1812433253UL * (_generator->state[j-1] ^ (_generator->state[j-1] >> 30)) + j);
+    /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+    /* In the previous versions, mSBs of the seed affect   */
+    /* only mSBs of the array state[].                        */
+    /* 2002/01/09 modified by makoto matsumoto             */
+    _generator->state[j] &= 0xffffffffUL;  /* for >32 bit machines */
+  }
+  _generator->left = 1;
+  _generator->seeded = 1;
+}
+
+/* Returns the seed this generator was last seeded with
+   (stored by THRandom_manualSeed). */
+unsigned long THRandom_initialSeed(THGenerator *_generator)
+{
+  return _generator->the_initial_seed;
+}
+
+/* Regenerates all `n` words of the Mersenne Twister state in place and
+   resets the output cursor; THRandom_random() calls this once every
+   `n` draws.
+   NOTE(review): `n`/`m` are presumably aliases for the period
+   parameters 624/397 (_MERSENNE_STATE_N/M) defined earlier in this
+   file -- not visible in this hunk. */
+void THRandom_nextState(THGenerator *_generator)
+{
+  unsigned long *p = _generator->state;
+  int j;
+
+  _generator->left = n;
+  _generator->next = 0;
+
+  /* first n-m words: the twist partner is m positions ahead */
+  for(j = n-m+1; --j; p++)
+    *p = p[m] ^ TWIST(p[0], p[1]);
+
+  /* remaining words: the partner index wraps to the start of the array */
+  for(j = m; --j; p++)
+    *p = p[m-n] ^ TWIST(p[0], p[1]);
+
+  /* last word pairs with state[0] */
+  *p = p[m-n] ^ TWIST(p[0], _generator->state[0]);
+}
+
+/* Returns the next pseudo-random integer, applying the standard MT19937
+   tempering transform to the raw state word.  State words are masked to
+   32 bits at generation time (see THRandom_manualSeed), so the result
+   fits in 32 bits even where unsigned long is wider.  Refills the state
+   array once `left` draws have been consumed. */
+unsigned long THRandom_random(THGenerator *_generator)
+{
+  unsigned long y;
+
+  if (--(_generator->left) == 0)
+    THRandom_nextState(_generator);
+  y = *(_generator->state + (_generator->next)++);
+
+  /* Tempering */
+  y ^= (y >> 11);
+  y ^= (y << 7) & 0x9d2c5680UL;
+  y ^= (y << 15) & 0xefc60000UL;
+  y ^= (y >> 18);
+
+  return y;
+}
+
+/* generates a random number on [0,1)-double-interval */
+/* Values are multiples of 2^-32; 1.0 itself is never produced since
+   THRandom_random() yields at most 2^32 - 1.
+   NOTE(review): identifiers containing "__" are reserved for the
+   implementation by the C standard; kept as-is for upstream parity. */
+static double __uniform__(THGenerator *_generator)
+{
+  /* divided by 2^32 */
+  return (double)THRandom_random(_generator) * (1.0/4294967296.0);
+}
+
+/*********************************************************
+
+ Thanks *a lot* Takuji Nishimura and Makoto Matsumoto!
+
+ Now my own code...
+
+*********************************************************/
+
+/* Draws a uniform random double on [a,b) by affine rescaling of the
+   base [0,1) variate.  No check that a <= b; with a > b the interval
+   is simply reversed. */
+double THRandom_uniform(THGenerator *_generator, double a, double b)
+{
+  return(__uniform__(_generator) * (b - a) + a);
+}
+
+/* Draws from N(mean, stdv^2) using the Box-Muller transform.  Variates
+   are produced in pairs: a call with no cached pair draws fresh
+   (normal_x, normal_y), computes the radius rho, and returns the
+   cosine variate; the immediately following call reuses the cached
+   values to return the sine variate.  Reseeding clears the cache
+   (see THRandom_manualSeed). */
+double THRandom_normal(THGenerator *_generator, double mean, double stdv)
+{
+  THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
+
+  /* This is known as the Box-Muller method */
+  if(!_generator->normal_is_valid)
+  {
+    _generator->normal_x = __uniform__(_generator);
+    _generator->normal_y = __uniform__(_generator);
+    /* 1.0 - normal_y lies in (0,1], so the log argument is never zero */
+    _generator->normal_rho = sqrt(-2. * log(1.0-_generator->normal_y));
+    _generator->normal_is_valid = 1;
+  }
+  else
+    _generator->normal_is_valid = 0;
+
+  if(_generator->normal_is_valid)
+    return _generator->normal_rho*cos(2.*M_PI*_generator->normal_x)*stdv+mean;
+  else
+    return _generator->normal_rho*sin(2.*M_PI*_generator->normal_x)*stdv+mean;
+}
+
+/* Draws from an exponential distribution with rate `lambda` via
+   inverse-transform sampling: x = -log(1-u)/lambda with u ~ U[0,1).
+   Since u < 1, the log argument stays in (0,1] and log() never
+   receives zero.  The density p(x) = lambda*exp(-lambda*x) is only
+   defined for lambda > 0 (see THRandom.h), so validate it here just
+   as the sibling samplers (normal, geometric, bernoulli) validate
+   their parameters. */
+double THRandom_exponential(THGenerator *_generator, double lambda)
+{
+  THArgCheck(lambda > 0, 1, "lambda must be strictly positive");
+  return(-1. / lambda * log(1-__uniform__(_generator)));
+}
+
+/* Draws from a Cauchy distribution (location `median`, scale `sigma`)
+   by inverse transform: median + sigma * tan(pi*(u - 1/2)). */
+double THRandom_cauchy(THGenerator *_generator, double median, double sigma)
+{
+  return(median + sigma * tan(M_PI*(__uniform__(_generator)-0.5)));
+}
+
+/* "You'd have to be crazy to use this. Oh well."
+   (translated from the original French comment) */
+/* Draws from a log-normal distribution whose *own* mean and standard
+   deviation are `mean` and `stdv`: the underlying normal parameters
+   are recovered as mu = log(m^2/sqrt(s^2+m^2)) and
+   sigma = sqrt(log(s^2/m^2 + 1)).
+   NOTE(review): THRandom.h documents mean > 0, but only stdv is
+   checked here; a negative mean is silently squared away -- confirm. */
+double THRandom_logNormal(THGenerator *_generator, double mean, double stdv)
+{
+  double zm = mean*mean;
+  double zs = stdv*stdv;
+  THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive");
+  return(exp(THRandom_normal(_generator, log(zm/sqrt(zs + zm)), sqrt(log(zs/zm+1)) )));
+}
+
+/* Draws a geometric variate i >= 1 with P(i) = (1-p)*p^(i-1)
+   (see THRandom.h) by inverse transform on the geometric CDF;
+   requires 0 < p < 1 so both log() arguments are valid. */
+int THRandom_geometric(THGenerator *_generator, double p)
+{
+  THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1");
+  return((int)(log(1-__uniform__(_generator)) / log(p)) + 1);
+}
+
+/* Returns 1 with probability p and 0 with probability 1-p. */
+int THRandom_bernoulli(THGenerator *_generator, double p)
+{
+  THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1");
+  return(__uniform__(_generator) <= p);
+}
diff --git a/lib/TH/THRandom.h b/lib/TH/THRandom.h
new file mode 100644
index 0000000..28a14c0
--- /dev/null
+++ b/lib/TH/THRandom.h
@@ -0,0 +1,81 @@
+#ifndef TH_RANDOM_INC
+#define TH_RANDOM_INC
+
+#include "THGeneral.h"
+
+/* Mersenne Twister period parameters (MT19937: N=624, M=397).
+   NOTE(review): leading-underscore-plus-uppercase names are reserved
+   identifiers in C; kept as-is for upstream compatibility. */
+#define _MERSENNE_STATE_N 624
+#define _MERSENNE_STATE_M 397
+/* A THGenerator contains all the state required for a single random number stream */
+typedef struct THGenerator {
+  /* The initial seed. */
+  unsigned long the_initial_seed;
+  int left;  /* = 1; outputs remaining before the state array must be regenerated */
+  int seeded; /* = 0; nonzero once the generator has been seeded */
+  unsigned long next; /* index of the next state word to output */
+  unsigned long state[_MERSENNE_STATE_N]; /* the array for the state vector  */
+  /********************************/
+
+  /* For normal distribution: cached Box-Muller pair (see THRandom_normal) */
+  double normal_x;
+  double normal_y;
+  double normal_rho;
+  int normal_is_valid; /* = 0; nonzero when a cached sine variate is pending */
+} THGenerator;
+
+#define torch_Generator "torch.Generator"
+
+/* Manipulate THGenerator objects */
+TH_API THGenerator * THGenerator_new(void);
+TH_API THGenerator * THGenerator_copy(THGenerator *self, THGenerator *from);
+TH_API void THGenerator_free(THGenerator *gen);
+
+/* Checks if given generator is valid */
+TH_API int THGenerator_isValid(THGenerator *_generator);
+
+/* Initializes the random number generator from /dev/urandom (or on Windows
+platforms with the current time (granularity: seconds)) and returns the seed. */
+TH_API unsigned long THRandom_seed(THGenerator *_generator);
+
+/* Initializes the random number generator with the given long "the_seed_". */
+TH_API void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_);
+
+/* Returns the starting seed used. */
+TH_API unsigned long THRandom_initialSeed(THGenerator *_generator);
+
+/* Generates a uniform 32 bits integer. */
+TH_API unsigned long THRandom_random(THGenerator *_generator);
+
+/* Generates a uniform random number on [0,1[. */
+TH_API double THRandom_uniform(THGenerator *_generator, double a, double b);
+
+/** Generates a random number from a normal distribution.
+    (With mean #mean# and standard deviation #stdv >= 0#).
+*/
+TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv);
+
+/** Generates a random number from an exponential distribution.
+    The density is $p(x) = lambda * exp(-lambda * x)$, where
+    lambda is a positive number.
+*/
+TH_API double THRandom_exponential(THGenerator *_generator, double lambda);
+
+/** Returns a random number from a Cauchy distribution.
+    The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$
+*/
+TH_API double THRandom_cauchy(THGenerator *_generator, double median, double sigma);
+
+/** Generates a random number from a log-normal distribution.
+    (#mean > 0# is the mean of the log-normal distribution
+    and #stdv# is its standard deviation).
+*/
+TH_API double THRandom_logNormal(THGenerator *_generator, double mean, double stdv);
+
+/** Generates a random number from a geometric distribution.
+    It returns an integer #i#, where $p(i) = (1-p) * p^(i-1)$.
+    p must satisfy $0 < p < 1$.
+*/
+TH_API int THRandom_geometric(THGenerator *_generator, double p);
+
+/* Returns true with probability $p$ and false with probability $1-p$ (p > 0). */
+TH_API int THRandom_bernoulli(THGenerator *_generator, double p);
+#endif
diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c
new file mode 100644
index 0000000..d18488e
--- /dev/null
+++ b/lib/TH/THStorage.c
@@ -0,0 +1,8 @@
+#include "THAtomic.h"
+#include "THStorage.h"
+
+#include "generic/THStorage.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THStorageCopy.c"
+#include "THGenerateAllTypes.h"
diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h
new file mode 100644
index 0000000..36ed507
--- /dev/null
+++ b/lib/TH/THStorage.h
@@ -0,0 +1,20 @@
+#ifndef TH_STORAGE_INC
+#define TH_STORAGE_INC
+
+#include "THGeneral.h"
+#include "THAllocator.h"
+
+/* Expand to the type-specific storage names (e.g. THFloatStorage,
+   THFloatStorage_size) for whichever Real type is currently being
+   generated by THGenerateAllTypes.h. */
+#define THStorage        TH_CONCAT_3(TH,Real,Storage)
+#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME)
+
+/* fast access methods (direct indexing, no bounds checking) */
+#define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)])
+#define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value))
+
+#include "generic/THStorage.h"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THStorageCopy.h"
+#include "THGenerateAllTypes.h"
+
+#endif
diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c
new file mode 100644
index 0000000..b0ab0a5
--- /dev/null
+++ b/lib/TH/THTensor.c
@@ -0,0 +1,26 @@
+#include "THAtomic.h"
+#include "THTensor.h"
+#include "THVector.h"
+#include "THBlas.h"
+#include "THLapack.h"
+#include "THRandom.h"
+#include "THTensorDimApply.h"
+#include "THMath.h"
+
+#include "generic/THTensor.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorCopy.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorRandom.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorMath.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorConv.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorLapack.c"
+#include "THGenerateFloatTypes.h"
diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h
new file mode 100644
index 0000000..6eddf9c
--- /dev/null
+++ b/lib/TH/THTensor.h
@@ -0,0 +1,41 @@
+#ifndef TH_TENSOR_INC
+#define TH_TENSOR_INC
+
+#include "THStorage.h"
+#include "THTensorApply.h"
+
+/* Expand to the type-specific tensor names (e.g. THFloatTensor,
+   THFloatTensor_new) for whichever Real type is currently being
+   generated by THGenerateAllTypes.h. */
+#define THTensor          TH_CONCAT_3(TH,Real,Tensor)
+#define THTensor_(NAME)   TH_CONCAT_4(TH,Real,Tensor_,NAME)
+
+/* Fixed-size buffer used to return short human-readable descriptions
+   (e.g. tensor size strings) by value. */
+#define TH_DESC_BUFF_LEN 64
+typedef struct {
+    char str[TH_DESC_BUFF_LEN];
+} THDescBuff;
+
+/* basics */
+#include "generic/THTensor.h"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THTensorCopy.h"
+#include "THGenerateAllTypes.h"
+
+#include "THTensorMacros.h"
+
+/* random numbers */
+#include "THRandom.h"
+#include "generic/THTensorRandom.h"
+#include "THGenerateAllTypes.h"
+
+/* maths */
+#include "generic/THTensorMath.h"
+#include "THGenerateAllTypes.h"
+
+/* convolutions */
+#include "generic/THTensorConv.h"
+#include "THGenerateAllTypes.h"
+
+/* lapack support */
+#include "generic/THTensorLapack.h"
+#include "THGenerateFloatTypes.h"
+
+#endif
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
new file mode 100644
index 0000000..f525088
--- /dev/null
+++ b/lib/TH/THTensorApply.h
@@ -0,0 +1,428 @@
+#ifndef TH_TENSOR_APPLY_INC
+#define TH_TENSOR_APPLY_INC
+
+/* Element-wise iteration over three tensors with equal total element
+   counts (sizes need not match dimension-by-dimension).  For each
+   tensor the trailing contiguous run of dimensions is collapsed and
+   walked with a single stride (TENSORx_size/TENSORx_stride); the
+   per-dimension counters advance only when that run is exhausted.
+   CODE sees TENSOR1##_data / TENSOR2##_data / TENSOR3##_data pointing
+   at the current elements.  Counter arrays come from THAlloc and are
+   freed on exit; THError aborts on inconsistent element counts. */
+#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = NULL; \
+  long *TENSOR1##_counter = NULL; \
+  long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
+  TYPE2 *TENSOR2##_data = NULL; \
+  long *TENSOR2##_counter = NULL; \
+  long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
+  TYPE3 *TENSOR3##_data = NULL; \
+  long *TENSOR3##_counter = NULL; \
+  long TENSOR3##_stride = 0, TENSOR3##_size = 0, TENSOR3##_dim = 0, TENSOR3##_i, TENSOR3##_n; \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+\
+  TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
+  for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
+    TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
+\
+  TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
+  for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
+    TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
+\
+  TENSOR3##_n = (TENSOR3->nDimension ? 1 : 0); \
+  for(TENSOR3##_i = 0; TENSOR3##_i < TENSOR3->nDimension; TENSOR3##_i++) \
+    TENSOR3##_n *= TENSOR3->size[TENSOR3##_i]; \
+\
+  if(TENSOR1##_n != TENSOR2##_n || TENSOR1##_n != TENSOR3##_n) /* should we do the check in the function instead? i think so */ \
+    THError("inconsistent tensor size"); \
+\
+  if(TENSOR1->nDimension == 0) \
+    TH_TENSOR_APPLY_hasFinished = 1; \
+  else \
+  { \
+    TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
+    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
+    { \
+      if(TENSOR1->size[TENSOR1##_dim] != 1) \
+        break; \
+    } \
+    TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
+    TENSOR1##_size = 1; \
+    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
+    { \
+      if(TENSOR1->size[TENSOR1##_dim] != 1) \
+      { \
+        if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
+          TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+    TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
+    for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
+      TENSOR1##_counter[TENSOR1##_i] = 0; \
+\
+    TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
+    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
+    { \
+      if(TENSOR2->size[TENSOR2##_dim] != 1) \
+        break; \
+    } \
+    TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
+    TENSOR2##_size = 1; \
+    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
+    { \
+      if(TENSOR2->size[TENSOR2##_dim] != 1) \
+      { \
+        if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
+          TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+    TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
+    for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
+      TENSOR2##_counter[TENSOR2##_i] = 0; \
+\
+    TENSOR3##_data = TENSOR3->storage->data+TENSOR3->storageOffset; \
+    for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
+    { \
+      if(TENSOR3->size[TENSOR3##_dim] != 1) \
+        break; \
+    } \
+    TENSOR3##_stride = (TENSOR3##_dim == -1 ? 0 : TENSOR3->stride[TENSOR3##_dim]); \
+    TENSOR3##_size = 1; \
+    for(TENSOR3##_dim = TENSOR3->nDimension-1; TENSOR3##_dim >= 0; TENSOR3##_dim--) \
+    { \
+      if(TENSOR3->size[TENSOR3##_dim] != 1) \
+      { \
+        if(TENSOR3->stride[TENSOR3##_dim] == TENSOR3##_size) \
+          TENSOR3##_size *= TENSOR3->size[TENSOR3##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+    TENSOR3##_counter = (long*)THAlloc(sizeof(long)*(TENSOR3##_dim+1)); \
+    for(TENSOR3##_i = 0; TENSOR3##_i <= TENSOR3##_dim; TENSOR3##_i++) \
+      TENSOR3##_counter[TENSOR3##_i] = 0; \
+  } \
+\
+  TENSOR1##_i = 0; \
+  TENSOR2##_i = 0; \
+  TENSOR3##_i = 0; \
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 and not TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+\
+    if(TENSOR1##_i == TENSOR1##_size) \
+    { \
+      if(TENSOR1##_dim == -1) \
+         break; \
+\
+      TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
+      for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
+      { \
+        TENSOR1##_counter[TENSOR1##_i]++; \
+        TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
+\
+        if(TENSOR1##_counter[TENSOR1##_i]  == TENSOR1->size[TENSOR1##_i]) \
+        { \
+          if(TENSOR1##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
+            TENSOR1##_counter[TENSOR1##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR1##_i = 0; \
+    } \
+\
+    if(TENSOR2##_i == TENSOR2##_size) \
+    { \
+      if(TENSOR2##_dim == -1) \
+         break; \
+\
+      TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
+      for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
+      { \
+        TENSOR2##_counter[TENSOR2##_i]++; \
+        TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
+\
+        if(TENSOR2##_counter[TENSOR2##_i]  == TENSOR2->size[TENSOR2##_i]) \
+        { \
+          if(TENSOR2##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
+            TENSOR2##_counter[TENSOR2##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR2##_i = 0; \
+    } \
+\
+    if(TENSOR3##_i == TENSOR3##_size) \
+    { \
+      if(TENSOR3##_dim == -1) \
+         break; \
+\
+      TENSOR3##_data -= TENSOR3##_size*TENSOR3##_stride; \
+      for(TENSOR3##_i = TENSOR3##_dim; TENSOR3##_i >= 0; TENSOR3##_i--) \
+      { \
+        TENSOR3##_counter[TENSOR3##_i]++; \
+        TENSOR3##_data += TENSOR3->stride[TENSOR3##_i]; \
+\
+        if(TENSOR3##_counter[TENSOR3##_i]  == TENSOR3->size[TENSOR3##_i]) \
+        { \
+          if(TENSOR3##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR3##_data -= TENSOR3##_counter[TENSOR3##_i]*TENSOR3->stride[TENSOR3##_i]; \
+            TENSOR3##_counter[TENSOR3##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR3##_i = 0; \
+    } \
+  } \
+  THFree(TENSOR1##_counter); \
+  THFree(TENSOR2##_counter); \
+  THFree(TENSOR3##_counter); \
+}
+
+/* Two-tensor variant of TH_TENSOR_APPLY3: element-wise iteration over
+   two tensors with equal total element counts.  Same strategy --
+   collapse each tensor's trailing contiguous dimensions into one
+   strided run, advance per-dimension counters when a run is exhausted.
+   CODE sees TENSOR1##_data / TENSOR2##_data at the current elements. */
+#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = NULL; \
+  long *TENSOR1##_counter = NULL; \
+  long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
+  TYPE2 *TENSOR2##_data = NULL; \
+  long *TENSOR2##_counter = NULL; \
+  long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+\
+  TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
+  for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
+    TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
+\
+  TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
+  for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
+    TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
+\
+  if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
+    THError("inconsistent tensor size"); \
+\
+  if(TENSOR1->nDimension == 0) \
+    TH_TENSOR_APPLY_hasFinished = 1; \
+  else \
+  { \
+    TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
+    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
+    { \
+      if(TENSOR1->size[TENSOR1##_dim] != 1) \
+        break; \
+    } \
+    TENSOR1##_stride = (TENSOR1##_dim == -1 ? 0 : TENSOR1->stride[TENSOR1##_dim]); \
+    TENSOR1##_size = 1; \
+    for(TENSOR1##_dim = TENSOR1->nDimension-1; TENSOR1##_dim >= 0; TENSOR1##_dim--) \
+    { \
+      if(TENSOR1->size[TENSOR1##_dim] != 1) \
+      { \
+        if(TENSOR1->stride[TENSOR1##_dim] == TENSOR1##_size) \
+          TENSOR1##_size *= TENSOR1->size[TENSOR1##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+    TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(TENSOR1##_dim+1)); \
+    for(TENSOR1##_i = 0; TENSOR1##_i <= TENSOR1##_dim; TENSOR1##_i++) \
+      TENSOR1##_counter[TENSOR1##_i] = 0; \
+\
+    TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
+    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
+    { \
+      if(TENSOR2->size[TENSOR2##_dim] != 1) \
+        break; \
+    } \
+    TENSOR2##_stride = (TENSOR2##_dim == -1 ? 0 : TENSOR2->stride[TENSOR2##_dim]); \
+    TENSOR2##_size = 1; \
+    for(TENSOR2##_dim = TENSOR2->nDimension-1; TENSOR2##_dim >= 0; TENSOR2##_dim--) \
+    { \
+      if(TENSOR2->size[TENSOR2##_dim] != 1) \
+      { \
+        if(TENSOR2->stride[TENSOR2##_dim] == TENSOR2##_size) \
+          TENSOR2##_size *= TENSOR2->size[TENSOR2##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+    TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(TENSOR2##_dim+1)); \
+    for(TENSOR2##_i = 0; TENSOR2##_i <= TENSOR2##_dim; TENSOR2##_i++) \
+      TENSOR2##_counter[TENSOR2##_i] = 0; \
+  } \
+\
+  TENSOR1##_i = 0; \
+  TENSOR2##_i = 0; \
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 and not TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+\
+    if(TENSOR1##_i == TENSOR1##_size) \
+    { \
+      if(TENSOR1##_dim == -1) \
+         break; \
+\
+      TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
+      for(TENSOR1##_i = TENSOR1##_dim; TENSOR1##_i >= 0; TENSOR1##_i--) \
+      { \
+        TENSOR1##_counter[TENSOR1##_i]++; \
+        TENSOR1##_data += TENSOR1->stride[TENSOR1##_i]; \
+\
+        if(TENSOR1##_counter[TENSOR1##_i]  == TENSOR1->size[TENSOR1##_i]) \
+        { \
+          if(TENSOR1##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1->stride[TENSOR1##_i]; \
+            TENSOR1##_counter[TENSOR1##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR1##_i = 0; \
+    } \
+\
+    if(TENSOR2##_i == TENSOR2##_size) \
+    { \
+      if(TENSOR2##_dim == -1) \
+         break; \
+\
+      TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
+      for(TENSOR2##_i = TENSOR2##_dim; TENSOR2##_i >= 0; TENSOR2##_i--) \
+      { \
+        TENSOR2##_counter[TENSOR2##_i]++; \
+        TENSOR2##_data += TENSOR2->stride[TENSOR2##_i]; \
+\
+        if(TENSOR2##_counter[TENSOR2##_i]  == TENSOR2->size[TENSOR2##_i]) \
+        { \
+          if(TENSOR2##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2->stride[TENSOR2##_i]; \
+            TENSOR2##_counter[TENSOR2##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR2##_i = 0; \
+    } \
+  } \
+  THFree(TENSOR1##_counter); \
+  THFree(TENSOR2##_counter); \
+}
+
+/* Single-tensor element-wise iteration: collapse the trailing
+   contiguous dimensions into one strided run, then advance the
+   per-dimension counters each time the run is exhausted.  CODE sees
+   TENSOR##_data pointing at the current element. */
+#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \
+{ \
+  TYPE *TENSOR##_data = NULL; \
+  long *TENSOR##_counter = NULL; \
+  long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i; \
+  int TH_TENSOR_APPLY_hasFinished = 0; \
+\
+  if(TENSOR->nDimension == 0) \
+    TH_TENSOR_APPLY_hasFinished = 1; \
+  else \
+  { \
+    TENSOR##_data = TENSOR->storage->data+TENSOR->storageOffset; \
+\
+    /* what is the first stride (ignore first dims=1)? */ \
+    /* it will be used for the whole largest contiguous section */ \
+    for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
+    { \
+      if(TENSOR->size[TENSOR##_dim] != 1) \
+        break; \
+    } \
+    TENSOR##_stride = (TENSOR##_dim == -1 ? 0 : TENSOR->stride[TENSOR##_dim]); \
+\
+    /* what is the largest contiguous section? */ \
+    TENSOR##_size = 1; \
+    for(TENSOR##_dim = TENSOR->nDimension-1; TENSOR##_dim >= 0; TENSOR##_dim--) \
+    { \
+      if(TENSOR->size[TENSOR##_dim] != 1) \
+      { \
+        if(TENSOR->stride[TENSOR##_dim] == TENSOR##_size) \
+          TENSOR##_size *= TENSOR->size[TENSOR##_dim]; \
+        else \
+          break; \
+      } \
+    } \
+\
+    /* counter over found dimensions */ \
+    TENSOR##_counter = (long*)THAlloc(sizeof(long)*(TENSOR##_dim+1)); \
+    for(TENSOR##_i = 0; TENSOR##_i <= TENSOR##_dim; TENSOR##_i++) \
+      TENSOR##_counter[TENSOR##_i] = 0; \
+  } \
+\
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    for(TENSOR##_i = 0; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 and not TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+\
+    if(TENSOR##_dim == -1) \
+       break; \
+ \
+    TENSOR##_data -= TENSOR##_i*TENSOR##_stride; \
+    for(TENSOR##_i = TENSOR##_dim; TENSOR##_i >= 0; TENSOR##_i--) \
+    { \
+      TENSOR##_counter[TENSOR##_i]++; \
+      TENSOR##_data += TENSOR->stride[TENSOR##_i]; \
+\
+      if(TENSOR##_counter[TENSOR##_i]  == TENSOR->size[TENSOR##_i]) \
+      { \
+        if(TENSOR##_i == 0) \
+        { \
+          TH_TENSOR_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        else \
+        { \
+          TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR->stride[TENSOR##_i]; \
+          TENSOR##_counter[TENSOR##_i] = 0; \
+        } \
+      } \
+      else \
+        break; \
+    } \
+  } \
+  THFree(TENSOR##_counter); \
+}
+
+#endif
diff --git a/lib/TH/THTensorDimApply.h b/lib/TH/THTensorDimApply.h
new file mode 100644
index 0000000..40822aa
--- /dev/null
+++ b/lib/TH/THTensorDimApply.h
@@ -0,0 +1,232 @@
+#ifndef TH_TENSOR_DIM_APPLY_INC
+#define TH_TENSOR_DIM_APPLY_INC
+
+/* Runs CODE once per 1-D slice taken along DIMENSION, simultaneously
+   over three tensors that must agree in nDimension and in every size
+   except possibly along DIMENSION itself.  CODE sees TENSORx##_data
+   (start of the current slice), TENSORx##_stride and TENSORx##_size.
+   The shared counter array tracks position in all other dimensions. */
+#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = NULL; \
+  long TENSOR1##_stride = 0, TENSOR1##_size = 0; \
+  TYPE2 *TENSOR2##_data = NULL; \
+  long TENSOR2##_stride = 0, TENSOR2##_size = 0; \
+  TYPE3 *TENSOR3##_data = NULL; \
+  long TENSOR3##_stride = 0, TENSOR3##_size = 0; \
+  long *TH_TENSOR_DIM_APPLY_counter = NULL; \
+  int TH_TENSOR_DIM_APPLY_hasFinished = 0; \
+  int TH_TENSOR_DIM_APPLY_i; \
+\
+  if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \
+    THError("invalid dimension"); \
+  if( TENSOR1->nDimension != TENSOR2->nDimension ) \
+    THError("inconsistent tensor sizes"); \
+  if( TENSOR1->nDimension != TENSOR3->nDimension ) \
+    THError("inconsistent tensor sizes"); \
+  for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+  { \
+    if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
+      continue; \
+    if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) \
+      THError("inconsistent tensor sizes"); \
+    if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) \
+      THError("inconsistent tensor sizes"); \
+  } \
+\
+  TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \
+  for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+\
+  TENSOR1##_data = (TENSOR1)->storage->data+(TENSOR1)->storageOffset; \
+  TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \
+  TENSOR1##_size = TENSOR1->size[DIMENSION]; \
+\
+  TENSOR2##_data = (TENSOR2)->storage->data+(TENSOR2)->storageOffset; \
+  TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \
+  TENSOR2##_size = TENSOR2->size[DIMENSION]; \
+\
+  TENSOR3##_data = (TENSOR3)->storage->data+(TENSOR3)->storageOffset; \
+  TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \
+  TENSOR3##_size = TENSOR3->size[DIMENSION]; \
+\
+  while(!TH_TENSOR_DIM_APPLY_hasFinished) \
+  { \
+    CODE \
+\
+    if(TENSOR1->nDimension == 1) \
+       break; \
+ \
+    for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    { \
+      if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        continue; \
+      } \
+\
+      TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
+      TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \
+      TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \
+      TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \
+\
+      if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        else \
+        { \
+          TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+        } \
+      } \
+      else \
+        break; \
+    } \
+  } \
+  THFree(TH_TENSOR_DIM_APPLY_counter); \
+}
+
+/* Two-tensor variant of TH_TENSOR_DIM_APPLY3: runs CODE once per 1-D
+   slice along DIMENSION over two tensors agreeing in nDimension and
+   all sizes outside DIMENSION. */
+#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \
+{ \
+  TYPE1 *TENSOR1##_data = NULL; \
+  long TENSOR1##_stride = 0, TENSOR1##_size = 0; \
+  TYPE2 *TENSOR2##_data = NULL; \
+  long TENSOR2##_stride = 0, TENSOR2##_size = 0; \
+  long *TH_TENSOR_DIM_APPLY_counter = NULL; \
+  int TH_TENSOR_DIM_APPLY_hasFinished = 0; \
+  int TH_TENSOR_DIM_APPLY_i; \
+\
+  if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \
+    THError("invalid dimension"); \
+  if( TENSOR1->nDimension != TENSOR2->nDimension ) \
+    THError("inconsistent tensor sizes"); \
+  for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+  { \
+    if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
+      continue; \
+    if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) \
+      THError("inconsistent tensor sizes"); \
+  } \
+\
+  TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \
+  for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+\
+  TENSOR1##_data = (TENSOR1)->storage->data+(TENSOR1)->storageOffset; \
+  TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \
+  TENSOR1##_size = TENSOR1->size[DIMENSION]; \
+\
+  TENSOR2##_data = (TENSOR2)->storage->data+(TENSOR2)->storageOffset; \
+  TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \
+  TENSOR2##_size = TENSOR2->size[DIMENSION]; \
+\
+  while(!TH_TENSOR_DIM_APPLY_hasFinished) \
+  { \
+    CODE \
+\
+    if(TENSOR1->nDimension == 1) \
+       break; \
+ \
+    for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    { \
+      if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        continue; \
+      } \
+\
+      TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
+      TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \
+      TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \
+\
+      if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        else \
+        { \
+          TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+        } \
+      } \
+      else \
+        break; \
+    } \
+  } \
+  THFree(TH_TENSOR_DIM_APPLY_counter); \
+}
+
+/* Single-tensor variant: runs CODE once per 1-D slice taken along
+   DIMENSION, odometer-stepping the counter over all other dimensions. */
+#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \
+{ \
+  TYPE *TENSOR##_data = NULL; \
+  long TENSOR##_stride = 0, TENSOR##_size = 0; \
+  long *TH_TENSOR_DIM_APPLY_counter = NULL; \
+  int TH_TENSOR_DIM_APPLY_hasFinished = 0; \
+  int TH_TENSOR_DIM_APPLY_i; \
+\
+  if( (DIMENSION < 0) || (DIMENSION >= TENSOR->nDimension) ) \
+    THError("invalid dimension"); \
+\
+  TENSOR##_data = (TENSOR)->storage->data+(TENSOR)->storageOffset; \
+  TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \
+  TENSOR##_size = TENSOR->size[DIMENSION]; \
+  TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR->nDimension)); \
+  for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+\
+  while(!TH_TENSOR_DIM_APPLY_hasFinished) \
+  { \
+    CODE \
+\
+    if(TENSOR->nDimension == 1) \
+       break; \
+ \
+    for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->nDimension; TH_TENSOR_DIM_APPLY_i++) \
+    { \
+      if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        continue; \
+      } \
+\
+      TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \
+      TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \
+\
+      if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \
+      { \
+        if(TH_TENSOR_DIM_APPLY_i == TENSOR->nDimension-1) \
+        { \
+          TH_TENSOR_DIM_APPLY_hasFinished = 1; \
+          break; \
+        } \
+        else \
+        { \
+          TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \
+          TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \
+        } \
+      } \
+      else \
+        break; \
+    } \
+  } \
+  THFree(TH_TENSOR_DIM_APPLY_counter); \
+}
+
+#endif
diff --git a/lib/TH/THTensorMacros.h b/lib/TH/THTensorMacros.h
new file mode 100644
index 0000000..15b6766
--- /dev/null
+++ b/lib/TH/THTensorMacros.h
@@ -0,0 +1,30 @@
+#ifndef TH_TENSOR_MACROS_INC
+#define TH_TENSOR_MACROS_INC
+
+/* Fast, unchecked element accessors: they index storage->data at storageOffset + sum(x_i * stride[i]). No bounds or dimension checks are performed, and macro arguments are evaluated more than once, so avoid side effects. The fastSet* forms expand to an assignment expression. */
+
+#define THTensor_fastGet1d(self, x0)                                    \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]])
+
+#define THTensor_fastGet2d(self, x0, x1)                                \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]])
+
+#define THTensor_fastGet3d(self, x0, x1, x2)                            \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]])
+
+#define THTensor_fastGet4d(self, x0, x1, x2, x3)                        \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]+(x3)*(self)->stride[3]])
+
+#define THTensor_fastSet1d(self, x0, value)                             \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]] = value)
+
+#define THTensor_fastSet2d(self, x0, x1, value)                         \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]] = value)
+
+#define THTensor_fastSet3d(self, x0, x1, x2, value)                     \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]] = value)
+
+#define THTensor_fastSet4d(self, x0, x1, x2, x3, value)                 \
+  (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]+(x3)*(self)->stride[3]] = value)
+
+#endif
diff --git a/lib/TH/THVector.h b/lib/TH/THVector.h
new file mode 100644
index 0000000..1344e75
--- /dev/null
+++ b/lib/TH/THVector.h
@@ -0,0 +1,574 @@
+#ifndef TH_VECTOR_INC
+#define TH_VECTOR_INC
+
+#include "THGeneral.h"
+
+#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) /* expands to e.g. THFloatVector_fill when Real == Float */
+
+#if defined USE_SSE2 || defined USE_SSE3 || defined USE_SSSE3 \
+  || defined USE_SSE4_1 || defined USE_SSE4_2
+
+#ifdef USE_SSE2
+#include <emmintrin.h> /* SSE2 intrinsics */
+#endif
+
+#ifdef USE_SSE3
+#include <pmmintrin.h> /* SSE3 intrinsics */
+#endif
+
+#ifdef USE_SSSE3
+#include <tmmintrin.h> /* SSSE3 intrinsics */
+#endif
+
+#if defined (USE_SSE4_2) || defined (USE_SSE4_1)
+#include <smmintrin.h> /* SSE4.1/4.2 intrinsics */
+#endif
+
+
+#define THDoubleVector_fill(x, c, n) { /* x[0..n) = c; SSE2 unaligned stores, 8x unrolled; args evaluated repeatedly */ \
+    long i;                                     \
+    long off;                                   \
+    __m128d XMM0 = _mm_set1_pd(c); /* broadcast c into both double lanes */ \
+    for (i=0; i<=((n)-8); i+=8) {               \
+      _mm_storeu_pd((x)+i  , XMM0);             \
+      _mm_storeu_pd((x)+i+2, XMM0);             \
+      _mm_storeu_pd((x)+i+4, XMM0);             \
+      _mm_storeu_pd((x)+i+6, XMM0);             \
+    }                                           \
+    off = (n) - ((n)%8); /* scalar tail: remaining n%8 elements */ \
+    for (i=0; i<((n)%8); i++) {                 \
+      x[off+i] = c;                             \
+    }                                           \
+  }
+
+
+#define THDoubleVector_add(y, x, c, n) { /* y[i] += c * x[i] (axpy), 2 doubles per iteration */ \
+    long i = 0;                                 \
+    __m128d XMM7 = _mm_set1_pd(c);              \
+    __m128d XMM0,XMM2;                          \
+    for (; i<=((n)-2); i+=2) {                  \
+      XMM0 = _mm_loadu_pd((x)+i);               \
+      XMM2 = _mm_loadu_pd((y)+i);               \
+      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
+      XMM2 = _mm_add_pd(XMM2, XMM0);            \
+      _mm_storeu_pd((y)+i  , XMM2);             \
+    }                                           \
+    for (; i<(n); i++) { /* scalar tail: at most 1 element */ \
+      y[i] += c * x[i];                         \
+    }                                           \
+  }
+
+#define THDoubleVector_diff(z, x, y, n) { /* z[i] = x[i] - y[i], 8x unrolled */ \
+    long i;                                     \
+    for (i=0; i<=((n)-8); i+=8) {               \
+      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
+      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
+      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
+      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
+      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
+      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
+      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
+      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
+      XMM0 = _mm_sub_pd(XMM0, XMM4);            \
+      XMM1 = _mm_sub_pd(XMM1, XMM5);            \
+      XMM2 = _mm_sub_pd(XMM2, XMM6);            \
+      XMM3 = _mm_sub_pd(XMM3, XMM7);            \
+      _mm_storeu_pd((z)+i  , XMM0);             \
+      _mm_storeu_pd((z)+i+2, XMM1);             \
+      _mm_storeu_pd((z)+i+4, XMM2);             \
+      _mm_storeu_pd((z)+i+6, XMM3);             \
+    }                                           \
+    long off = (n) - ((n)%8); /* scalar tail for last n%8 elements */ \
+    for (i=0; i<((n)%8); i++) {                 \
+      z[off+i] = x[off+i] - y[off+i];           \
+    }                                           \
+  }
+
+#define THDoubleVector_scale(y, c, n) { /* y[i] *= c, 4x unrolled */ \
+    long i;                                     \
+    __m128d XMM7 = _mm_set1_pd(c);              \
+    for (i=0; i<=((n)-4); i+=4) {               \
+      __m128d XMM0 = _mm_loadu_pd((y)+i  );     \
+      __m128d XMM1 = _mm_loadu_pd((y)+i+2);     \
+      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
+      XMM1 = _mm_mul_pd(XMM1, XMM7);            \
+      _mm_storeu_pd((y)+i  , XMM0);             \
+      _mm_storeu_pd((y)+i+2, XMM1);             \
+    }                                           \
+    long off = (n) - ((n)%4); /* scalar tail for last n%4 elements */ \
+    for (i=0; i<((n)%4); i++) {                 \
+      y[off+i] *= c;                            \
+    }                                           \
+  }
+
+#define THDoubleVector_mul(y, x, n) { /* y[i] *= x[i] elementwise, 8x unrolled */ \
+    long i;                                     \
+    for (i=0; i<=((n)-8); i+=8) {               \
+      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
+      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
+      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
+      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
+      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
+      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
+      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
+      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
+      XMM4 = _mm_mul_pd(XMM4, XMM0);            \
+      XMM5 = _mm_mul_pd(XMM5, XMM1);            \
+      XMM6 = _mm_mul_pd(XMM6, XMM2);            \
+      XMM7 = _mm_mul_pd(XMM7, XMM3);            \
+      _mm_storeu_pd((y)+i  , XMM4);             \
+      _mm_storeu_pd((y)+i+2, XMM5);             \
+      _mm_storeu_pd((y)+i+4, XMM6);             \
+      _mm_storeu_pd((y)+i+6, XMM7);             \
+    }                                           \
+    long off = (n) - ((n)%8); /* scalar tail for last n%8 elements */ \
+    for (i=0; i<((n)%8); i++) {                 \
+      y[off+i] *= x[off+i];                     \
+    }                                           \
+  }
+
+#define THFloatVector_fill(x, c, n) { /* x[0..n) = c; SSE unaligned stores, 16x unrolled; args evaluated repeatedly */ \
+    long i;                                     \
+    __m128 XMM0 = _mm_set_ps1(c); /* broadcast c into all four float lanes */ \
+    long off;                                   \
+    for (i=0; i<=((n)-16); i+=16) {             \
+      _mm_storeu_ps((x)+i  ,  XMM0);            \
+      _mm_storeu_ps((x)+i+4,  XMM0);            \
+      _mm_storeu_ps((x)+i+8,  XMM0);            \
+      _mm_storeu_ps((x)+i+12, XMM0);            \
+    }                                           \
+    off = (n) - ((n)%16); /* scalar tail: remaining n%16 elements */ \
+    for (i=0; i<((n)%16); i++) {                \
+      x[off+i] = c;                             \
+    }                                           \
+  }
+
+#define THFloatVector_add(y, x, c, n) { /* y[i] += c * x[i] (axpy), 4 floats per iteration */ \
+    long i = 0;                                 \
+    __m128 XMM7 = _mm_set_ps1(c);               \
+    __m128 XMM0,XMM2;                           \
+    for (; i<=((n)-4); i+=4) {                  \
+      XMM0 = _mm_loadu_ps((x)+i);               \
+      XMM2 = _mm_loadu_ps((y)+i);               \
+      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
+      XMM2 = _mm_add_ps(XMM2, XMM0);            \
+      _mm_storeu_ps((y)+i  , XMM2);             \
+    }                                           \
+    for (; i<(n); i++) { /* scalar tail: at most 3 elements */ \
+      y[i] += c * x[i];                         \
+    }                                           \
+  }
+
+#define THFloatVector_diff(z, x, y, n) { /* z[i] = x[i] - y[i], 16x unrolled */ \
+    long i;                                     \
+    for (i=0; i<=((n)-16); i+=16) {             \
+      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
+      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
+      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
+      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
+      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
+      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
+      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
+      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
+      XMM0 = _mm_sub_ps(XMM0, XMM4);            \
+      XMM1 = _mm_sub_ps(XMM1, XMM5);            \
+      XMM2 = _mm_sub_ps(XMM2, XMM6);            \
+      XMM3 = _mm_sub_ps(XMM3, XMM7);            \
+      _mm_storeu_ps((z)+i   , XMM0);            \
+      _mm_storeu_ps((z)+i+ 4, XMM1);            \
+      _mm_storeu_ps((z)+i+ 8, XMM2);            \
+      _mm_storeu_ps((z)+i+12, XMM3);            \
+    }                                           \
+    long off = (n) - ((n)%16); /* scalar tail for last n%16 elements */ \
+    for (i=0; i<((n)%16); i++) {                \
+      z[off+i] = x[off+i] - y[off+i];           \
+    }                                           \
+  }
+
+#define THFloatVector_scale(y, c, n) { /* y[i] *= c, 8x unrolled */ \
+    long i;                                     \
+    __m128 XMM7 = _mm_set_ps1(c);               \
+    for (i=0; i<=((n)-8); i+=8) {               \
+      __m128 XMM0 = _mm_loadu_ps((y)+i  );      \
+      __m128 XMM1 = _mm_loadu_ps((y)+i+4);      \
+      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
+      XMM1 = _mm_mul_ps(XMM1, XMM7);            \
+      _mm_storeu_ps((y)+i  , XMM0);             \
+      _mm_storeu_ps((y)+i+4, XMM1);             \
+    }                                           \
+    long off = (n) - ((n)%8); /* scalar tail for last n%8 elements */ \
+    for (i=0; i<((n)%8); i++) {                 \
+      y[off+i] *= c;                            \
+    }                                           \
+  }
+
+#define THFloatVector_mul(y, x, n) { /* y[i] *= x[i] elementwise, 16x unrolled */ \
+    long i;                                     \
+    for (i=0; i<=((n)-16); i+=16) {             \
+      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
+      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
+      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
+      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
+      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
+      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
+      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
+      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
+      XMM4 = _mm_mul_ps(XMM4, XMM0);            \
+      XMM5 = _mm_mul_ps(XMM5, XMM1);            \
+      XMM6 = _mm_mul_ps(XMM6, XMM2);            \
+      XMM7 = _mm_mul_ps(XMM7, XMM3);            \
+      _mm_storeu_ps((y)+i   , XMM4);            \
+      _mm_storeu_ps((y)+i+ 4, XMM5);            \
+      _mm_storeu_ps((y)+i+ 8, XMM6);            \
+      _mm_storeu_ps((y)+i+12, XMM7);            \
+    }                                           \
+    long off = (n) - ((n)%16); /* scalar tail for last n%16 elements */ \
+    for (i=0; i<((n)%16); i++) {                \
+      y[off+i] *= x[off+i];                     \
+    }                                           \
+  }
+
+#elif defined __NEON__
+/* ARM NEON inline-assembly routines for single-precision floats; doubles fall back to the plain-C functions below */
+
+#define THFloatVector_fill(x, c, n) { /* x[0..n) = c: 8 floats per vst1.32 iteration, then single-lane tail for n%8 */ \
+        float ctemp = c;                                \
+        float * caddr = &ctemp; /* spill c to memory so the asm can ldr it */ \
+        __asm__ __volatile__ (                          \
+            "mov         r0, %0           @ \n\t"       \
+            "ldr         r4, [%1]         @ \n\t"       \
+            "vdup.32     q12, r4          @ \n\t"       \
+            "vdup.32     q13, r4          @ \n\t"       \
+            "lsrs        r4, %2, #3       @ \n\t"       \
+            "beq         3f               @ \n\t"       \
+            "1:                           @ \n\t"       \
+            "vst1.32     {d24-d27}, [r0]! @ \n\t"       \
+            "subs        r4, r4, #1       @ \n\t"       \
+            "bne         1b               @ \n\t"       \
+            "3:                           @ \n\t"       \
+            "ands        r4, %2, #7       @ \n\t"       \
+            "beq         5f               @ \n\t"       \
+            "4:                           @ \n\t"       \
+            "subs        r4, r4, #1       @ \n\t"       \
+            "vst1.32     {d24[0]}, [r0]!  @ \n\t"       \
+            "bne         4b               @ \n\t"       \
+            "5:                           @ "           \
+            :                                           \
+            :"r" (x), "r"(caddr),"r"(n)                 \
+            : "cc", "r0", "r4",  "memory",              \
+              "q12",                                    \
+              "d24", "d25", "d26", "d27"                \
+            );                                          \
+    }
+
+#define THFloatVector_diff(z, x, y, n) { /* z[i] = x[i] - y[i]: software-pipelined 8-wide loop, then single-lane tail for n%8 */ \
+        __asm__ __volatile__ (                                          \
+            "mov         r0, %2           @ \n\t"                       \
+            "mov         r1, %1           @ \n\t"                       \
+            "mov         r2, %0           @ \n\t"                       \
+            "lsrs        r4, %3, #3       @ \n\t"                       \
+            "beq         3f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "1:                           @ \n\t"                       \
+            "vsub.f32    q12, q8, q0      @ \n\t"                       \
+            "vsub.f32    q13, q9, q1      @ \n\t"                       \
+            "subs        r4, r4, #1       @ \n\t"                       \
+            "beq         2f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
+            "b           1b               @ \n\t"                       \
+            "2:                           @ \n\t"                       \
+            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
+            "3:                           @ \n\t"                       \
+            "ands        r4, %3, #7       @ \n\t"                       \
+            "beq         5f               @ \n\t"                       \
+            "4:                           @ \n\t"                       \
+            "subs        r4, r4, #1       @ \n\t"                       \
+            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
+            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
+            "vsub.f32    d24, d16, d0     @ \n\t"                       \
+            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
+            "bne         4b               @ \n\t"                       \
+            "5:                           @ "                           \
+            :                                                           \
+            :"r" (z), "r" (x),"r" (y), "r"(n)                           \
+            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
+              "q0", "q1", "q8", "q9", "q12", "q13",                     \
+              "d0", "d1", "d2", "d3",                                   \
+              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
+            );                                                          \
+    }
+
+#define THFloatVector_scale(y, c, n) { /* y[i] *= c: 32-wide pipelined main loop, then 16-wide, 8-wide and single-lane tails */ \
+        float ctemp = c;                                                \
+        float * caddr = &ctemp; /* spill c to memory so the asm can ldr it */ \
+        __asm__ __volatile__ (                                          \
+            "mov         r0, %0           @ \n\t"                       \
+            "mov         r2, r0           @ \n\t"                       \
+            "ldr         r5, [%1]         @ \n\t"                       \
+            "vdup.32     q14, r5          @ \n\t"                       \
+            "lsrs        r5, %2, #5       @ \n\t"                       \
+            "beq         3f               @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
+            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
+            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
+            "1:                           @ \n\t"                       \
+            "vmul.f32    q0, q0, q14      @ \n\t"                       \
+            "vmul.f32    q1, q1, q14      @ \n\t"                       \
+            "vmul.f32    q2, q2, q14      @ \n\t"                       \
+            "vmul.f32    q3, q3, q14      @ \n\t"                       \
+            "vmul.f32    q4, q4, q14      @ \n\t"                       \
+            "vmul.f32    q5, q5, q14      @ \n\t"                       \
+            "vmul.f32    q6, q6, q14      @ \n\t"                       \
+            "vmul.f32    q7, q7, q14      @ \n\t"                       \
+            "subs        r5, r5, #1       @ \n\t"                       \
+            "beq         2f               @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
+            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
+            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
+            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
+            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
+            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
+            "b           1b               @ \n\t"                       \
+            "2:                           @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
+            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
+            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
+            "3:                           @ \n\t"                       \
+            "lsrs        r5, %2, #4       @ \n\t"                       \
+            "ands        r5, r5, #1       @ \n\t"                       \
+            "beq         4f               @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
+            "vmul.f32    q0, q0, q14      @ \n\t"                       \
+            "vmul.f32    q1, q1, q14      @ \n\t"                       \
+            "vmul.f32    q2, q2, q14      @ \n\t"                       \
+            "vmul.f32    q3, q3, q14      @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
+            "4:                           @ \n\t"                       \
+            "lsrs        r5, %2, #3       @ \n\t"                       \
+            "ands        r5, r5, #1       @ \n\t"                       \
+            "beq         5f               @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vmul.f32    q0, q0, q14      @ \n\t"                       \
+            "vmul.f32    q1, q1, q14      @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "5:                           @ \n\t"                       \
+            "ands        r5, %2, #7       @ \n\t"                       \
+            "beq         7f               @ \n\t"                       \
+            "6:                           @ \n\t"                       \
+            "subs        r5, r5, #1       @ \n\t"                       \
+            "vld1.32     d0[0], [r0]!     @ \n\t"                       \
+            "vmul.f32    d0, d0, d28      @ \n\t"                       \
+            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
+            "bne         6b               @ \n\t"                       \
+            "7:                           @ "                           \
+            :                                                           \
+            :"r" (y), "r"(caddr),"r"(n)                                 \
+            : "cc", "r0", "r2", "r5", "memory",                         \
+              "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",    \
+              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
+              "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",     \
+              "d28", "d29"                                              \
+            );                                                          \
+    }
+
+#define THFloatVector_mul(y, x, n) { /* y[i] *= x[i]: software-pipelined 8-wide loop, then single-lane tail for n%8 */ \
+        __asm__ __volatile__ (                                          \
+            "mov         r0, %0           @ \n\t"                       \
+            "mov         r1, %1           @ \n\t"                       \
+            "mov         r2, r0           @ \n\t"                       \
+            "lsrs        r4, %2, #3       @ \n\t"                       \
+            "beq         3f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "1:                           @ \n\t"                       \
+            "vmul.f32    q12, q8, q0      @ \n\t"                       \
+            "vmul.f32    q13, q9, q1      @ \n\t"                       \
+            "subs        r4, r4, #1       @ \n\t"                       \
+            "beq         2f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
+            "b           1b               @ \n\t"                       \
+            "2:                           @ \n\t"                       \
+            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
+            "3:                           @ \n\t"                       \
+            "ands        r4, %2, #7       @ \n\t"                       \
+            "beq         5f               @ \n\t"                       \
+            "4:                           @ \n\t"                       \
+            "subs        r4, r4, #1       @ \n\t"                       \
+            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
+            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
+            "vmul.f32    q12, q8, q0      @ \n\t"                       \
+            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
+            "bne         4b               @ \n\t"                       \
+            "5:                           @ "                           \
+            :                                                           \
+            :"r" (y),"r" (x),"r"(n)                                     \
+            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
+              "q0", "q1", "q8", "q9", "q12", "q13",                     \
+              "d0", "d1", "d2", "d3",                                   \
+              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
+            );                                                          \
+    }
+#define THFloatVector_add(y, x, c, n) { /* y[i] += c * x[i] (vmla): 16-wide pipelined main loop, then 8-wide and single-lane tails */ \
+        float ctemp = c;                                                \
+        float * caddr = &ctemp; /* spill c to memory so the asm can ldr it */ \
+        __asm__ __volatile__ (                                          \
+            "mov         r0, %0           @ \n\t"                       \
+            "mov         r1, %1           @ \n\t"                       \
+            "mov         r2, r0           @ \n\t"                       \
+            "ldr         r5, [%2]         @ \n\t"                       \
+            "vdup.32     q14, r5          @ \n\t"                       \
+            "lsrs        r5, %3, #4       @ \n\t"                       \
+            "beq         3f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
+            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
+            "1:                           @ \n\t"                       \
+            "vmla.f32    q0, q8, q14      @ \n\t"                       \
+            "vmla.f32    q1, q9, q14      @ \n\t"                       \
+            "vmla.f32    q2, q10, q14     @ \n\t"                       \
+            "vmla.f32    q3, q11, q14     @ \n\t"                       \
+            "subs        r5, r5, #1       @ \n\t"                       \
+            "beq         2f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
+            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
+            "b           1b               @ \n\t"                       \
+            "2:                           @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
+            "3:                           @ \n\t"                       \
+            "lsrs        r5, %3, #3       @ \n\t"                       \
+            "ands        r5, #1           @ \n\t"                       \
+            "beq         4f               @ \n\t"                       \
+            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
+            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
+            "vmla.f32    q0, q8, q14      @ \n\t"                       \
+            "vmla.f32    q1, q9, q14      @ \n\t"                       \
+            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
+            "4:                           @ \n\t"                       \
+            "ands        r5, %3, #7       @ \n\t"                       \
+            "beq         6f               @ \n\t"                       \
+            "5:                           @ \n\t"                       \
+            "subs        r5, r5, #1       @ \n\t"                       \
+            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
+            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
+            "vmla.f32    d0, d16, d28     @ \n\t"                       \
+            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
+            "bne         5b               @ \n\t"                       \
+            "6:                           @ "                           \
+            :                                                           \
+            :"r" (y),"r" (x), "r"(caddr),"r"(n)                         \
+            : "cc", "r0", "r1", "r2", "r5", "memory",                   \
+              "q0", "q1", "q2", "q3", "q14",                            \
+              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
+              "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" \
+            );                                                          \
+    }
+
+static inline void THDoubleVector_fill(double *x, const double c, const long n) { /* x[0..n) = c; plain-C double fallback (the NEON branch only provides float asm) */
+  long i = 0;
+
+  for(; i < n-4; i += 4) /* 4x unrolled; bound is i < n-4 (not <=), so the scalar tail may cover up to 4 elements */
+  {
+    x[i] = c;
+    x[i+1] = c;
+    x[i+2] = c;
+    x[i+3] = c;
+  }
+
+  for(; i < n; i++) /* scalar tail */
+    x[i] = c;
+}
+
+static inline void THDoubleVector_add(double *y, const double *x, const double c, const long n) /* y[i] += c * x[i] (axpy), plain-C fallback */
+{
+  long i = 0;
+
+  for(;i < n-4; i += 4) /* 4x unrolled main loop */
+  {
+    y[i] += c * x[i];
+    y[i+1] += c * x[i+1];
+    y[i+2] += c * x[i+2];
+    y[i+3] += c * x[i+3];
+  }
+
+  for(; i < n; i++) /* scalar tail */
+    y[i] += c * x[i];
+}
+
+static inline void THDoubleVector_diff(double *z, const double *x, const double *y, const long n) /* z[i] = x[i] - y[i], plain-C fallback */
+{
+  long i = 0;
+
+  for(; i < n-4; i += 4) /* 4x unrolled main loop */
+  {
+    z[i] = x[i] - y[i];
+    z[i+1] = x[i+1] - y[i+1];
+    z[i+2] = x[i+2] - y[i+2];
+    z[i+3] = x[i+3] - y[i+3];
+  }
+
+  for(; i < n; i++) /* scalar tail */
+    z[i] = x[i] - y[i];
+}
+
+static inline void THDoubleVector_scale(double *y, const double c, const long n) /* y[i] *= c, plain-C fallback */
+{
+  long i = 0;
+
+  for(; i < n-4; i +=4) /* 4x unrolled main loop */
+  {
+    y[i] *= c;
+    y[i+1] *= c;
+    y[i+2] *= c;
+    y[i+3] *= c;
+  }
+
+  for(; i < n; i++) /* scalar tail */
+    y[i] *= c;
+}
+
+static inline void THDoubleVector_mul(double *y, const double *x, const long n) /* y[i] *= x[i] elementwise, plain-C fallback */
+{
+  long i = 0;
+
+  for(; i < n-4; i += 4) /* 4x unrolled main loop */
+  {
+    y[i] *= x[i];
+    y[i+1] *= x[i+1];
+    y[i+2] *= x[i+2];
+    y[i+3] *= x[i+3];
+  }
+
+  for(; i < n; i++) /* scalar tail */
+    y[i] *= x[i];
+}
+
+
+#else
+
+/* If neither SSE nor NEON support is defined, generate plain C operators for the float types */
+#include "generic/THVector.c"
+#include "THGenerateFloatTypes.h"
+
+#endif
+
+/* For non-float types, generate plain C operators */
+#include "generic/THVector.c"
+#include "THGenerateIntTypes.h"
+
+#endif
diff --git a/lib/TH/cmake/FindARM.cmake b/lib/TH/cmake/FindARM.cmake
new file mode 100644
index 0000000..cf1f8fd
--- /dev/null
+++ b/lib/TH/cmake/FindARM.cmake
@@ -0,0 +1,67 @@
+# Check if the processor is an ARM and if Neon instruction are available on the machine where
+# the project is compiled.
+
+IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
+   EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)
+
+   # NEON instructions are available on most modern ARM processors
+   STRING(REGEX REPLACE "^.*(neon).*$" "\\1" NEON_THERE ${CPUINFO})
+   STRING(COMPARE EQUAL "neon" "${NEON_THERE}" NEON_TRUE)
+   IF (NEON_TRUE)
+      set(NEON_FOUND true CACHE BOOL "NEON available on host")
+   ELSE (NEON_TRUE)
+      set(NEON_FOUND false CACHE BOOL "NEON available on host")
+   ENDIF (NEON_TRUE)
+
+   #Find the processor type (for now OMAP3 or OMAP4)
+   STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
+   STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)
+   IF (OMAP3_TRUE)
+      set(CORTEXA8_FOUND true CACHE BOOL "OMAP3 available on host")
+   ELSE (OMAP3_TRUE)
+      set(CORTEXA8_FOUND false CACHE BOOL "OMAP3 available on host")
+   ENDIF (OMAP3_TRUE)
+
+   #Find the processor type (for now OMAP3 or OMAP4)
+   STRING(REGEX REPLACE "^.*(OMAP4).*$" "\\1" OMAP4_THERE ${CPUINFO})
+   STRING(COMPARE EQUAL "OMAP4" "${OMAP4_THERE}" OMAP4_TRUE)
+   IF (OMAP4_TRUE)
+      set(CORTEXA9_FOUND true CACHE BOOL "OMAP4 available on host")
+   ELSE (OMAP4_TRUE)
+      set(CORTEXA9_FOUND false CACHE BOOL "OMAP4 available on host")
+   ENDIF (OMAP4_TRUE)
+
+ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
+   EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE
+      CPUINFO)
+
+   # NEON instructions are available on most modern ARM processors
+   STRING(REGEX REPLACE "^.*(neon).*$" "\\1" NEON_THERE ${CPUINFO})
+   STRING(COMPARE EQUAL "neon" "${NEON_THERE}" NEON_TRUE)
+   IF (NEON_TRUE)
+      set(NEON_FOUND true CACHE BOOL "NEON available on host")
+   ELSE (NEON_TRUE)
+      set(NEON_FOUND false CACHE BOOL "NEON available on host")
+   ENDIF (NEON_TRUE)
+
+ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
+   # TODO
+   set(CORTEXA8_FOUND   false CACHE BOOL "OMAP3 not available on host")
+   set(CORTEXA9_FOUND   false CACHE BOOL "OMAP4 not available on host")
+   set(NEON_FOUND   false CACHE BOOL "NEON not available on host")
+ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux")
+   set(CORTEXA8_FOUND   false CACHE BOOL "OMAP3 not available on host")
+   set(CORTEXA9_FOUND   false CACHE BOOL "OMAP4 not available on host")
+   set(NEON_FOUND   false CACHE BOOL "NEON not available on host")
+ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux")
+
+if(NOT NEON_FOUND)
+      MESSAGE(STATUS "Could not find hardware support for NEON on this machine.")
+endif(NOT NEON_FOUND)
+if(NOT CORTEXA8_FOUND)
+      MESSAGE(STATUS "No OMAP3 processor on this machine.")
+endif(NOT CORTEXA8_FOUND)
+if(NOT CORTEXA9_FOUND)
+      MESSAGE(STATUS "No OMAP4 processor on this machine.")
+endif(NOT CORTEXA9_FOUND)
+mark_as_advanced(NEON_FOUND)
diff --git a/lib/TH/cmake/FindBLAS.cmake b/lib/TH/cmake/FindBLAS.cmake
new file mode 100644
index 0000000..2188fc7
--- /dev/null
+++ b/lib/TH/cmake/FindBLAS.cmake
@@ -0,0 +1,299 @@
+# - Find BLAS library
+# This module finds an installed fortran library that implements the BLAS 
+# linear-algebra interface (see http://www.netlib.org/blas/).  
+# The list of libraries searched for is taken
+# from the autoconf macro file, acx_blas.m4 (distributed at
+# http://ac-archive.sourceforge.net/ac-archive/acx_blas.html).
+#
+# This module sets the following variables:
+#  BLAS_FOUND - set to true if a library implementing the BLAS interface is found.
+#  BLAS_INFO - name of the detected BLAS library.
+#  BLAS_F2C - set to true if following the f2c return convention
+#  BLAS_LIBRARIES - list of libraries to link against to use BLAS
+#  BLAS_INCLUDE_DIR - include directory
+
+# Do nothing if BLAS was found before
+IF(NOT BLAS_FOUND)
+
+SET(BLAS_LIBRARIES)
+SET(BLAS_INCLUDE_DIR)
+SET(BLAS_INFO)
+SET(BLAS_F2C)
+
+SET(WITH_BLAS "" CACHE STRING "Blas type [mkl/open/goto/acml/atlas/accelerate/veclib/generic]")
+
+# Old FindBlas
+INCLUDE(CheckCSourceRuns)
+INCLUDE(CheckFortranFunctionExists)
+
+MACRO(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list)
+  # This macro checks for the existence of the combination of fortran libraries
+  # given by _list.  If the combination is found, this macro checks (using the 
+  # Check_Fortran_Function_Exists macro) whether we can link against that library
+  # combination using the name of a routine given by _name using the linker
+  # flags given by _flags.  If the combination of libraries is found and passes
+  # the link test, LIBRARIES is set to the list of complete library paths that
+  # have been found.  Otherwise, LIBRARIES is set to NOTFOUND.
+  # N.B. _prefix is the prefix applied to the names of all cached variables that
+  # are generated internally and marked advanced by this macro.
+  
+  # Build a " - "-separated display string of the candidates, purely for
+  # the status message below.
+  set(__list)
+  foreach(_elem ${_list})
+    if(__list)
+      set(__list "${__list} - ${_elem}")
+    else(__list)
+      set(__list "${_elem}")
+    endif(__list)
+  endforeach(_elem)
+  message(STATUS "Checking for [${__list}]")
+
+  # Locate each candidate library in turn; _libraries_work becomes
+  # empty (false) as soon as one of them cannot be found.
+  set(_libraries_work TRUE)
+  set(${LIBRARIES})
+  set(_combined_name)
+  foreach(_library ${_list})
+    set(_combined_name ${_combined_name}_${_library})
+    if(_libraries_work)
+      # Search the standard system paths plus the platform's
+      # dynamic-loader search path.
+      if ( WIN32 )
+        find_library(${_prefix}_${_library}_LIBRARY
+          NAMES ${_library}
+          PATHS ENV LIB 
+          PATHS ENV PATH )
+      endif ( WIN32 )
+      if ( APPLE ) 
+        find_library(${_prefix}_${_library}_LIBRARY
+          NAMES ${_library}
+          PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 
+          ENV DYLD_LIBRARY_PATH )
+      else ( APPLE )
+        find_library(${_prefix}_${_library}_LIBRARY
+          NAMES ${_library}
+          PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 
+          ENV LD_LIBRARY_PATH )
+      endif( APPLE )
+      mark_as_advanced(${_prefix}_${_library}_LIBRARY)
+      set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
+      set(_libraries_work ${${_prefix}_${_library}_LIBRARY})
+      MESSAGE(STATUS "  Library ${_library}: ${${_prefix}_${_library}_LIBRARY}")
+    endif(_libraries_work)
+  endforeach(_library ${_list})
+  if(_libraries_work)
+    # Test this combination of libraries.
+    set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
+    if (CMAKE_Fortran_COMPILER_WORKS)
+      check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS)
+    else (CMAKE_Fortran_COMPILER_WORKS)
+      # No working Fortran compiler: probe the f77-mangled C symbol instead.
+      check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS)
+    endif (CMAKE_Fortran_COMPILER_WORKS)
+    set(CMAKE_REQUIRED_LIBRARIES)
+    mark_as_advanced(${_prefix}${_combined_name}_WORKS)
+    set(_libraries_work ${${_prefix}${_combined_name}_WORKS})
+  endif(_libraries_work)
+  if(NOT _libraries_work)
+    set(${LIBRARIES} NOTFOUND)
+  endif(NOT _libraries_work)
+endmacro(Check_Fortran_Libraries)
+
+# Intel MKL?
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "mkl")))
+  FIND_PACKAGE(MKL)
+  IF(MKL_FOUND)
+    SET(BLAS_INFO "mkl")
+    SET(BLAS_LIBRARIES ${MKL_LIBRARIES})
+    SET(BLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR})
+    SET(BLAS_VERSION ${MKL_VERSION})
+  ENDIF(MKL_FOUND)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "openblas")
+  if(BLAS_LIBRARIES)
+    set(BLAS_INFO "open")
+  endif(BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "openblas;pthread")
+  if(BLAS_LIBRARIES)
+    set(BLAS_INFO "open")
+  endif(BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES) AND (WIN32)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "libopenblas")
+  if(BLAS_LIBRARIES)
+    set(BLAS_INFO "open")
+  endif(BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "goto")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "goto2;gfortran")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "goto")
+  endif (BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "goto")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "goto2;gfortran;pthread")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "goto")
+  endif (BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "acml")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "acml;gfortran")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "acml")
+  endif (BLAS_LIBRARIES)
+endif()
+
+# Apple BLAS library?
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "accelerate")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "Accelerate")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "accelerate")
+    set(BLAS_IS_ACCELERATE 1)
+  endif (BLAS_LIBRARIES)
+endif()
+
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "veclib")))
+  check_fortran_libraries(
+    BLAS_LIBRARIES
+    BLAS
+    sgemm
+    ""
+    "vecLib")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "veclib")
+  endif (BLAS_LIBRARIES)
+endif()
+
+# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/)
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "atlas")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "ptf77blas;atlas;gfortran")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "atlas")
+  endif (BLAS_LIBRARIES)
+endif()
+
+# Generic BLAS library?
+if((NOT BLAS_LIBRARIES)
+    AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "generic")))
+  check_fortran_libraries(
+  BLAS_LIBRARIES
+  BLAS
+  sgemm
+  ""
+  "blas")
+  if (BLAS_LIBRARIES)
+    set(BLAS_INFO "generic")
+  endif (BLAS_LIBRARIES)
+endif()
+
+# Determine if blas was compiled with the f2c conventions
+IF (BLAS_LIBRARIES)
+  SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+  CHECK_C_SOURCE_RUNS("
+#include <stdlib.h>
+#include <stdio.h>
+float x[4] = { 1, 2, 3, 4 };
+float y[4] = { .1, .01, .001, .0001 };
+int four = 4;
+int one = 1;
+extern double sdot_();
+int main() {
+  int i;
+  double r = sdot_(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
+}" BLAS_F2C_DOUBLE_WORKS )
+  CHECK_C_SOURCE_RUNS("
+#include <stdlib.h>
+#include <stdio.h>
+float x[4] = { 1, 2, 3, 4 };
+float y[4] = { .1, .01, .001, .0001 };
+int four = 4;
+int one = 1;
+extern float sdot_();
+int main() {
+  int i;
+  double r = sdot_(&four, x, &one, y, &one);
+  exit((float)r != (float).1234);
+}" BLAS_F2C_FLOAT_WORKS )
+  IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
+    MESSAGE(STATUS "This BLAS uses the F2C return conventions")
+    SET(BLAS_F2C TRUE)
+  ELSE (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
+    SET(BLAS_F2C FALSE)
+  ENDIF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS)
+ENDIF(BLAS_LIBRARIES)
+
+# epilogue
+
+if(BLAS_LIBRARIES)
+  set(BLAS_FOUND TRUE)
+else(BLAS_LIBRARIES)
+  set(BLAS_FOUND FALSE)
+endif(BLAS_LIBRARIES)
+
+IF (NOT BLAS_FOUND AND BLAS_FIND_REQUIRED)
+  message(FATAL_ERROR "Cannot find a library with BLAS API. Please specify library location.")
+ENDIF (NOT BLAS_FOUND AND BLAS_FIND_REQUIRED)
+IF(NOT BLAS_FIND_QUIETLY)
+  IF(BLAS_FOUND)
+    MESSAGE(STATUS "Found a library with BLAS API (${BLAS_INFO}).")
+  ELSE(BLAS_FOUND)
+    MESSAGE(STATUS "Cannot find a library with BLAS API. Not using BLAS.")
+  ENDIF(BLAS_FOUND)
+ENDIF(NOT BLAS_FIND_QUIETLY)
+
+# Do nothing if BLAS was found before
+ENDIF(NOT BLAS_FOUND)
diff --git a/lib/TH/cmake/FindLAPACK.cmake b/lib/TH/cmake/FindLAPACK.cmake
new file mode 100644
index 0000000..9eca073
--- /dev/null
+++ b/lib/TH/cmake/FindLAPACK.cmake
@@ -0,0 +1,190 @@
+# - Find LAPACK library
+# This module finds an installed fortran library that implements the LAPACK
+# linear-algebra interface (see http://www.netlib.org/lapack/).
+#
+# The approach follows that taken for the autoconf macro file, acx_lapack.m4
+# (distributed at http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html).
+#
+# This module sets the following variables:
+#  LAPACK_FOUND - set to true if a library implementing the LAPACK interface is found
+#  LAPACK_LIBRARIES - list of libraries (using full path name) for LAPACK
+
+# Note: I do not think it is a good idea to mixup different BLAS/LAPACK versions
+# Hence, this script wants to find a Lapack library matching your Blas library
+
+# Do nothing if LAPACK was found before
+IF(NOT LAPACK_FOUND)
+
+SET(LAPACK_LIBRARIES)
+SET(LAPACK_INFO)
+
+IF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
+  FIND_PACKAGE(BLAS)
+ELSE(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
+  FIND_PACKAGE(BLAS REQUIRED)
+ENDIF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED)
+
+# Old search lapack script
+include(CheckFortranFunctionExists)
+
+macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas)
+  # This macro checks for the existence of the combination of fortran libraries
+  # given by _list.  If the combination is found, this macro checks (using the
+  # Check_Fortran_Function_Exists macro) whether we can link against that library
+  # combination using the name of a routine given by _name using the linker
+  # flags given by _flags.  If the combination of libraries is found and passes
+  # the link test, LIBRARIES is set to the list of complete library paths that
+  # have been found.  Otherwise, LIBRARIES is set to FALSE.
+  # N.B. _prefix is the prefix applied to the names of all cached variables that
+  # are generated internally and marked advanced by this macro.
+  # Locate each candidate library in turn; _libraries_work becomes
+  # empty (false) as soon as one of them cannot be found.
+  set(_libraries_work TRUE)
+  set(${LIBRARIES})
+  set(_combined_name)
+  foreach(_library ${_list})
+    set(_combined_name ${_combined_name}_${_library})
+    if(_libraries_work)
+      if (WIN32)
+        find_library(${_prefix}_${_library}_LIBRARY
+          NAMES ${_library} PATHS ENV LIB PATHS ENV PATH)
+      else (WIN32)
+        if(APPLE)
+          find_library(${_prefix}_${_library}_LIBRARY
+            NAMES ${_library}
+            PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64
+            ENV DYLD_LIBRARY_PATH)
+        else(APPLE)
+          find_library(${_prefix}_${_library}_LIBRARY
+            NAMES ${_library}
+            PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64
+            ENV LD_LIBRARY_PATH)
+        endif(APPLE)
+      endif(WIN32)
+      mark_as_advanced(${_prefix}_${_library}_LIBRARY)
+      set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
+      set(_libraries_work ${${_prefix}_${_library}_LIBRARY})
+    endif(_libraries_work)
+  endforeach(_library ${_list})
+  if(_libraries_work)
+    # Test this combination of libraries, linked together with the BLAS
+    # libraries given by _blas.
+    set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas})
+    if (CMAKE_Fortran_COMPILER_WORKS)
+      check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS)
+    else (CMAKE_Fortran_COMPILER_WORKS)
+      # No working Fortran compiler: probe the f77-mangled C symbol instead.
+      check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS)
+    endif (CMAKE_Fortran_COMPILER_WORKS)
+    set(CMAKE_REQUIRED_LIBRARIES)
+    mark_as_advanced(${_prefix}${_combined_name}_WORKS)
+    set(_libraries_work ${${_prefix}${_combined_name}_WORKS})
+  endif(_libraries_work)
+  if(NOT _libraries_work)
+    set(${LIBRARIES} FALSE)
+  endif(NOT _libraries_work)
+endmacro(Check_Lapack_Libraries)
+
+
+if(BLAS_FOUND)
+
+  # Intel MKL
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "mkl"))
+    IF(MKL_LAPACK_LIBRARIES)
+      SET(LAPACK_LIBRARIES ${MKL_LAPACK_LIBRARIES} ${MKL_LIBRARIES})
+    ELSE(MKL_LAPACK_LIBRARIES)
+      SET(LAPACK_LIBRARIES ${MKL_LIBRARIES})
+    ENDIF(MKL_LAPACK_LIBRARIES)
+    SET(LAPACK_INCLUDE_DIR ${MKL_INCLUDE_DIR})
+    SET(LAPACK_INFO "mkl")
+  ENDIF()
+
+  # OpenBlas
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "open"))
+    SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+    check_function_exists("cheev_" OPEN_LAPACK_WORKS)
+    if(OPEN_LAPACK_WORKS)
+      SET(LAPACK_INFO "open")
+    else()
+      message(STATUS "It seems OpenBlas has not been compiled with Lapack support")
+    endif()
+  endif()
+
+  # GotoBlas
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "goto"))
+    SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+    check_function_exists("cheev_" GOTO_LAPACK_WORKS)
+    if(GOTO_LAPACK_WORKS)
+      SET(LAPACK_INFO "goto")
+    else()
+      message(STATUS "It seems GotoBlas has not been compiled with Lapack support")
+    endif()
+  endif()
+
+  # ACML
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "acml"))
+    SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+    check_function_exists("cheev_" ACML_LAPACK_WORKS)
+    if(ACML_LAPACK_WORKS)
+      SET(LAPACK_INFO "acml")
+    else()
+      message(STATUS "Strangely, this ACML library does not support Lapack?!")
+    endif()
+  endif()
+
+  # Accelerate
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "accelerate"))
+    SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+    check_function_exists("cheev_" ACCELERATE_LAPACK_WORKS)
+    if(ACCELERATE_LAPACK_WORKS)
+      SET(LAPACK_INFO "accelerate")
+    else()
+      message(STATUS "Strangely, this Accelerate library does not support Lapack?!")
+    endif()
+  endif()
+
+  # vecLib
+  IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "veclib"))
+    SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+    check_function_exists("cheev_" VECLIB_LAPACK_WORKS)
+    if(VECLIB_LAPACK_WORKS)
+      SET(LAPACK_INFO "veclib")
+    else()
+      message(STATUS "Strangely, this vecLib library does not support Lapack?!")
+    endif()
+  endif()
+
+  # Generic LAPACK library?
+  IF((NOT LAPACK_INFO) AND ((BLAS_INFO STREQUAL "generic") OR (BLAS_INFO STREQUAL "open")))
+    check_lapack_libraries(
+      LAPACK_LIBRARIES
+      LAPACK
+      cheev
+      ""
+      "lapack"
+      "${BLAS_LIBRARIES}"
+      )
+    if(LAPACK_LIBRARIES)
+      SET(LAPACK_INFO "generic")
+    endif(LAPACK_LIBRARIES)
+  endif()
+
+else(BLAS_FOUND)
+  message(STATUS "LAPACK requires BLAS")
+endif(BLAS_FOUND)
+
+if(LAPACK_INFO)
+  set(LAPACK_FOUND TRUE)
+else(LAPACK_INFO)
+  set(LAPACK_FOUND FALSE)
+endif(LAPACK_INFO)
+
+IF (NOT LAPACK_FOUND AND LAPACK_FIND_REQUIRED)
+  message(FATAL_ERROR "Cannot find a library with LAPACK API. Please specify library location.")
+ENDIF (NOT LAPACK_FOUND AND LAPACK_FIND_REQUIRED)
+IF(NOT LAPACK_FIND_QUIETLY)
+  IF(LAPACK_FOUND)
+    MESSAGE(STATUS "Found a library with LAPACK API. (${LAPACK_INFO})")
+  ELSE(LAPACK_FOUND)
+    MESSAGE(STATUS "Cannot find a library with LAPACK API. Not using LAPACK.")
+  ENDIF(LAPACK_FOUND)
+ENDIF(NOT LAPACK_FIND_QUIETLY)
+
+# Do nothing if LAPACK was found before
+ENDIF(NOT LAPACK_FOUND)
diff --git a/lib/TH/cmake/FindMKL.cmake b/lib/TH/cmake/FindMKL.cmake
new file mode 100644
index 0000000..8dc3cde
--- /dev/null
+++ b/lib/TH/cmake/FindMKL.cmake
@@ -0,0 +1,265 @@
+# - Find INTEL MKL library
+#
+# This module finds the Intel Mkl libraries.
+#
+# This module sets the following variables:
+#  MKL_FOUND - set to true if a library implementing the CBLAS interface is found
+#  MKL_VERSION - best guess
+#  MKL_INCLUDE_DIR - path to include dir.
+#  MKL_LIBRARIES - list of libraries for base mkl
+#  MKL_LAPACK_LIBRARIES - list of libraries to add for lapack
+#  MKL_SCALAPACK_LIBRARIES - list of libraries to add for scalapack
+#  MKL_SOLVER_LIBRARIES - list of libraries to add for the solvers
+#  MKL_CDFT_LIBRARIES - list of libraries to add for the solvers
+
+
+# Do nothing if MKL_FOUND was set before!
+IF (NOT MKL_FOUND)
+
+SET(MKL_VERSION)
+SET(MKL_INCLUDE_DIR)
+SET(MKL_LIBRARIES)
+SET(MKL_LAPACK_LIBRARIES)
+SET(MKL_SCALAPACK_LIBRARIES)
+SET(MKL_SOLVER_LIBRARIES)
+SET(MKL_CDFT_LIBRARIES)
+
+# Includes
+INCLUDE(CheckTypeSize)
+INCLUDE(CheckFunctionExists)
+
+# Intel Compiler Suite
+SET(INTEL_COMPILER_DIR CACHE STRING
+  "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
+SET(INTEL_MKL_DIR CACHE STRING
+  "Root directory of the Intel MKL (standalone)")
+SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL
+  "Force using the sequential (non threaded) libraries")
+
+# Checks
+CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
+IF ("${SIZE_OF_VOIDP}" EQUAL 8)
+  SET(mklvers "em64t")
+  SET(iccvers "intel64")
+  SET(mkl64s "_lp64")
+ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
+  SET(mklvers "32")
+  SET(iccvers "ia32")
+  SET(mkl64s)
+ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8)
+IF(CMAKE_COMPILER_IS_GNUCC)
+  SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread")
+  SET(mklifaces  "gf" "intel")
+  SET(mklrtls)
+ELSE(CMAKE_COMPILER_IS_GNUCC)
+  SET(mklthreads "mkl_intel_thread")
+  SET(mklifaces  "intel")
+  SET(mklrtls "iomp5" "guide")
+ENDIF (CMAKE_COMPILER_IS_GNUCC)
+
+# Kernel libraries dynamically loaded
+SET(mklkerlibs "mc" "mc3" "nc" "p4n" "p4m" "p4m3" "p4p" "def")
+SET(mklseq)
+
+
+
+# Paths
+SET(saved_CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH})
+SET(saved_CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH})
+IF (INTEL_COMPILER_DIR)
+  # TODO: diagnostic if dir does not exist
+  SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
+    "${INTEL_COMPILER_DIR}/lib/${iccvers}")
+  IF (NOT INTEL_MKL_DIR)
+    SET(INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl")
+  ENDIF (NOT INTEL_MKL_DIR)
+ENDIF (INTEL_COMPILER_DIR)
+IF (INTEL_MKL_DIR)
+  # TODO: diagnostic if dir does not exist
+  SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}
+    "${INTEL_MKL_DIR}/include")
+  SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
+    "${INTEL_MKL_DIR}/lib/${mklvers}")
+ENDIF (INTEL_MKL_DIR)
+
+# Try linking multiple libs
+MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags)
+  # This macro checks for the existence of the combination of libraries given by _list.
+  # If the combination is found, this macro checks whether we can link against that library
+  # combination using the name of a routine given by _name using the linker
+  # flags given by _flags.  If the combination of libraries is found and passes
+  # the link test, LIBRARIES is set to the list of complete library paths that
+  # have been found.  Otherwise, LIBRARIES is set to FALSE.
+  # N.B. _prefix is the prefix applied to the names of all cached variables that
+  # are generated internally and marked advanced by this macro.
+  SET(_prefix "${LIBRARIES}")
+  # start checking
+  SET(_libraries_work TRUE)
+  SET(${LIBRARIES})
+  SET(_combined_name)
+  SET(_paths)
+  # Build a " - "-separated display string of the candidates, purely for
+  # the status message below.
+  set(__list)
+  foreach(_elem ${_list})
+    if(__list)
+      set(__list "${__list} - ${_elem}")
+    else(__list)
+      set(__list "${_elem}")
+    endif(__list)
+  endforeach(_elem)
+  message(STATUS "Checking for [${__list}]")
+  # Locate each candidate library; _libraries_work becomes empty (false)
+  # as soon as one of them cannot be found.
+  FOREACH(_library ${_list})
+    SET(_combined_name ${_combined_name}_${_library})
+    IF(_libraries_work)      
+      FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library})
+      MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY)
+      SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
+      SET(_libraries_work ${${_prefix}_${_library}_LIBRARY})
+      IF(${_prefix}_${_library}_LIBRARY)
+        MESSAGE(STATUS "  Library ${_library}: ${${_prefix}_${_library}_LIBRARY}")
+      ELSE(${_prefix}_${_library}_LIBRARY)
+        MESSAGE(STATUS "  Library ${_library}: not found")
+      ENDIF(${_prefix}_${_library}_LIBRARY)
+    ENDIF(_libraries_work)
+  ENDFOREACH(_library ${_list})
+  # Test this combination of libraries.
+  IF(_libraries_work)
+    SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
+    CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS)
+    SET(CMAKE_REQUIRED_LIBRARIES)
+    MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS)
+    SET(_libraries_work ${${_prefix}${_combined_name}_WORKS})
+  ENDIF(_libraries_work)
+  # Fin: on failure clear LIBRARIES; the deliberately empty IF branch
+  # leaves the success case untouched.
+  IF(_libraries_work)
+  ELSE (_libraries_work)
+    SET(${LIBRARIES})
+    MARK_AS_ADVANCED(${LIBRARIES})
+  ENDIF(_libraries_work)
+ENDMACRO(CHECK_ALL_LIBRARIES)
+
+if(WIN32)
+  set(mkl_m "")
+else(WIN32)
+  set(mkl_m "m")
+endif(WIN32)
+
+
+# Check for version 10/11
+IF (NOT MKL_LIBRARIES)
+  SET(MKL_VERSION 1011)
+ENDIF (NOT MKL_LIBRARIES)
+FOREACH(mklrtl ${mklrtls} "")
+  FOREACH(mkliface ${mklifaces})
+    FOREACH(mkl64 ${mkl64s} "")
+      FOREACH(mklthread ${mklthreads})
+        IF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)
+          CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
+        ENDIF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL)          
+      ENDFOREACH(mklthread)
+    ENDFOREACH(mkl64)
+  ENDFOREACH(mkliface)
+ENDFOREACH(mklrtl)
+FOREACH(mklrtl ${mklrtls} "")
+  FOREACH(mkliface ${mklifaces})
+    FOREACH(mkl64 ${mkl64s} "")
+      IF (NOT MKL_LIBRARIES)
+        CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
+          "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m}" "")
+        IF (MKL_LIBRARIES)
+          SET(mklseq "_sequential")
+        ENDIF (MKL_LIBRARIES)
+      ENDIF (NOT MKL_LIBRARIES)
+    ENDFOREACH(mkl64)
+  ENDFOREACH(mkliface)
+ENDFOREACH(mklrtl)
+FOREACH(mklrtl ${mklrtls} "")
+  FOREACH(mkliface ${mklifaces})
+    FOREACH(mkl64 ${mkl64s} "")
+      FOREACH(mklthread ${mklthreads})
+        IF (NOT MKL_LIBRARIES)
+          CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
+            "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "")
+        ENDIF (NOT MKL_LIBRARIES)          
+      ENDFOREACH(mklthread)
+    ENDFOREACH(mkl64)
+  ENDFOREACH(mkliface)
+ENDFOREACH(mklrtl)
+
+# Check for older versions
+IF (NOT MKL_LIBRARIES)
+  SET(MKL_VERSION 900)
+  CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm
+    "mkl;guide;pthread;m" "")
+ENDIF (NOT MKL_LIBRARIES)          
+
+# Include files
+IF (MKL_LIBRARIES)
+  FIND_PATH(MKL_INCLUDE_DIR "mkl_cblas.h")
+  MARK_AS_ADVANCED(MKL_INCLUDE_DIR)
+ENDIF (MKL_LIBRARIES)
+
+# Other libraries
+IF (MKL_LIBRARIES)
+  FOREACH(mkl64 ${mkl64s} "_core" "")
+    FOREACH(mkls ${mklseq} "")
+      IF (NOT MKL_LAPACK_LIBRARIES)
+        FIND_LIBRARY(MKL_LAPACK_LIBRARIES NAMES "mkl_lapack${mkl64}${mkls}")
+        MARK_AS_ADVANCED(MKL_LAPACK_LIBRARIES)
+      ENDIF (NOT MKL_LAPACK_LIBRARIES)
+      IF (NOT MKL_SCALAPACK_LIBRARIES)
+        FIND_LIBRARY(MKL_SCALAPACK_LIBRARIES NAMES "mkl_scalapack${mkl64}${mkls}") 
+        MARK_AS_ADVANCED(MKL_SCALAPACK_LIBRARIES)
+      ENDIF (NOT MKL_SCALAPACK_LIBRARIES)
+      IF (NOT MKL_SOLVER_LIBRARIES)
+        FIND_LIBRARY(MKL_SOLVER_LIBRARIES NAMES "mkl_solver${mkl64}${mkls}")
+        MARK_AS_ADVANCED(MKL_SOLVER_LIBRARIES)
+      ENDIF (NOT MKL_SOLVER_LIBRARIES)
+      IF (NOT MKL_CDFT_LIBRARIES)
+        FIND_LIBRARY(MKL_CDFT_LIBRARIES NAMES "mkl_cdft${mkl64}${mkls}")
+        MARK_AS_ADVANCED(MKL_CDFT_LIBRARIES)
+      ENDIF (NOT MKL_CDFT_LIBRARIES)
+    ENDFOREACH(mkls)
+  ENDFOREACH(mkl64)
+ENDIF (MKL_LIBRARIES)
+
+# LibIRC: intel compiler always links this; 
+# gcc does not; but mkl kernels sometimes need it.
+IF (MKL_LIBRARIES)
+  IF (CMAKE_COMPILER_IS_GNUCC)
+    FIND_LIBRARY(MKL_KERNEL_libirc "irc")
+  ELSEIF (CMAKE_C_COMPILER_ID AND NOT CMAKE_C_COMPILER_ID STREQUAL "Intel")
+    FIND_LIBRARY(MKL_KERNEL_libirc "irc")
+  ENDIF (CMAKE_COMPILER_IS_GNUCC)
+  MARK_AS_ADVANCED(MKL_KERNEL_libirc)
+  IF (MKL_KERNEL_libirc)
+    SET(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_KERNEL_libirc})
+  ENDIF (MKL_KERNEL_libirc)
+ENDIF (MKL_LIBRARIES)
+
+# Final
+SET(CMAKE_LIBRARY_PATH ${saved_CMAKE_LIBRARY_PATH})
+SET(CMAKE_INCLUDE_PATH ${saved_CMAKE_INCLUDE_PATH})
+IF (MKL_LIBRARIES)
+  SET(MKL_FOUND TRUE)
+ELSE (MKL_LIBRARIES)
+  SET(MKL_FOUND FALSE)
+  SET(MKL_VERSION)
+ENDIF (MKL_LIBRARIES)
+
+# Standard termination
+IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
+  MESSAGE(FATAL_ERROR "MKL library not found. Please specify library  location")
+ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
+IF(NOT MKL_FIND_QUIETLY)
+  IF(MKL_FOUND)
+    MESSAGE(STATUS "MKL library found")
+  ELSE(MKL_FOUND)
+    MESSAGE(STATUS "MKL library not found")
+  ENDIF(MKL_FOUND)
+ENDIF(NOT MKL_FIND_QUIETLY)
+
+# Do nothing if MKL_FOUND was set before!
+ENDIF (NOT MKL_FOUND)
+
+
diff --git a/lib/TH/cmake/FindSSE.cmake b/lib/TH/cmake/FindSSE.cmake
new file mode 100644
index 0000000..f6aac07
--- /dev/null
+++ b/lib/TH/cmake/FindSSE.cmake
@@ -0,0 +1,111 @@
+INCLUDE(CheckCSourceRuns)
+INCLUDE(CheckCXXSourceRuns)
+
+SET(SSE1_CODE "
+  #include <xmmintrin.h>
+
+  int main()
+  {
+    __m128 a;
+    float vals[4] = {0,0,0,0};
+    a = _mm_loadu_ps(vals);
+    return 0;
+  }")
+
+SET(SSE2_CODE "
+  #include <emmintrin.h>
+
+  int main()
+  {
+    __m128d a;
+    double vals[2] = {0,0};
+    a = _mm_loadu_pd(vals);
+    return 0;
+  }")
+
+SET(SSE3_CODE "
+  #include <pmmintrin.h>
+
+  int main( )
+  {
+    const int vals[4] = {0,0,0,0};
+    __m128i a;
+    a = _mm_lddqu_si128( (const __m128i*)vals );
+    return 0;
+  }")
+
+SET(SSE4_1_CODE "
+  #include <smmintrin.h>
+
+  int main ()
+  {
+    __m128i a = {0,0,0,0}, b = {0,0,0,0};
+    __m128i res = _mm_max_epi8(a, b);
+
+    return 0;
+  }
+")
+
+SET(SSE4_2_CODE "
+  #include <nmmintrin.h>
+
+  int main()
+  {
+    __m128i a = {0,0,0,0}, b = {0,0,0,0}, c = {0,0,0,0};
+    c = _mm_cmpgt_epi64(a, b);
+    return 0;
+  }
+")
+
+SET(AVX_CODE "
+  #include <immintrin.h>
+
+  int main()
+  {
+     __m256 a;
+    a = _mm256_set1_ps(0);
+    return 0;
+  }
+")
+
+# CHECK_SSE(lang type flags): try each ;-separated compiler flag in turn until
+# the ${type}_CODE test program both compiles and runs.  The first working
+# flag is cached in ${lang}_${type}_FLAGS and ${lang}_${type}_FOUND is set.
+MACRO(CHECK_SSE lang type flags)
+  SET(__FLAG_I 1)
+  SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+  FOREACH(__FLAG ${flags})
+    IF(NOT ${lang}_${type}_FOUND)
+      SET(CMAKE_REQUIRED_FLAGS ${__FLAG})
+      # Macro arguments are not CMake variables, so the comparison must
+      # expand ${lang} explicitly; a bare `lang` would always compare the
+      # literal string "lang" and the CXX branch would never be taken.
+      IF("${lang}" STREQUAL "CXX")
+        CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
+      ELSE()
+        CHECK_C_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I})
+      ENDIF()
+      IF(${lang}_HAS_${type}_${__FLAG_I})
+        SET(${lang}_${type}_FOUND TRUE CACHE BOOL "${lang} ${type} support")
+        SET(${lang}_${type}_FLAGS "${__FLAG}" CACHE STRING "${lang} ${type} flags")
+      ENDIF()
+      MATH(EXPR __FLAG_I "${__FLAG_I}+1")
+    ENDIF()
+  ENDFOREACH()
+  SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+  # Cache a negative result so later invocations skip the probes.
+  IF(NOT ${lang}_${type}_FOUND)
+    SET(${lang}_${type}_FOUND FALSE CACHE BOOL "${lang} ${type} support")
+    SET(${lang}_${type}_FLAGS "" CACHE STRING "${lang} ${type} flags")
+  ENDIF()
+
+  MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS)
+
+ENDMACRO()
+
+CHECK_SSE(C "SSE1" " ;-msse;/arch:SSE")
+CHECK_SSE(C "SSE2" " ;-msse2;/arch:SSE2")
+CHECK_SSE(C "SSE3" " ;-msse3;/arch:SSE3")
+CHECK_SSE(C "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
+CHECK_SSE(C "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
+CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX")
+
+CHECK_SSE(CXX "SSE1" " ;-msse;/arch:SSE")
+CHECK_SSE(CXX "SSE2" " ;-msse2;/arch:SSE2")
+CHECK_SSE(CXX "SSE3" " ;-msse3;/arch:SSE3")
+CHECK_SSE(CXX "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4")
+CHECK_SSE(CXX "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4")
diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c
new file mode 100644
index 0000000..759689f
--- /dev/null
+++ b/lib/TH/generic/THBlas.c
@@ -0,0 +1,391 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THBlas.c"
+#else
+
+#ifdef BLAS_F2C
+# define ffloat double
+#else
+# define ffloat float
+#endif
+
+TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy);
+TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy);
+TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx);
+TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx);
+TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy);
+TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy);
+TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy);
+TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy);
+TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy);
+TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy);
+TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy);
+TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy);
+TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda);
+TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda);
+TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
+TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc);
+
+
+
+/* Element-wise swap of strided vectors: x[i*incx] <-> y[i*incy] for i < n.
+   Dispatches to BLAS [ds]swap_ when USE_BLAS is set, real is float/double
+   and every dimension fits in an int; otherwise uses the portable loop. */
+void THBlas_(swap)(long n, real *x, long incx, real *y, long incy)
+{
+  /* a single element ignores the strides; normalize them */
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+  {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dswap_(&i_n, x, &i_incx, y, &i_incy);
+#else
+    sswap_(&i_n, x, &i_incx, y, &i_incy);
+#endif
+    return;
+  }
+#endif
+  {
+    long i;
+    for(i = 0; i < n; i++)
+    {
+      real z = x[i*incx];
+      x[i*incx] = y[i*incy];
+      y[i*incy] = z;
+    }
+  }
+}
+
+/* In-place scaling: x[i*incx] *= a for i < n.
+   Uses BLAS [ds]scal_ when available and sizes fit in an int. */
+void THBlas_(scal)(long n, real a, real *x, long incx)
+{
+  if(n == 1)
+    incx = 1;
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (n <= INT_MAX) && (incx <= INT_MAX) )
+  {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dscal_(&i_n, &a, x, &i_incx);
+#else
+    sscal_(&i_n, &a, x, &i_incx);
+#endif
+    return;
+  }
+#endif
+  {
+    long i;
+    for(i = 0; i < n; i++)
+      x[i*incx] *= a;
+  }
+}
+
+/* Strided copy: y[i*incy] = x[i*incx] for i < n.
+   Uses BLAS [ds]copy_ when available and sizes fit in an int. */
+void THBlas_(copy)(long n, real *x, long incx, real *y, long incy)
+{
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+  {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dcopy_(&i_n, x, &i_incx, y, &i_incy);
+#else
+    scopy_(&i_n, x, &i_incx, y, &i_incy);
+#endif
+    return;
+  }
+#endif
+  {
+    long i;
+    for(i = 0; i < n; i++)
+      y[i*incy] = x[i*incx];
+  }
+}
+
+/* Scaled accumulate: y[i*incy] += a * x[i*incx] for i < n.
+   Uses BLAS [ds]axpy_ when available and sizes fit in an int. */
+void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy)
+{
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+  {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    daxpy_(&i_n, &a, x, &i_incx, y, &i_incy);
+#else
+    saxpy_(&i_n, &a, x, &i_incx, y, &i_incy);
+#endif
+    return;
+  }
+#endif
+  {
+    long i;
+    for(i = 0; i < n; i++)
+      y[i*incy] += a*x[i*incx];
+  }
+}
+
+/* Dot product: returns sum over i < n of x[i*incx] * y[i*incy].
+   In the BLAS path sdot_ is declared to return `ffloat`, which the
+   BLAS_F2C define above maps to double for f2c-style BLAS libraries
+   that return single-precision results as double. */
+real THBlas_(dot)(long n, real *x, long incx, real *y, long incy)
+{
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+  {
+    int i_n = (int)n;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    return (real) ddot_(&i_n, x, &i_incx, y, &i_incy);
+#else
+    return (real) sdot_(&i_n, x, &i_incx, y, &i_incy);
+#endif
+  }
+#endif
+  {
+    long i;
+    real sum = 0;
+    for(i = 0; i < n; i++)
+    sum += x[i*incx]*y[i*incy];
+    return sum;
+  }
+}
+
+/* Matrix-vector product on a column-major m-by-n A:
+   y = alpha * op(A) * x + beta * y, where trans 't'/'T' selects A^T.
+   The BLAS path additionally requires strictly positive lda/incx/incy;
+   anything else (including negative strides) falls through to the loops. */
+void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy)
+{
+  /* a one-column matrix can be treated as contiguous regardless of lda */
+  if(n == 1)
+    lda = m;
+  
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (m <= INT_MAX) && (n <= INT_MAX) && 
+      (lda > 0) && (lda <= INT_MAX) &&
+      (incx > 0) && (incx <= INT_MAX) &&
+      (incy > 0) && (incy <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_lda = (int)lda;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
+#else
+    sgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
+#endif
+    return;
+  }
+#endif
+  {
+    long i, j;
+
+    if( (trans == 'T') || (trans == 't') )
+    {
+      /* y[i] = beta*y[i] + alpha * (column i of A) . x */
+      for(i = 0; i < n; i++)
+      {
+        real sum = 0;
+        real *row_ = a+lda*i;
+        for(j = 0; j < m; j++)
+          sum += x[j*incx]*row_[j];
+        y[i*incy] = beta*y[i*incy] + alpha*sum;
+      }
+    }
+    else
+    {
+      /* scale y once, then accumulate alpha*x[j] times each column of A */
+      if(beta != 1)
+        THBlas_(scal)(m, beta, y, incy);
+      
+      for(j = 0; j < n; j++)
+      {
+        real *column_ = a+lda*j;
+        real z = alpha*x[j*incx];
+        for(i = 0; i < m; i++)
+          y[i*incy] += z*column_[i];
+      }
+    }
+  }
+}
+
+/* Rank-1 update on a column-major m-by-n A: A += alpha * x * y^T.
+   Uses BLAS [ds]ger_ when available and all dimensions fit in an int. */
+void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda)
+{
+  /* a one-column matrix can be treated as contiguous regardless of lda */
+  if(n == 1)
+    lda = m;
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_lda = (int)lda;
+    int i_incx = (int)incx;
+    int i_incy = (int)incy;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
+#else
+    sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda);
+#endif
+    return;
+  }
+#endif
+  {
+    long i, j;
+    for(j = 0; j < n; j++)
+    {
+      real *column_ = a+j*lda;
+      real z = alpha*y[j*incy];
+      for(i = 0; i < m; i++)
+        column_[i] += z*x[i*incx] ;
+    }
+  }
+}
+
+/* Matrix-matrix product on column-major data:
+   C = alpha * op(A) * op(B) + beta * C, where transa/transb set to
+   't'/'T' transpose the corresponding operand.  Degenerate leading
+   dimensions are normalized first so callers may pass contiguous data
+   for single-row/column operands.  BLAS [ds]gemm_ is used when every
+   dimension fits in an int; otherwise one of four loop nests (one per
+   transpose combination) computes the same result. */
+void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc)
+{
+  int transa_ = ((transa == 't') || (transa == 'T'));
+  int transb_ = ((transb == 't') || (transb == 'T'));
+
+  /* normalize leading dimensions that are irrelevant for extent-1 shapes */
+  if(n == 1)
+    ldc = m;
+
+  if(transa_)
+  {
+    if(m == 1)
+      lda = k;
+  }
+  else
+  {
+    if(k == 1)
+      lda = m;
+  }
+
+  if(transb_)
+  {
+    if(k == 1)
+      ldb = n;
+  }
+  else
+  {
+    if(n == 1)
+      ldb = k;
+  }
+
+#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
+  if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+  {
+    int i_m = (int)m;
+    int i_n = (int)n;
+    int i_k = (int)k;
+    int i_lda = (int)lda;
+    int i_ldb = (int)ldb;
+    int i_ldc = (int)ldc;
+
+#if defined(TH_REAL_IS_DOUBLE)
+    dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);
+#else
+    sgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc);
+#endif
+    return;
+  }
+#endif
+  {
+    long i, j, l;
+    if(!transa_ && !transb_)
+    {
+      /* C = alpha*A*B + beta*C: walk row i of A with stride lda */
+      real *a_ = a;
+      for(i = 0; i < m; i++)
+      {
+        real *b_ = b;
+        for(j = 0; j < n; j++)
+        {
+          real sum = 0;
+          for(l = 0; l < k; l++)
+            sum += a_[l*lda]*b_[l];
+          b_ += ldb;
+          c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum;
+        }
+        a_++;
+      }
+    }
+    else if(transa_ && !transb_)
+    {
+      /* C = alpha*A^T*B + beta*C: row i of A^T is column i of A (contiguous) */
+      real *a_ = a;
+      for(i = 0; i < m; i++)
+      {
+        real *b_ = b;
+        for(j = 0; j < n; j++)
+        {
+          real sum = 0;
+          for(l = 0; l < k; l++)
+            sum += a_[l]*b_[l];
+          b_ += ldb;
+          c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum;
+        }
+        a_ += lda;
+      }
+    }
+    else if(!transa_ && transb_)
+    {
+      /* C = alpha*A*B^T + beta*C: column j of B^T walks B with stride ldb */
+      real *a_ = a;
+      for(i = 0; i < m; i++)
+      {
+        real *b_ = b;
+        for(j = 0; j < n; j++)
+        {
+          real sum = 0;
+          for(l = 0; l < k; l++)
+            sum += a_[l*lda]*b_[l*ldb];
+          b_++;
+          c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum;
+        }
+        a_++;
+      }
+    }
+    else
+    {
+      /* C = alpha*A^T*B^T + beta*C */
+      real *a_ = a;
+      for(i = 0; i < m; i++)
+      {
+        real *b_ = b;
+        for(j = 0; j < n; j++)
+        {
+          real sum = 0;
+          for(l = 0; l < k; l++)
+            sum += a_[l]*b_[l*ldb];
+          b_++;
+          c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum;
+        }
+        a_ += lda;
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/TH/generic/THBlas.h b/lib/TH/generic/THBlas.h
new file mode 100644
index 0000000..9e14f5a
--- /dev/null
+++ b/lib/TH/generic/THBlas.h
@@ -0,0 +1,19 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THBlas.h"
+#else
+
+/* Level 1 */
+/* Vector-vector operations.  `long` sizes/strides are narrowed to int
+   when dispatched to an external BLAS (see THBlas.c); portable C loops
+   handle the remaining cases. */
+TH_API void THBlas_(swap)(long n, real *x, long incx, real *y, long incy);
+TH_API void THBlas_(scal)(long n, real a, real *x, long incx);
+TH_API void THBlas_(copy)(long n, real *x, long incx, real *y, long incy);
+TH_API void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy);
+TH_API real THBlas_(dot)(long n, real *x, long incx, real *y, long incy);
+
+/* Level 2 */
+TH_API void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy);
+TH_API void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda);
+
+/* Level 3 */
+TH_API void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc);
+
+#endif
diff --git a/lib/TH/generic/THLapack.c b/lib/TH/generic/THLapack.c
new file mode 100644
index 0000000..7a9321b
--- /dev/null
+++ b/lib/TH/generic/THLapack.c
@@ -0,0 +1,254 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THLapack.c"
+#else
+
+
+TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
+TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info);
+TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
+TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
+TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info);
+TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info);
+TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info);
+TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info);
+TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info);
+TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info);
+TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info);
+TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info);
+TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
+TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
+TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);
+TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info);
+TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);
+TH_EXTERNC void spotrf_(char *uplo, int *n, float *a, int *lda, int *info);
+TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info);
+TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info);
+TH_EXTERNC void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);
+TH_EXTERNC void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info);
+TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+TH_EXTERNC void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+TH_EXTERNC void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info);
+TH_EXTERNC void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
+TH_EXTERNC void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info);
+TH_EXTERNC void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info);
+TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *rank, float *tol, float *work, int *info);
+TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info);
+
+
+/* Compute the solution to a real system of linear equations  A * X = B */
+/* Thin wrapper over LAPACK [ds]gesv.  For this and every wrapper in this
+   file: *info receives LAPACK's status (0 = success; nonzero values are
+   routine-specific, see the LAPACK documentation), and when built without
+   USE_LAPACK the call raises THError instead. */
+void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
+#else
+  sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info);
+#endif
+#else
+  THError("gesv : Lapack library not found in compile time\n");
+#endif
+  return;
+}
+
+/* Solve a triangular system of the form A * X = B  or A^T * X = B */
+/* (wraps LAPACK [ds]trtrs) */
+void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dtrtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info);
+#else
+  strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info);
+#endif
+#else
+  THError("trtrs : Lapack library not found in compile time\n");
+#endif
+  return;
+}
+
+/* Solve overdetermined or underdetermined real linear systems involving an
+M-by-N matrix A, or its transpose, using a QR or LQ factorization of A */
+/* (wraps LAPACK [ds]gels; work/lwork is the caller-provided workspace) */
+void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
+#else
+  sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info);
+#endif
+#else
+  THError("gels : Lapack library not found in compile time\n");
+#endif
+}
+
+/* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric
+matrix A */
+/* (wraps LAPACK [ds]syev) */
+void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
+#else
+  ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info);
+#endif
+#else
+  THError("syev : Lapack library not found in compile time\n");
+#endif
+}
+
+/* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and,
+optionally, the left and/or right eigenvectors */
+/* (wraps LAPACK [ds]geev; eigenvalues come back as (wr, wi) real/imag pairs) */
+void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info);
+#else
+  sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info);
+#endif
+#else
+  THError("geev : Lapack library not found in compile time\n");
+#endif
+}
+
+/* Compute the singular value decomposition (SVD) of a real M-by-N matrix A,
+optionally computing the left and/or right singular vectors */
+/* (wraps LAPACK [ds]gesvd) */
+void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info)
+{
+#ifdef USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgesvd_( &jobu,  &jobvt,  &m,  &n,  a,  &lda,  s,  u,  &ldu,  vt,  &ldvt,  work,  &lwork,  info);
+#else
+  sgesvd_( &jobu,  &jobvt,  &m,  &n,  a,  &lda,  s,  u,  &ldu,  vt,  &ldvt,  work,  &lwork,  info);
+#endif
+#else
+  THError("gesvd : Lapack library not found in compile time\n");
+#endif
+}
+
+/* LU decomposition */
+/* (wraps LAPACK [ds]getrf; pivot indices are written to ipiv) */
+void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgetrf_(&m, &n, a, &lda, ipiv, info);
+#else
+  sgetrf_(&m, &n, a, &lda, ipiv, info);
+#endif
+#else
+  THError("getrf : Lapack library not found in compile time\n");
+#endif
+}
+/* Matrix Inverse */
+/* (wraps LAPACK [ds]getri; consumes the factorization/pivots from getrf) */
+void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgetri_(&n, a, &lda, ipiv, work, &lwork, info);
+#else
+  sgetri_(&n, a, &lda, ipiv, work, &lwork, info);
+#endif
+#else
+  THError("getri : Lapack library not found in compile time\n");
+#endif
+}
+
+/* Cholesky factorization */
+/* (wraps LAPACK [ds]potrf; uplo selects which triangle of a is used) */
+void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dpotrf_(&uplo, &n, a, &lda, info);
+#else
+  spotrf_(&uplo, &n, a, &lda, info);
+#endif
+#else
+  THError("potrf : Lapack library not found in compile time\n");
+#endif
+}
+
+/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */
+/* (wraps LAPACK [ds]potrs; a must hold the factor computed by potrf) */
+void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
+#else
+  spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
+#endif
+#else
+  THError("potrs: Lapack library not found in compile time\n");
+#endif
+}
+
+/* Cholesky factorization based Matrix Inverse */
+/* (wraps LAPACK [ds]potri; a must hold the factor computed by potrf) */
+void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dpotri_(&uplo, &n, a, &lda, info);
+#else
+  spotri_(&uplo, &n, a, &lda, info);
+#endif
+#else
+  THError("potri: Lapack library not found in compile time\n");
+#endif
+}
+
+/* Cholesky factorization with complete pivoting */
+/* (wraps LAPACK [ds]pstrf; returns the pivot ordering and computed rank) */
+void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dpstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info);
+#else
+  spstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info);
+#endif
+#else
+  THError("pstrf: Lapack library not found at compile time\n");
+#endif
+}
+
+/* QR decomposition */
+/* (wraps LAPACK [ds]geqrf; elementary-reflector scalars land in tau) */
+void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
+#else
+  sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info);
+#endif
+#else
+  THError("geqrf: Lapack library not found in compile time\n");
+#endif
+}
+
+/* Build Q from output of geqrf */
+/* (wraps LAPACK [ds]orgqr; consumes a/tau produced by geqrf) */
+void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
+#else
+  sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info);
+#endif
+#else
+  THError("orgqr: Lapack library not found in compile time\n");
+#endif
+}
+
+/* Multiply Q with a matrix using the output of geqrf */
+/* (wraps LAPACK [ds]ormqr; side/trans select which product is formed) */
+void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info)
+{
+#ifdef  USE_LAPACK
+#if defined(TH_REAL_IS_DOUBLE)
+  dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info);
+#else
+  sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info);
+#endif
+#else
+  THError("ormqr: Lapack library not found in compile time\n");
+#endif
+}
+
+
+#endif
diff --git a/lib/TH/generic/THLapack.h b/lib/TH/generic/THLapack.h
new file mode 100644
index 0000000..da9df91
--- /dev/null
+++ b/lib/TH/generic/THLapack.h
@@ -0,0 +1,39 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THLapack.h"
+#else
+
+/* AX=B */
+TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info);
+/* Solve a triangular system of the form A * X = B  or A^T * X = B */
+TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info);
+/* ||AX-B|| */
+TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info);
+/* Eigenvals */
+TH_API void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info);
+/* Non-sym eigenvals */
+TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info);
+/* svd */
+TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info);
+/* LU decomposition */
+TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info);
+/* Matrix Inverse */
+TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info);
+
+/* NOTE(review): the declarations below omit the TH_API export qualifier
+   used above -- presumably an oversight; without it these symbols may not
+   be exported from a shared library on some toolchains.  Confirm. */
+/* Positive Definite matrices */
+/* Cholesky factorization */
+void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info);
+/* Matrix inverse based on Cholesky factorization */
+void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info);
+/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */
+void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info);
+/* Cholesky factorization with complete pivoting. */
+void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info);
+
+/* QR decomposition */
+void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info);
+/* Build Q from output of geqrf */
+void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info);
+/* Multiply Q with a matrix from output of geqrf */
+void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info);
+
+#endif
diff --git a/lib/TH/generic/THStorage.c b/lib/TH/generic/THStorage.c
new file mode 100644
index 0000000..cac043e
--- /dev/null
+++ b/lib/TH/generic/THStorage.c
@@ -0,0 +1,206 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THStorage.c"
+#else
+
+/* Accessor: the raw element buffer. */
+real* THStorage_(data)(const THStorage *self)
+{
+  return self->data;
+}
+
+/* Accessor: the number of elements. */
+long THStorage_(size)(const THStorage *self)
+{
+  return self->size;
+}
+
+/* Size in bytes of one element of this storage type. */
+int THStorage_(elementSize)()
+{
+  return sizeof(real);
+}
+
+/* New empty storage (size 0) using the default allocator. */
+THStorage* THStorage_(new)(void)
+{
+  return THStorage_(newWithSize)(0);
+}
+
+/* New storage of `size` elements using the default allocator. */
+THStorage* THStorage_(newWithSize)(long size)
+{
+  return THStorage_(newWithAllocator)(size, &THDefaultAllocator, NULL);
+}
+
+/* New storage whose buffer comes from a caller-supplied allocator.  The
+   result starts with refcount 1 and is refcounted, resizable, and frees
+   its buffer on release.
+   NOTE(review): neither the THAlloc nor the allocator->malloc result is
+   NULL-checked here -- presumably the allocators abort on OOM; confirm. */
+THStorage* THStorage_(newWithAllocator)(long size,
+                                        THAllocator *allocator,
+                                        void *allocatorContext)
+{
+  THStorage *storage = THAlloc(sizeof(THStorage));
+  storage->data = allocator->malloc(allocatorContext, sizeof(real)*size);
+  storage->size = size;
+  storage->refcount = 1;
+  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
+  storage->allocator = allocator;
+  storage->allocatorContext = allocatorContext;
+  return storage;
+}
+
+/* Storage backed by a memory-mapped file.  When size <= 0 the element
+   count is derived from the size of the mapping itself.  The resulting
+   storage is marked non-resizable. */
+THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared)
+{
+  THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, shared);
+
+  THStorage *storage = THStorage_(newWithAllocator)(size,
+                                                    &THMapAllocator,
+                                                    ctx);
+
+  if(size <= 0)
+    storage->size = THMapAllocatorContext_size(ctx)/sizeof(real);
+
+  THStorage_(clearFlag)(storage, TH_STORAGE_RESIZABLE);
+
+  return storage;
+}
+
+/* Convenience constructors for storages of 1 to 4 given elements. */
+THStorage* THStorage_(newWithSize1)(real data0)
+{
+  THStorage *self = THStorage_(newWithSize)(1);
+  self->data[0] = data0;
+  return self;
+}
+
+THStorage* THStorage_(newWithSize2)(real data0, real data1)
+{
+  THStorage *self = THStorage_(newWithSize)(2);
+  self->data[0] = data0;
+  self->data[1] = data1;
+  return self;
+}
+
+THStorage* THStorage_(newWithSize3)(real data0, real data1, real data2)
+{
+  THStorage *self = THStorage_(newWithSize)(3);
+  self->data[0] = data0;
+  self->data[1] = data1;
+  self->data[2] = data2;
+  return self;
+}
+
+THStorage* THStorage_(newWithSize4)(real data0, real data1, real data2, real data3)
+{
+  THStorage *self = THStorage_(newWithSize)(4);
+  self->data[0] = data0;
+  self->data[1] = data1;
+  self->data[2] = data2;
+  self->data[3] = data3;
+  return self;
+}
+
+/* Set bits of the TH_STORAGE_* flag bitmask. */
+void THStorage_(setFlag)(THStorage *storage, const char flag)
+{
+  storage->flag |= flag;
+}
+
+/* Clear bits of the TH_STORAGE_* flag bitmask. */
+void THStorage_(clearFlag)(THStorage *storage, const char flag)
+{
+  storage->flag &= ~flag;
+}
+
+/* Increment the refcount (atomic); no-op for NULL or non-refcounted storages. */
+void THStorage_(retain)(THStorage *storage)
+{
+  if(storage && (storage->flag & TH_STORAGE_REFCOUNTED))
+    THAtomicIncrementRef(&storage->refcount);
+}
+
+/* Decrement the refcount; when it reaches zero, free the data buffer
+   (only if this storage owns it, TH_STORAGE_FREEMEM), release the viewed
+   parent storage (TH_STORAGE_VIEW), then free the struct itself.
+   NULL and non-refcounted storages are deliberately left untouched. */
+void THStorage_(free)(THStorage *storage)
+{
+  if(!storage)
+    return;
+
+  if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0))
+  {
+    if(THAtomicDecrementRef(&storage->refcount))
+    {
+      if(storage->flag & TH_STORAGE_FREEMEM) {
+        storage->allocator->free(storage->allocatorContext, storage->data);
+      }
+      if(storage->flag & TH_STORAGE_VIEW) {
+        THStorage_(free)(storage->view);
+      }
+      THFree(storage);
+    }
+  }
+}
+
+/* Wrap an existing buffer (takes ownership: TH_STORAGE_FREEMEM is set,
+   so the default allocator will free `data` on release). */
+THStorage* THStorage_(newWithData)(real *data, long size)
+{
+  return THStorage_(newWithDataAndAllocator)(data, size,
+                                             &THDefaultAllocator, NULL);
+}
+
+/* Wrap an existing buffer with an explicit allocator; the allocator's
+   free/realloc must be compatible with how `data` was allocated. */
+THStorage* THStorage_(newWithDataAndAllocator)(real* data, long size,
+                                               THAllocator* allocator,
+                                               void* allocatorContext) {
+  THStorage *storage = THAlloc(sizeof(THStorage));
+  storage->data = data;
+  storage->size = size;
+  storage->refcount = 1;
+  storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
+  storage->allocator = allocator;
+  storage->allocatorContext = allocatorContext;
+  return storage;
+}
+
+/* Resize the storage to `size` elements, preserving min(old, new) elements.
+   If the allocator provides realloc it is used directly; otherwise a new
+   buffer is malloc'd, the overlap memcpy'd, and the old buffer freed.
+   Raises THError for non-resizable storages.
+   NOTE(review): malloc/realloc results are not NULL-checked -- presumably
+   the allocators abort on failure; confirm. */
+void THStorage_(resize)(THStorage *storage, long size)
+{
+  if(storage->flag & TH_STORAGE_RESIZABLE)
+  {
+    if(storage->allocator->realloc == NULL) {
+      /* case when the allocator does not have a realloc defined */
+      real *old_data = storage->data;
+      long  old_size = storage->size;
+      if (size == 0) {
+	storage->data = NULL;
+      } else {
+	storage->data = storage->allocator->malloc(
+						   storage->allocatorContext,
+						   sizeof(real)*size);
+      }
+      storage->size = size;
+      if (old_data != NULL) {
+	long copy_size = old_size;
+	if (storage->size < copy_size) {
+	  copy_size = storage->size;
+	}
+	if (copy_size > 0) {
+	  memcpy(storage->data, old_data, sizeof(real)*copy_size);
+	}
+	storage->allocator->free(storage->allocatorContext, old_data);
+      }
+    } else {
+      storage->data = storage->allocator->realloc(
+						  storage->allocatorContext,
+						  storage->data,
+						  sizeof(real)*size);
+      storage->size = size;
+    }
+  } else {
+    THError("Trying to resize storage that is not resizable");
+  }
+}
+
+/* Set every element of the storage to `value`. */
+void THStorage_(fill)(THStorage *storage, real value)
+{
+  long i;
+  for(i = 0; i < storage->size; i++)
+    storage->data[i] = value;
+}
+
+/* Bounds-checked element write (raises via THArgCheck when out of range). */
+void THStorage_(set)(THStorage *self, long idx, real value)
+{
+  THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds");
+  self->data[idx] = value;
+}
+
+/* Bounds-checked element read (raises via THArgCheck when out of range). */
+real THStorage_(get)(const THStorage *self, long idx)
+{
+  THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds");
+  return self->data[idx];
+}
+
+#endif
diff --git a/lib/TH/generic/THStorage.h b/lib/TH/generic/THStorage.h
new file mode 100644
index 0000000..79013d8
--- /dev/null
+++ b/lib/TH/generic/THStorage.h
@@ -0,0 +1,70 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THStorage.h"
+#else
+
+/* We could have a linked list that initializes the math and lab
+   structures (or more).  Hmm -- complicated.
+
+   Problem: THMapStorage is kind of a class.
+   THLab_()... how do I get out of that?
+
+   With templates I would have to instantiate all of them!!! oh boy!
+   And how do I know it is for Cuda?  The float type is the same inside the <>.
+
+   In the end it would operate on float/double/... pointers = easy.
+   primitives??
+ */
+
+/* Storage state flags (bitmask kept in THStorage.flag). */
+#define TH_STORAGE_REFCOUNTED 1
+#define TH_STORAGE_RESIZABLE  2
+#define TH_STORAGE_FREEMEM    4
+#define TH_STORAGE_VIEW       8
+
+/* A reference-counted buffer of `real` elements -- the backing store
+   shared by tensors of this type. */
+typedef struct THStorage
+{
+    real *data;               /* element buffer (owned iff TH_STORAGE_FREEMEM) */
+    long size;                /* number of elements */
+    int refcount;             /* manipulated atomically via THAtomic* */
+    char flag;                /* TH_STORAGE_* bitmask */
+    THAllocator *allocator;   /* allocates/frees/reallocs `data` */
+    void *allocatorContext;   /* opaque state passed to the allocator */
+    struct THStorage *view;   /* parent storage when TH_STORAGE_VIEW is set */
+} THStorage;
+
+TH_API real* THStorage_(data)(const THStorage*);
+TH_API long THStorage_(size)(const THStorage*);
+TH_API int THStorage_(elementSize)(void);
+
+/* slow access -- checks everything */
+TH_API void THStorage_(set)(THStorage*, long, real);
+TH_API real THStorage_(get)(const THStorage*, long);
+
+TH_API THStorage* THStorage_(new)(void);
+TH_API THStorage* THStorage_(newWithSize)(long size);
+TH_API THStorage* THStorage_(newWithSize1)(real);
+TH_API THStorage* THStorage_(newWithSize2)(real, real);
+TH_API THStorage* THStorage_(newWithSize3)(real, real, real);
+TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real);
+TH_API THStorage* THStorage_(newWithMapping)(const char *filename, long size, int shared);
+
+/* takes ownership of data */
+TH_API THStorage* THStorage_(newWithData)(real *data, long size);
+
+TH_API THStorage* THStorage_(newWithAllocator)(long size,
+                                               THAllocator* allocator,
+                                               void *allocatorContext);
+TH_API THStorage* THStorage_(newWithDataAndAllocator)(
+    real* data, long size, THAllocator* allocator, void *allocatorContext);
+
+/* should not differ with API */
+TH_API void THStorage_(setFlag)(THStorage *storage, const char flag);
+TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag);
+TH_API void THStorage_(retain)(THStorage *storage);
+
+/* might differ with other API (like CUDA) */
+TH_API void THStorage_(free)(THStorage *storage);
+TH_API void THStorage_(resize)(THStorage *storage, long size);
+TH_API void THStorage_(fill)(THStorage *storage, real value);
+
+#endif
diff --git a/lib/TH/generic/THStorageCopy.c b/lib/TH/generic/THStorageCopy.c
new file mode 100644
index 0000000..63a26dc
--- /dev/null
+++ b/lib/TH/generic/THStorageCopy.c
@@ -0,0 +1,36 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THStorageCopy.c"
+#else
+
+/* Copies storage->size elements from `src` into the storage buffer.
+   No bounds check on `src`: the caller must guarantee it holds at least
+   storage->size elements. */
+void THStorage_(rawCopy)(THStorage *storage, real *src)
+{
+  long i;
+  for(i = 0; i < storage->size; i++)
+    storage->data[i] = src[i];
+}
+
+/* Element-wise copy between two same-typed storages; errors out via
+   THArgCheck when the sizes differ. */
+void THStorage_(copy)(THStorage *storage, THStorage *src)
+{
+  THArgCheck(storage->size == src->size, 2, "size mismatch");
+  THStorage_(rawCopy)(storage, src->data);
+}
+
+
+/* Expands to a converting copy from a TH<TYPENAMESRC>Storage into this
+   storage type: sizes must match, each element is cast through (real). */
+#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \
+void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \
+{ \
+  long i; \
+  THArgCheck(storage->size == src->size, 2, "size mismatch"); \
+  for(i = 0; i < storage->size; i++) \
+    storage->data[i] = (real)src->data[i]; \
+}
+
+/* One converting copy per source element type. */
+IMPLEMENT_THStorage_COPY(Byte)
+IMPLEMENT_THStorage_COPY(Char)
+IMPLEMENT_THStorage_COPY(Short)
+IMPLEMENT_THStorage_COPY(Int)
+IMPLEMENT_THStorage_COPY(Long)
+IMPLEMENT_THStorage_COPY(Float)
+IMPLEMENT_THStorage_COPY(Double)
+
+#endif
diff --git a/lib/TH/generic/THStorageCopy.h b/lib/TH/generic/THStorageCopy.h
new file mode 100644
index 0000000..f853a82
--- /dev/null
+++ b/lib/TH/generic/THStorageCopy.h
@@ -0,0 +1,17 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THStorageCopy.h"
+#else
+
+/* Support for copy between different Storage types */
+
+/* rawCopy: unchecked copy from a bare array; copy: checked same-type copy;
+   copy<Type>: converting copy from another storage type (element-wise cast). */
+TH_API void THStorage_(rawCopy)(THStorage *storage, real *src);
+TH_API void THStorage_(copy)(THStorage *storage, THStorage *src);
+TH_API void THStorage_(copyByte)(THStorage *storage, struct THByteStorage *src);
+TH_API void THStorage_(copyChar)(THStorage *storage, struct THCharStorage *src);
+TH_API void THStorage_(copyShort)(THStorage *storage, struct THShortStorage *src);
+TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src);
+TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src);
+TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src);
+TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src);
+
+#endif
diff --git a/lib/TH/generic/THTensor.c b/lib/TH/generic/THTensor.c
new file mode 100644
index 0000000..26bbb01
--- /dev/null
+++ b/lib/TH/generic/THTensor.c
@@ -0,0 +1,819 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensor.c"
+#else
+
+/**** access methods ****/
+/* Returns the underlying storage (no refcount change). */
+THStorage *THTensor_(storage)(const THTensor *self)
+{
+  return self->storage;
+}
+
+/* Offset (in elements) of this tensor's first element within its storage. */
+long THTensor_(storageOffset)(const THTensor *self)
+{
+  return self->storageOffset;
+}
+
+/* Number of dimensions (0 for an empty tensor). */
+int THTensor_(nDimension)(const THTensor *self)
+{
+  return self->nDimension;
+}
+
+/* Size of 0-based dimension `dim`; errors out when out of range
+   (the message reports dim+1, i.e. the Lua-style 1-based index). */
+long THTensor_(size)(const THTensor *self, int dim)
+{
+  THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor",
+      dim+1, THTensor_(nDimension)(self));
+  return self->size[dim];
+}
+
+/* Stride (in elements) of 0-based dimension `dim`; errors out when out of range. */
+long THTensor_(stride)(const THTensor *self, int dim)
+{
+  THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", dim+1,
+      THTensor_(nDimension)(self));
+  return self->stride[dim];
+}
+
+/* Freshly allocated THLongStorage holding a copy of the size array;
+   caller owns (and must free) the result. */
+THLongStorage *THTensor_(newSizeOf)(THTensor *self)
+{
+  THLongStorage *size = THLongStorage_newWithSize(self->nDimension);
+  THLongStorage_rawCopy(size, self->size);
+  return size;
+}
+
+/* Freshly allocated THLongStorage holding a copy of the stride array;
+   caller owns (and must free) the result. */
+THLongStorage *THTensor_(newStrideOf)(THTensor *self)
+{
+  THLongStorage *stride = THLongStorage_newWithSize(self->nDimension);
+  THLongStorage_rawCopy(stride, self->stride);
+  return stride;
+}
+
+/* Pointer to the first element (storage data + offset), or NULL when the
+   tensor has no storage. */
+real *THTensor_(data)(const THTensor *self)
+{
+  if(self->storage)
+    return (self->storage->data+self->storageOffset);
+  else
+    return NULL;
+}
+
+/* Sets the given flag bit(s). */
+void THTensor_(setFlag)(THTensor *self, const char flag)
+{
+  self->flag |= flag;
+}
+
+/* Clears the given flag bit(s). */
+void THTensor_(clearFlag)(THTensor *self, const char flag)
+{
+  self->flag &= ~flag;
+}
+
+/**** creation methods ****/
+
+static void THTensor_(rawInit)(THTensor *self);
+static void THTensor_(rawSet)(THTensor *self, THStorage *storage, long storageOffset, int nDimension, long *size, long *stride);
+static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride);
+
+
+/* Empty init */
+/* Allocates a tensor with no storage and zero dimensions. */
+THTensor *THTensor_(new)(void)
+{
+  THTensor *self = THAlloc(sizeof(THTensor));
+  THTensor_(rawInit)(self);
+  return self;
+}
+
+/* Pointer-copy init */
+/* New tensor viewing the same storage/offset/size/stride as `tensor`;
+   rawSet retains the shared storage -- no element data is copied. */
+THTensor *THTensor_(newWithTensor)(THTensor *tensor)
+{
+  THTensor *self = THAlloc(sizeof(THTensor));
+  THTensor_(rawInit)(self);
+  THTensor_(rawSet)(self,
+                    tensor->storage,
+                    tensor->storageOffset,
+                    tensor->nDimension,
+                    tensor->size,
+                    tensor->stride);
+  return self;
+}
+
+/* Storage init */
+/* New tensor over an existing storage.  `size`/`stride` may each be NULL;
+   when both are given their lengths must agree.  The storage is retained. */
+THTensor *THTensor_(newWithStorage)(THStorage *storage, long storageOffset, THLongStorage *size, THLongStorage *stride)
+{
+  THTensor *self = THAlloc(sizeof(THTensor));
+  if(size && stride)
+    THArgCheck(size->size == stride->size, 4, "inconsistent size");
+
+  THTensor_(rawInit)(self);
+  THTensor_(rawSet)(self,
+                    storage,
+                    storageOffset,
+                    (size ? size->size : (stride ? stride->size : 0)),
+                    (size ? size->data : NULL),
+                    (stride ? stride->data : NULL));
+
+  return self;
+}
+/* 1d/2d/3d variants delegate to the 4d one; -1 marks unused trailing
+   dimensions (rawResize stops at the first non-positive size). */
+THTensor *THTensor_(newWithStorage1d)(THStorage *storage, long storageOffset,
+                               long size0, long stride0)
+{
+  return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, -1, -1,  -1, -1,  -1, -1);
+}
+
+THTensor *THTensor_(newWithStorage2d)(THStorage *storage, long storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1)
+{
+  return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1,  -1, -1,  -1, -1);
+}
+
+THTensor *THTensor_(newWithStorage3d)(THStorage *storage, long storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1,
+                               long size2, long stride2)
+{
+  return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1,  size2, stride2,  -1, -1);
+}
+
+THTensor *THTensor_(newWithStorage4d)(THStorage *storage, long storageOffset,
+                               long size0, long stride0,
+                               long size1, long stride1,
+                               long size2, long stride2,
+                               long size3, long stride3)
+{
+  long size[4] = {size0, size1, size2, size3};
+  long stride[4] = {stride0, stride1, stride2, stride3};
+
+  THTensor *self = THAlloc(sizeof(THTensor));
+  THTensor_(rawInit)(self);
+  THTensor_(rawSet)(self, storage, storageOffset, 4, size, stride);
+
+  return self;
+}
+
+/* New tensor of the given size; storage is allocated by rawResize
+   (via newWithStorage with a NULL storage). */
+THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride)
+{
+  return THTensor_(newWithStorage)(NULL, 0, size, stride);
+}
+
+/* 1d/2d/3d variants delegate to the 4d one; -1 marks unused trailing dims. */
+THTensor *THTensor_(newWithSize1d)(long size0)
+{
+  return THTensor_(newWithSize4d)(size0, -1, -1, -1);
+}
+
+THTensor *THTensor_(newWithSize2d)(long size0, long size1)
+{
+  return THTensor_(newWithSize4d)(size0, size1, -1, -1);
+}
+
+THTensor *THTensor_(newWithSize3d)(long size0, long size1, long size2)
+{
+  return THTensor_(newWithSize4d)(size0, size1, size2, -1);
+}
+
+THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size3)
+{
+  long size[4] = {size0, size1, size2, size3};
+
+  THTensor *self = THAlloc(sizeof(THTensor));
+  THTensor_(rawInit)(self);
+  THTensor_(rawResize)(self, 4, size, NULL);
+
+  return self;
+}
+
+/* Deep copy: new contiguous tensor of the same size with the elements copied. */
+THTensor *THTensor_(newClone)(THTensor *self)
+{
+  THTensor *tensor = THTensor_(new)();
+  THTensor_(resizeAs)(tensor, self);
+  THTensor_(copy)(tensor, self);
+  return tensor;
+}
+
+/* Returns `self` (retained) when already contiguous, otherwise a contiguous
+   deep copy.  Either way the caller owns a reference and must free it. */
+THTensor *THTensor_(newContiguous)(THTensor *self)
+{
+  if(!THTensor_(isContiguous)(self))
+    return THTensor_(newClone)(self);
+  else
+  {
+    THTensor_(retain)(self);
+    return self;
+  }
+}
+
+/* The following four build a new view of `tensor` and then apply the
+   corresponding in-place shape operation to it.  No data is copied. */
+THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_)
+{
+  THTensor *self = THTensor_(newWithTensor)(tensor);
+  THTensor_(select)(self, NULL, dimension_, sliceIndex_);
+  return self;
+}
+
+THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_)
+{
+  THTensor *self = THTensor_(newWithTensor)(tensor);
+  THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_);
+  return self;
+}
+
+THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_)
+{
+  THTensor *self = THTensor_(newWithTensor)(tensor);
+  THTensor_(transpose)(self, NULL, dimension1_, dimension2_);
+  return self;
+}
+
+THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_)
+{
+  THTensor *self = THTensor_(newWithTensor)(tensor);
+  THTensor_(unfold)(self, NULL, dimension_, size_, step_);
+  return self;
+}
+
+/* Resize */
+/* Resizes to the given size (stride optional); storage grows as needed.
+   Element contents after a resize are unspecified. */
+void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride)
+{
+  THArgCheck(size != NULL, 2, "invalid size");
+  if(stride)
+    THArgCheck(stride->size == size->size, 3, "invalid stride");
+
+  THTensor_(rawResize)(self, size->size, size->data, (stride ? stride->data : NULL));
+}
+
+/* Resizes `self` to match `src`'s size; no-op when the sizes already match. */
+void THTensor_(resizeAs)(THTensor *self, THTensor *src)
+{
+  if(!THTensor_(isSameSizeAs)(self, src))
+    THTensor_(rawResize)(self, src->nDimension, src->size, NULL);
+}
+
+/* 1d/2d/3d variants delegate to the 4d one; -1 marks unused trailing dims. */
+void THTensor_(resize1d)(THTensor *tensor, long size0)
+{
+  THTensor_(resize4d)(tensor, size0, -1, -1, -1);
+}
+
+void THTensor_(resize2d)(THTensor *tensor, long size0, long size1)
+{
+  THTensor_(resize4d)(tensor, size0, size1, -1, -1);
+}
+
+void THTensor_(resize3d)(THTensor *tensor, long size0, long size1, long size2)
+{
+  THTensor_(resize4d)(tensor, size0, size1, size2, -1);
+}
+
+void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, long size3)
+{
+  long size[4] = {size0, size1, size2, size3};
+
+  THTensor_(rawResize)(self, 4, size, NULL);
+}
+
+void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4)
+{
+    long size[5] = {size0, size1, size2, size3, size4};
+
+  THTensor_(rawResize)(self, 5, size, NULL);
+}
+
+/* Makes `self` view the same storage/offset/size/stride as `src`
+   (shares storage; no element copy).  Self-assignment is a no-op. */
+void THTensor_(set)(THTensor *self, THTensor *src)
+{
+  if(self != src)
+    THTensor_(rawSet)(self,
+                      src->storage,
+                      src->storageOffset,
+                      src->nDimension,
+                      src->size,
+                      src->stride);
+}
+
+/* Points `self` at an explicit storage/offset/size/stride; size_/stride_
+   may each be NULL, but must agree in length when both are given. */
+void THTensor_(setStorage)(THTensor *self, THStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_)
+{
+  if(size_ && stride_)
+    THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
+
+  THTensor_(rawSet)(self,
+                    storage_,
+                    storageOffset_,
+                    (size_ ? size_->size : (stride_ ? stride_->size : 0)),
+                    (size_ ? size_->data : NULL),
+                    (stride_ ? stride_->data : NULL));
+}
+
+/* 1d/2d/3d variants delegate to the 4d one; -1 marks unused trailing dims. */
+void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                             long size0_, long stride0_)
+{
+  THTensor_(setStorage4d)(self, storage_, storageOffset_,
+                          size0_, stride0_,
+                          -1, -1,
+                          -1, -1,
+                          -1, -1);
+}
+
+void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_)
+{
+  THTensor_(setStorage4d)(self, storage_, storageOffset_,
+                          size0_, stride0_,
+                          size1_, stride1_,
+                          -1, -1,
+                          -1, -1);
+}
+
+void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_,
+                             long size2_, long stride2_)
+{
+  THTensor_(setStorage4d)(self, storage_, storageOffset_,
+                          size0_, stride0_,
+                          size1_, stride1_,
+                          size2_, stride2_,
+                          -1, -1);
+}
+
+void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                             long size0_, long stride0_,
+                             long size1_, long stride1_,
+                             long size2_, long stride2_,
+                             long size3_, long stride3_)
+{
+
+  long size[4] = {size0_, size1_, size2_, size3_};
+  long stride[4] = {stride0_, stride1_, stride2_, stride3_};
+
+  THTensor_(rawSet)(self, storage_, storageOffset_, 4, size, stride);
+}
+
+
+/* In-place narrowing: `self` becomes a view of `src` restricted to
+   [firstIndex, firstIndex+size) along `dimension`.  src == NULL means
+   operate on `self`.  `size` must be strictly positive. */
+void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, long firstIndex, long size)
+{
+  if(!src)
+    src = self;
+
+  THArgCheck( (dimension >= 0) && (dimension < src->nDimension), 2, "out of range");
+  THArgCheck( (firstIndex >= 0) && (firstIndex < src->size[dimension]), 3, "out of range");
+  THArgCheck( (size > 0) && (firstIndex+size <= src->size[dimension]), 4, "out of range");
+
+  THTensor_(set)(self, src);
+
+  /* shift the view's start by firstIndex steps of this dimension's stride */
+  if(firstIndex > 0)
+    self->storageOffset += firstIndex*self->stride[dimension];
+
+  self->size[dimension] = size;
+}
+
+/* In-place select: `self` becomes the slice of `src` at `sliceIndex` along
+   `dimension`, with that dimension removed (nDimension drops by one).
+   Not allowed on 1-D tensors, since 0-D results are unsupported. */
+void THTensor_(select)(THTensor *self, THTensor *src, int dimension, long sliceIndex)
+{
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck(src->nDimension > 1, 1, "cannot select on a vector");
+  THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "out of range");
+  THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 3, "out of range");
+
+  THTensor_(set)(self, src);
+  THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1);
+  /* squeeze out the now size-1 dimension by shifting later dims down */
+  for(d = dimension; d < self->nDimension-1; d++)
+  {
+    self->size[d] = self->size[d+1];
+    self->stride[d] = self->stride[d+1];
+  }
+  self->nDimension--;
+}
+
+/* In-place transpose: `self` becomes a view of `src` with dimension1 and
+   dimension2 swapped.  Implemented purely by exchanging the two entries of
+   the size and stride arrays -- no data movement. */
+void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2)
+{
+  long z;
+
+  if(!src)
+    src = self;
+
+  THArgCheck( (dimension1 >= 0) && (dimension1 < src->nDimension), 1, "out of range");
+  THArgCheck( (dimension2 >= 0) && (dimension2 < src->nDimension), 2, "out of range");
+
+  THTensor_(set)(self, src);
+
+  if(dimension1 == dimension2)
+	  return;
+
+  z = self->stride[dimension1];
+  self->stride[dimension1] = self->stride[dimension2];
+  self->stride[dimension2] = z;
+  z = self->size[dimension1];
+  self->size[dimension1] = self->size[dimension2];
+  self->size[dimension2] = z;
+}
+
+/* In-place unfold: `self` becomes a view of `src` where `dimension` is
+   replaced by (size[dim]-size)/step+1 windows of length `size` taken every
+   `step` elements, and a new last dimension of length `size` indexes within
+   each window (its stride is the original dimension's stride, so windows
+   overlap when step < size).  nDimension grows by one. */
+void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, long size, long step)
+{
+  long *newSize;
+  long *newStride;
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck( (src->nDimension > 0), 1, "cannot unfold an empty tensor");
+  THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "out of range");
+  THArgCheck(size <= src->size[dimension], 3, "out of range");
+  THArgCheck(step > 0, 4, "invalid step");
+
+  THTensor_(set)(self, src);
+
+  newSize = THAlloc(sizeof(long)*(self->nDimension+1));
+  newStride = THAlloc(sizeof(long)*(self->nDimension+1));
+
+  /* trailing dimension walks within a window */
+  newSize[self->nDimension] = size;
+  newStride[self->nDimension] = self->stride[dimension];
+  for(d = 0; d < self->nDimension; d++)
+  {
+    if(d == dimension)
+    {
+      /* number of windows; consecutive windows are `step` elements apart */
+      newSize[d] = (self->size[d] - size) / step + 1;
+      newStride[d] = step*self->stride[d];
+    }
+    else
+    {
+      newSize[d] = self->size[d];
+      newStride[d] = self->stride[d];
+    }
+  }
+
+  THFree(self->size);
+  THFree(self->stride);
+
+  self->size = newSize;
+  self->stride = newStride;
+  self->nDimension++;
+}
+
+/* we have to handle the case where the result is a number */
+/* In-place squeeze: `self` becomes a view of `src` with every size-1
+   dimension removed.  If all dimensions are size 1, a single 1-element
+   dimension is kept (0-D tensors are not supported). */
+void THTensor_(squeeze)(THTensor *self, THTensor *src)
+{
+  int ndim = 0;
+  int d;
+
+  if(!src)
+    src = self;
+
+  THTensor_(set)(self, src);
+
+  /* compact non-singleton dims to the front, counting them in ndim */
+  for(d = 0; d < src->nDimension; d++)
+  {
+    if(src->size[d] != 1)
+    {
+      if(d != ndim)
+      {
+        self->size[ndim] = src->size[d];
+        self->stride[ndim] = src->stride[d];
+      }
+      ndim++;
+    }
+  }
+
+  /* right now, we do not handle 0-dimension tensors */
+  if(ndim == 0 && src->nDimension > 0)
+  {
+    self->size[0] = 1;
+    self->stride[0] = 1;
+    ndim = 1;
+  }
+  self->nDimension = ndim;
+}
+
+/* In-place squeeze of a single dimension: removes `dimension` when its size
+   is 1 (no-op otherwise, and never reduces a 1-D tensor to 0-D). */
+void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension)
+{
+  int d;
+
+  if(!src)
+    src = self;
+
+  THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "dimension out of range");
+
+  THTensor_(set)(self, src);
+
+  if(src->size[dimension] == 1 && src->nDimension > 1)
+  {
+    for(d = dimension; d < self->nDimension-1; d++)
+    {
+      self->size[d] = self->size[d+1];
+      self->stride[d] = self->stride[d+1];
+    }
+    self->nDimension--;
+  }
+}
+
+/* True when the elements are laid out densely in row-major order, i.e.
+   walking dims from last to first, each non-singleton dim's stride equals
+   the product of the sizes of the dims after it (singleton dims' strides
+   are ignored). */
+int THTensor_(isContiguous)(const THTensor *self)
+{
+  long z = 1;
+  int d;
+  for(d = self->nDimension-1; d >= 0; d--)
+  {
+    if(self->size[d] != 1)
+    {
+      if(self->stride[d] == z)
+        z *= self->size[d];
+      else
+        return 0;
+    }
+  }
+  return 1;
+}
+
+/* True when the tensor's sizes exactly match the given THLongStorage. */
+int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims)
+{
+  int d;
+  if (self->nDimension != dims->size)
+    return 0;
+
+  for(d = 0; d < self->nDimension; ++d)
+  {
+    if(self->size[d] != dims->data[d])
+      return 0;
+  }
+  return 1;
+}
+
+/* True when both tensors have identical nDimension and sizes (strides may differ). */
+int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src)
+{
+  int d;
+  if (self->nDimension != src->nDimension)
+    return 0;
+  for(d = 0; d < self->nDimension; ++d)
+  {
+    if(self->size[d] != src->size[d])
+      return 0;
+  }
+  return 1;
+}
+
+/* True when `self` is an exact alias of `src`: same storage object, same
+   offset, and identical sizes and strides. */
+int THTensor_(isSetTo)(const THTensor *self, const THTensor* src)
+{
+  if (!self->storage)
+    return 0;
+  if (self->storage == src->storage &&
+      self->storageOffset == src->storageOffset &&
+      self->nDimension == src->nDimension)
+  {
+    int d;
+    for (d = 0; d < self->nDimension; ++d)
+    {
+      if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d])
+        return 0;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+/* Total number of elements (product of sizes); 0 for an empty tensor. */
+long THTensor_(nElement)(const THTensor *self)
+{
+  if(self->nDimension == 0)
+    return 0;
+  else
+  {
+    long nElement = 1;
+    int d;
+    for(d = 0; d < self->nDimension; d++)
+      nElement *= self->size[d];
+    return nElement;
+  }
+}
+
+/* Increments the reference count (atomic).  Tensors without the
+   TH_TENSOR_REFCOUNTED flag are not refcounted and are left untouched. */
+void THTensor_(retain)(THTensor *self)
+{
+  if(self->flag & TH_TENSOR_REFCOUNTED)
+    THAtomicIncrementRef(&self->refcount);
+}
+
+/* Drops one reference; on the last one, releases size/stride arrays, the
+   storage reference, and the struct itself.  NULL is a no-op.  Tensors
+   without TH_TENSOR_REFCOUNTED are never freed here -- presumably they are
+   owned elsewhere (NOTE(review): confirm against callers). */
+void THTensor_(free)(THTensor *self)
+{
+  if(!self)
+    return;
+
+  if(self->flag & TH_TENSOR_REFCOUNTED)
+  {
+    if(THAtomicDecrementRef(&self->refcount))
+    {
+      THFree(self->size);
+      THFree(self->stride);
+      if(self->storage)
+        THStorage_(free)(self->storage);
+      THFree(self);
+    }
+  }
+}
+
+/* Copies `self` into `dst` (unless they are the same tensor) and then
+   releases `self`.  Used to commit a temporary result into a destination. */
+void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst)
+{
+  if(self != dst)
+    THTensor_(copy)(dst, self);
+
+  THTensor_(free)(self);
+}
+
+/*******************************************************************************/
+
+/* Initializes a freshly THAlloc'd tensor: one reference, no storage,
+   zero dimensions, refcounting enabled. */
+static void THTensor_(rawInit)(THTensor *self)
+{
+  self->refcount = 1;
+  self->storage = NULL;
+  self->storageOffset = 0;
+  self->size = NULL;
+  self->stride = NULL;
+  self->nDimension = 0;
+  self->flag = TH_TENSOR_REFCOUNTED;
+}
+
+/* Points `self` at (storage, storageOffset) and applies size/stride via
+   rawResize.  Retains the new storage and releases the old one when they
+   differ.  Errors out on a negative offset. */
+static void THTensor_(rawSet)(THTensor *self, THStorage *storage, long storageOffset, int nDimension, long *size, long *stride)
+{
+  /* storage */
+  if(self->storage != storage)
+  {
+    if(self->storage)
+      THStorage_(free)(self->storage);
+
+    if(storage)
+    {
+      self->storage = storage;
+      THStorage_(retain)(self->storage);
+    }
+    else
+      self->storage = NULL;
+  }
+
+  /* storageOffset */
+  if(storageOffset < 0)
+    THError("Tensor: invalid storage offset");
+  self->storageOffset = storageOffset;
+
+  /* size and stride */
+  THTensor_(rawResize)(self, nDimension, size, stride);
+}
+
+/* Core resize.  The effective dimensionality is the prefix of `size` with
+   strictly positive entries (a non-positive size acts as a terminator, which
+   is how the fixed-arity 1d..4d wrappers pass -1 for unused dims).  A
+   negative entry in `stride` means "compute a dense row-major stride for
+   this dim".  When the requested geometry already matches, this is a no-op;
+   otherwise size/stride arrays are reallocated and the backing storage is
+   created or grown (never shrunk) to fit the farthest addressed element. */
+static void THTensor_(rawResize)(THTensor *self, int nDimension, long *size, long *stride)
+{
+  int d;
+  int nDimension_;
+  long totalSize;
+  int hascorrectsize = 1;
+
+  nDimension_ = 0;
+  for(d = 0; d < nDimension; d++)
+  {
+    if(size[d] > 0)
+    {
+      nDimension_++;
+      if((self->nDimension > d) && (size[d] != self->size[d]))
+        hascorrectsize = 0;
+
+      if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d]))
+        hascorrectsize = 0;
+    }
+    else
+      break;
+  }
+  nDimension = nDimension_;
+
+  if(nDimension != self->nDimension)
+    hascorrectsize = 0;
+
+  if(hascorrectsize)
+    return;
+
+  if(nDimension > 0)
+  {
+    if(nDimension != self->nDimension)
+    {
+      self->size = THRealloc(self->size, sizeof(long)*nDimension);
+      self->stride = THRealloc(self->stride, sizeof(long)*nDimension);
+      self->nDimension = nDimension;
+    }
+
+    /* fill sizes/strides back-to-front; totalSize ends up as 1 plus the
+       largest linear offset addressed by the new geometry */
+    totalSize = 1;
+    for(d = self->nDimension-1; d >= 0; d--)
+    {
+      self->size[d] = size[d];
+      if(stride && (stride[d] >= 0) )
+        self->stride[d] = stride[d];
+      else
+      {
+        if(d == self->nDimension-1)
+          self->stride[d] = 1;
+        else
+          self->stride[d] = self->size[d+1]*self->stride[d+1];
+      }
+      totalSize += (self->size[d]-1)*self->stride[d];
+    }
+
+    /* lazily create the storage and grow it if the view extends past it */
+    if(totalSize+self->storageOffset > 0)
+    {
+      if(!self->storage)
+        self->storage = THStorage_(new)();
+      if(totalSize+self->storageOffset > self->storage->size)
+        THStorage_(resize)(self->storage, totalSize+self->storageOffset);
+    }
+  }
+  else
+    self->nDimension = 0;
+}
+
+/* Checked scalar accessors: each setNd/getNd requires exactly N dimensions,
+   bounds-checks every index, and maps (x0..xN-1) to a storage position via
+   storageOffset + sum(xi * stride[i]). */
+void THTensor_(set1d)(THTensor *tensor, long x0, real value)
+{
+  THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
+  THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value);
+}
+
+real THTensor_(get1d)(const THTensor *tensor, long x0)
+{
+  THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
+  return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]);
+}
+
+void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value)
+{
+  THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
+  THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value);
+}
+
+real THTensor_(get2d)(const THTensor *tensor, long x0, long x1)
+{
+  THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
+  return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]);
+}
+
+void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value)
+{
+  THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
+  THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value);
+}
+
+real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2)
+{
+  THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
+  THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
+  return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]);
+}
+
+void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value)
+{
+  THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
+  THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value);
+}
+
+real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3)
+{
+  THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
+  THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
+  return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]);
+}
+
+/* Human-readable description, e.g. "torch.FloatTensor of size 3x4".
+   Fixes vs. the previous version: (1) `#x` stringified the literal token
+   `x`, printing "torch.xTensor" for every type -- a two-level macro now
+   expands the generated type macro `Real` (e.g. Float) first; (2) the
+   truncation branch wrote to str+L-4+n, which is past the end of the
+   buffer whenever it runs (n >= L there) -- it must write at str+L-4;
+   (3) the "x" separator write is now skipped once n >= L so snprintf is
+   never called with a negative remaining size. */
+THDescBuff THTensor_(desc)(const THTensor *tensor) {
+  const int L = TH_DESC_BUFF_LEN;
+  THDescBuff buf;
+  char *str = buf.str;
+  int n = 0;
+#define TH_DESC_STR_(x) #x
+#define _stringify(x) TH_DESC_STR_(x)
+  n += snprintf(str, L-n, "torch." _stringify(Real) "Tensor of size ");
+#undef _stringify
+#undef TH_DESC_STR_
+  int i;
+  for(i = 0; i < tensor->nDimension; i++) {
+    if(n >= L) break;
+    n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
+    if(n < L && i < tensor->nDimension-1) {
+      n += snprintf(str+n, L-n, "x");
+    }
+  }
+  if(n >= L) {
+    /* output was truncated: mark the tail with an ellipsis, in-bounds */
+    snprintf(str+L-4, 4, "...");
+  }
+  return buf;
+}
+
+/* Compact size description, e.g. "[3 x 4 x 5]", with "...]" appended when
+   the buffer would overflow.  NOTE(review): after the first snprintf inside
+   the loop, n can exceed L before the " x " write, making L-n negative --
+   harmless only while snprintf's would-be length stays small; consider
+   guarding like THTensor_(desc). */
+THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) {
+  const int L = TH_DESC_BUFF_LEN;
+  THDescBuff buf;
+  char *str = buf.str;
+  int n = 0;
+  n += snprintf(str, L-n, "[");
+  int i;
+  for(i = 0; i < tensor->nDimension; i++) {
+    if(n >= L) break;
+    n += snprintf(str+n, L-n, "%ld", tensor->size[i]);
+    if(i < tensor->nDimension-1) {
+      n += snprintf(str+n, L-n, " x ");
+    }
+  }
+  if(n < L - 2) {
+    snprintf(str+n, L-n, "]");
+  } else {
+    snprintf(str+L-5, 5, "...]");
+  }
+  return buf;
+}
+
+#endif
diff --git a/lib/TH/generic/THTensor.h b/lib/TH/generic/THTensor.h
new file mode 100644
index 0000000..7a3d585
--- /dev/null
+++ b/lib/TH/generic/THTensor.h
@@ -0,0 +1,130 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensor.h"
+#else
+
+/* a la lua? dim, storageoffset, ...  et les methodes ? */
+
+#define TH_TENSOR_REFCOUNTED 1
+
+/* An N-dimensional view over a THStorage: size/stride per dimension plus an
+   element offset into the shared storage. */
+typedef struct THTensor
+{
+    long *size;              /* length of each dimension (nDimension entries) */
+    long *stride;            /* element step per dimension (nDimension entries) */
+    int nDimension;          /* number of dimensions; 0 means empty */
+    
+    THStorage *storage;      /* backing buffer, shared and refcounted; may be NULL */
+    long storageOffset;      /* index of element (0,...,0) within storage */
+    int refcount;            /* reference count (used when flag has TH_TENSOR_REFCOUNTED) */
+
+    char flag;               /* bitmask; TH_TENSOR_REFCOUNTED enables retain/free */
+
+} THTensor;
+
+
+/**** access methods ****/
+TH_API THStorage* THTensor_(storage)(const THTensor *self);
+TH_API long THTensor_(storageOffset)(const THTensor *self);
+TH_API int THTensor_(nDimension)(const THTensor *self);
+TH_API long THTensor_(size)(const THTensor *self, int dim);
+TH_API long THTensor_(stride)(const THTensor *self, int dim);
+TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self);
+TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self);
+TH_API real *THTensor_(data)(const THTensor *self);
+
+TH_API void THTensor_(setFlag)(THTensor *self, const char flag);
+TH_API void THTensor_(clearFlag)(THTensor *self, const char flag);
+
+
+/**** creation methods ****/
+TH_API THTensor *THTensor_(new)(void);
+TH_API THTensor *THTensor_(newWithTensor)(THTensor *tensor);
+/* stride might be NULL */
+TH_API THTensor *THTensor_(newWithStorage)(THStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+TH_API THTensor *THTensor_(newWithStorage1d)(THStorage *storage_, long storageOffset_,
+                                long size0_, long stride0_);
+TH_API THTensor *THTensor_(newWithStorage2d)(THStorage *storage_, long storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_);
+TH_API THTensor *THTensor_(newWithStorage3d)(THStorage *storage_, long storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_,
+                                long size2_, long stride2_);
+TH_API THTensor *THTensor_(newWithStorage4d)(THStorage *storage_, long storageOffset_,
+                                long size0_, long stride0_,
+                                long size1_, long stride1_,
+                                long size2_, long stride2_,
+                                long size3_, long stride3_);
+
+/* stride might be NULL */
+TH_API THTensor *THTensor_(newWithSize)(THLongStorage *size_, THLongStorage *stride_);
+TH_API THTensor *THTensor_(newWithSize1d)(long size0_);
+TH_API THTensor *THTensor_(newWithSize2d)(long size0_, long size1_);
+TH_API THTensor *THTensor_(newWithSize3d)(long size0_, long size1_, long size2_);
+TH_API THTensor *THTensor_(newWithSize4d)(long size0_, long size1_, long size2_, long size3_);
+
+TH_API THTensor *THTensor_(newClone)(THTensor *self);
+TH_API THTensor *THTensor_(newContiguous)(THTensor *tensor);
+TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_);
+TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_);
+TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_);
+TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_);
+  
+TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride);
+TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src);
+TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_);
+TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_);
+TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_);
+TH_API void THTensor_(resize4d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_);
+TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_);
+
+TH_API void THTensor_(set)(THTensor *self, THTensor *src);
+TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                                    long size0_, long stride0_);
+TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_);
+TH_API void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_,
+                                    long size2_, long stride2_);
+TH_API void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, long storageOffset_,
+                                    long size0_, long stride0_,
+                                    long size1_, long stride1_,
+                                    long size2_, long stride2_,
+                                    long size3_, long stride3_);
+
+TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, long firstIndex_, long size_);
+TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, long sliceIndex_);
+TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_);
+TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, long size_, long step_);
+
+TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src);
+TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_);
+
+TH_API int THTensor_(isContiguous)(const THTensor *self);
+TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src);
+TH_API int THTensor_(isSetTo)(const THTensor *self, const THTensor *src);
+TH_API int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims);
+TH_API long THTensor_(nElement)(const THTensor *self);
+
+TH_API void THTensor_(retain)(THTensor *self);
+TH_API void THTensor_(free)(THTensor *self);
+TH_API void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst);
+
+/* Slow access methods [check everything] */
+TH_API void THTensor_(set1d)(THTensor *tensor, long x0, real value);
+TH_API void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value);
+TH_API void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value);
+TH_API void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value);
+
+TH_API real THTensor_(get1d)(const THTensor *tensor, long x0);
+TH_API real THTensor_(get2d)(const THTensor *tensor, long x0, long x1);
+TH_API real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2);
+TH_API real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3);
+
+/* Debug methods */
+TH_API THDescBuff THTensor_(desc)(const THTensor *tensor);
+TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor);
+
+#endif
diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c
new file mode 100644
index 0000000..da37989
--- /dev/null
+++ b/lib/TH/generic/THTensorConv.c
@@ -0,0 +1,1959 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorConv.c"
+#else
+
+
+/*
+  2D Input, 2D kernel : valid cross-correlation of a single image with a
+  single kernel, accumulated into the output:
+      r_ += alpha * xcorr2(t_, k_)
+  r_    : output, (ir-kr)/sr+1 rows by (ic-kc)/sc+1 cols (accumulated into)
+  t_    : input image, ir rows by ic cols
+  k_    : kernel, kr rows by kc cols
+  sr,sc : row/column strides
+*/
+void THTensor_(validXCorr2Dptr)(real *r_,
+                                       real alpha,
+                                       real *t_, long ir, long ic,
+                                       real *k_, long kr, long kc,
+                                       long sr, long sc)
+{
+  long or = (ir - kr) / sr + 1;  /* output rows */
+  long oc = (ic - kc) / sc + 1;  /* output cols */
+
+  long xx, yy, kx, ky;
+
+  /* vectorized path below requires unit column stride and enough output
+     columns to be worthwhile */
+  if ((sc != 1) || (oc < 4))  {
+    /* regular convolution */
+    for(yy = 0; yy < or; yy++) {
+      for(xx = 0; xx < oc; xx++) {
+        /* Dot product in two dimensions... (between input image and the mask) */
+        real *pi_ = t_ + yy*sr*ic + xx*sc;
+        real *pw_ = k_;
+        real sum = 0;
+        for(ky = 0; ky < kr; ky++) {
+          for(kx = 0; kx < kc; kx++) {
+            sum += pi_[kx]*pw_[kx];
+          }
+          pi_ += ic; /* next input line */
+          pw_ += kc; /* next mask line */
+        }
+        /* Update output */
+        *r_++ += alpha*sum;
+      }
+    }
+
+  } else {
+    /* SSE-based convolution: each kernel element updates a whole output
+       row at once via THVector_(add) */
+    for(yy = 0; yy < or; yy++) {
+      real *pi_ = t_ + yy*sr*ic;
+      real *pw_ = k_;
+      for (ky = 0; ky < kr; ky++) {
+        real *pis_ = pi_;
+        for (kx = 0; kx < kc; kx++) {
+          THVector_(add)(r_, pis_, alpha*pw_[kx], oc);
+          pis_++;
+        }
+        pi_ += ic; /* next input line */
+        pw_ += kc; /* next mask line */
+      }
+      r_ += oc;
+    }
+  }
+}
+
+/*
+  2D Input, 2D kernel : valid true convolution (kernel flipped, unlike
+  cross-correlation) of a single image with a single kernel, accumulated
+  into the output: r_ += alpha * conv2(t_, k_).
+  t_ : input image (ir x ic); k_ : kernel (kr x kc); sr,sc : strides.
+*/
+void THTensor_(validConv2Dptr)(real *r_,
+                                      real alpha,
+                                      real *t_, long ir, long ic,
+                                      real *k_, long kr, long kc,
+                                      long sr, long sc)
+{
+  long or = (ir - kr) / sr + 1;  /* output rows */
+  long oc = (ic - kc) / sc + 1;  /* output cols */
+
+  long xx, yy, kx, ky;
+
+  /* vectorized path requires unit column stride and enough output columns */
+  if ((sc != 1) || (oc < 4))  {
+    /* regular convolution */
+    for(yy = 0; yy < or; yy++) {
+      for(xx = 0; xx < oc; xx++) {
+        /* Dot product in two dimensions... (between input image and the mask) */
+        real *pi_ = t_ + yy*sr*ic + xx*sc;
+        real *pw_ = k_ + kr*kc - 1;  /* last kernel element; walked backwards to flip the kernel */
+        real sum = 0;
+        for(ky = 0; ky < kr; ky++) {
+          for(kx = 0; kx < kc; kx++) {
+            sum += pi_[kx]*pw_[-kx];
+          }
+          pi_ += ic; /* next input line */
+          pw_ -= kc; /* next mask line */
+        }
+        /* Update output */
+        *r_++ += alpha*sum;
+      }
+    }
+
+  } else {
+    /* SSE-based convolution */
+    for(yy = 0; yy < or; yy++) {
+      real *pw_ = k_ + kr*kc - 1;  /* kernel traversed in reverse (flip) */
+      real *pi_ = t_ + yy*sr*ic;
+      for (ky = 0; ky < kr; ky++) {
+        real *pis_ = pi_;
+        for (kx = 0; kx < kc; kx++) {
+          THVector_(add)(r_, pis_, alpha*pw_[-kx], oc);
+          pis_++;
+        }
+        pi_ += ic; /* next input line */
+        pw_ -= kc; /* next mask line */
+      }
+      r_ += oc;
+    }
+  }
+}
+
+/*
+  2D Input, 2D kernel : full convolution of a single image with a single
+  kernel, accumulated into the output. Scatter formulation: every input
+  pixel adds a scaled copy of the kernel into the output, so r_ must be
+  (ir-1)*sr+kr rows by (ic-1)*sc+kc cols.
+*/
+void THTensor_(fullConv2Dptr)(real *r_,
+                                     real alpha,
+                                     real *t_, long ir, long ic,
+                                     real *k_, long kr, long kc,
+                                     long sr, long sc)
+{
+  long oc = (ic - 1) * sc + kc;  /* output columns (row pitch of r_) */
+
+  long xx, yy, kx, ky;
+
+  /* vectorized path requires unit column stride and enough input columns */
+  if ((sc != 1) || (ic < 4))  {
+    /* regular convolution */
+    for(yy = 0; yy < ir; yy++) {
+      for(xx = 0; xx < ic; xx++) {
+        /* Outer product in two dimensions... (between input image and the mask) */
+        real *po_ = r_ + yy*sr*oc + xx*sc;
+        real *pw_ = k_;
+        for(ky = 0; ky < kr; ky++)
+        {
+          real z = *t_ * alpha;
+          for(kx = 0; kx < kc; kx++) {
+            po_[kx] += z * pw_[kx];
+          }
+          po_ += oc; /* next input line */
+          pw_ += kc; /* next mask line */
+        }
+        t_++;
+      }
+    }
+
+  } else {
+    /* SSE-based convolution: each kernel element scatters a whole input
+       row into the output at once */
+    for(yy = 0; yy < ir; yy++) {
+      real *po_ = r_ + yy*sr*oc;
+      real *pw_ = k_;
+      for (ky = 0; ky < kr; ky++) {
+        real *pos_ = po_;
+        for (kx = 0; kx < kc; kx++) {
+          THVector_(add)(pos_, t_, alpha*pw_[kx], ic);
+          pos_++;
+        }
+        po_ += oc; /* next input line */
+        pw_ += kc; /* next mask line */
+      }
+      t_ += ic;
+    }
+  }
+}
+
+/*
+  2D Input, 2D kernel : full cross-correlation of a single image with a
+  single kernel, accumulated into the output. Scatter formulation: every
+  input pixel adds a scaled, reverse-traversed copy of the kernel into the
+  output, so r_ must be (ir-1)*sr+kr rows by (ic-1)*sc+kc cols.
+*/
+void THTensor_(fullXCorr2Dptr)(real *r_,
+                                      real alpha,
+                                      real *t_, long ir, long ic,
+                                      real *k_, long kr, long kc,
+                                      long sr, long sc)
+{
+  long oc = (ic - 1) * sc + kc;  /* output columns (row pitch of r_) */
+
+  long xx, yy, kx, ky;
+
+  /* vectorized path requires unit column stride and enough input columns */
+  if ((sc != 1) || (ic < 4))  {
+    /* regular convolution */
+    for(yy = 0; yy < ir; yy++) {
+      for(xx = 0; xx < ic; xx++) {
+        /* Outer product in two dimensions... (between input image and the mask) */
+        real *po_ = r_ + yy*sr*oc + xx*sc;
+        real *pw_ = k_ + kr*kc -1;  /* last kernel element; walked backwards */
+        for(ky = 0; ky < kr; ky++)
+        {
+          real z = *t_ * alpha;
+          for(kx = 0; kx < kc; kx++) {
+            po_[kx] += z * pw_[-kx];
+          }
+          po_ += oc; /* next input line */
+          pw_ -= kc; /* next mask line */
+        }
+        t_++;
+      }
+    }
+
+  } else {
+    /* SSE-based convolution */
+    for(yy = 0; yy < ir; yy++) {
+      real *po_ = r_ + yy*sr*oc;
+      real *pw_ = k_ + kr*kc -1;
+      for (ky = 0; ky < kr; ky++) {
+        real *pos_ = po_;
+        for (kx = 0; kx < kc; kx++) {
+          THVector_(add)(pos_, t_, pw_[-kx]*alpha, ic);
+          pos_++;
+        }
+        po_ += oc; /* next input line */
+        pw_ -= kc; /* next mask line */
+      }
+      t_ += ic;
+    }
+  }
+}
+
+/*
+  2D Input, 2D kernel : valid cross-correlation, "reversed" loop order.
+  For sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is
+  useful for calculating derivatives wrt a kernel that is applied with
+  stride sr,sc != 1: the outer loops run over kernel elements, and each
+  element scatters its alpha-scaled weight over the whole output.
+*/
+void THTensor_(validXCorr2DRevptr)(real *r_,
+                                          real alpha,
+                                          real *t_, long ir, long ic,
+                                          real *k_, long kr, long kc,
+                                          long sr, long sc)
+{
+  long or = ir - (kr - 1) * sr;  /* output rows */
+  long oc = ic - (kc - 1) * sc;  /* output cols */
+
+  long xx, yy, kx, ky;
+
+  /* vectorized path requires unit column stride and enough kernel columns */
+  if ((sc != 1) || (kc < 4))  {
+    /* regular convolution */
+    for(yy = 0; yy < kr; yy++) {
+      for(xx = 0; xx < kc; xx++) {
+        real *po_ = r_;
+        real *pi_ = t_ + yy*sr*ic + xx*sc;
+        real z = *k_++ * alpha;  /* current kernel weight, pre-scaled */
+
+        for(ky = 0; ky < or; ky++) {
+          for(kx = 0; kx < oc; kx++)
+            po_[kx] += z * pi_[kx];
+          pi_ += ic;
+          po_ += oc;
+        }
+      }
+    }
+
+  } else {
+    /* SSE-based convolution */
+    for(yy = 0; yy < kr; yy++) {
+      for(xx = 0; xx < kc; xx++) {
+        real *po_ = r_;
+        real *pi_ = t_ + yy*sr*ic + xx*sc;
+        real z = *k_++ * alpha;
+
+        for(ky = 0; ky < or; ky++) {
+          THVector_(add)(po_, pi_, z, oc);
+          pi_ += ic;
+          po_ += oc;
+        }
+      }
+    }
+  }
+}
+/*
+  3D Input, 3D kernel : valid cross-correlation of a volume with a kernel,
+  accumulated into the output: r_ += alpha * xcorr3(t_, k_).
+  t_ : input volume (it x ir x ic); k_ : kernel (kt x kr x kc);
+  st,sr,sc : depth/row/column strides.
+*/
+void THTensor_(validXCorr3Dptr)(real *r_,
+                                       real alpha,
+                                       real *t_, long it, long ir, long ic,
+                                       real *k_, long kt, long kr, long kc,
+                                       long st, long sr, long sc)
+{
+  long ot = (it - kt) / st + 1;  /* output depth */
+  long or = (ir - kr) / sr + 1;  /* output rows */
+  long oc = (ic - kc) / sc + 1;  /* output cols */
+
+  long zz, xx, yy;
+
+  for (zz = 0; zz < ot; zz++)
+  {
+    for(yy = 0; yy < or; yy++)
+    {
+      for(xx = 0; xx < oc; xx++)
+      {
+        /* Dot product in three dimensions (between input volume and the mask) */
+        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
+        real *pw_ = k_;
+        real sum = 0;
+        long kz, kx, ky;
+        for(kz = 0; kz < kt; kz++)
+        {
+          for(ky = 0; ky < kr; ky++)
+          {
+            for(kx = 0; kx < kc; kx++) {
+              sum += pi_[kx]*pw_[kx];
+            }
+            pi_ += ic; /* next input line */
+            pw_ += kc; /* next mask line */
+          }
+          pi_ += (ir-kr)*ic; /* next input slice */
+        }
+        /* Update output */
+        *r_++ += sum*alpha;
+      }
+    }
+  }
+}
+
+/*
+  3D Input, 3D kernel : valid true convolution (kernel flipped) of a
+  volume with a kernel, accumulated into the output:
+      r_ += alpha * conv3(t_, k_).
+*/
+void THTensor_(validConv3Dptr)(real *r_,
+                                      real alpha,
+                                      real *t_, long it, long ir, long ic,
+                                      real *k_, long kt, long kr, long kc,
+                                      long st, long sr, long sc)
+{
+  long ot = (it - kt) / st + 1;  /* output depth */
+  long or = (ir - kr) / sr + 1;  /* output rows */
+  long oc = (ic - kc) / sc + 1;  /* output cols */
+
+  long zz, xx, yy;
+
+  for(zz = 0; zz < ot; zz++)
+  {
+    for(yy = 0; yy < or; yy++)
+    {
+      for(xx = 0; xx < oc; xx++)
+      {
+        /* Dot product in three dimensions (between input volume and the mask) */
+        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
+        real *pw_ = k_ + kt*kr*kc - 1;  /* last kernel element; walked backwards to flip the kernel */
+        real sum = 0;
+        long kz, kx, ky;
+        for(kz = 0; kz < kt; kz++)
+        {
+          for(ky = 0; ky < kr; ky++)
+          {
+            for(kx = 0; kx < kc; kx++) {
+              sum += pi_[kx]*pw_[-kx];
+            }
+            pi_ += ic; /* next input line */
+            pw_ -= kc; /* next mask line */
+          }
+          pi_ += (ir-kr)*ic; /* next input slice */
+        }
+        /* Update output */
+        *r_++ += alpha*sum;
+      }
+    }
+  }
+}
+
+
+/*
+  3D Input, 3D kernel : full convolution of a volume with a kernel,
+  accumulated into the output. Scatter formulation: every input voxel adds
+  a scaled copy of the kernel into the output, so r_ must be
+  (it-1)*st+kt x (ir-1)*sr+kr x (ic-1)*sc+kc.
+*/
+void THTensor_(fullConv3Dptr)(real *r_,
+                                     real alpha,
+                                     real *t_, long it, long ir, long ic,
+                                     real *k_, long kt, long kr, long kc,
+                                     long st, long sr, long sc)
+{
+  long or = (ir - 1) * sr + kr;  /* output rows (slice pitch factor) */
+  long oc = (ic - 1) * sc + kc;  /* output cols (row pitch) */
+
+  long zz, xx, yy;
+
+  for(zz = 0; zz < it; zz++)
+  {
+    for(yy = 0; yy < ir; yy++)
+    {
+      for(xx = 0; xx < ic; xx++)
+      {
+        /* Outer product in three dimensions (between input voxel and the mask) */
+        real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc;
+        real *pw_ = k_;
+        long kz, kx, ky;
+        for(kz = 0; kz < kt; kz++)
+        {
+          for(ky = 0; ky < kr; ky++)
+          {
+            real z = *t_ * alpha;
+            for(kx = 0; kx < kc; kx++) {
+              po_[kx] += z * pw_[kx];
+            }
+            po_ += oc; /* next input line */
+            pw_ += kc; /* next mask line */
+          }
+          po_ += (or-kr)*oc; /* next output slice */
+        }
+        t_++;
+      }
+    }
+  }
+}
+
+/*
+  3D Input, 3D kernel : full cross-correlation of a volume with a kernel,
+  accumulated into the output. Scatter formulation: every input voxel adds
+  a scaled, reverse-traversed copy of the kernel into the output, so r_
+  must be (it-1)*st+kt x (ir-1)*sr+kr x (ic-1)*sc+kc.
+*/
+void THTensor_(fullXCorr3Dptr)(real *r_,
+                                      real alpha,
+                                      real *t_, long it, long ir, long ic,
+                                      real *k_, long kt, long kr, long kc,
+                                      long st, long sr, long sc)
+{
+  long or = (ir - 1) * sr + kr;  /* output rows (slice pitch factor) */
+  long oc = (ic - 1) * sc + kc;  /* output cols (row pitch) */
+
+  long zz, xx, yy;
+
+  for(zz = 0; zz < it; zz++)
+  {
+    for(yy = 0; yy < ir; yy++)
+    {
+      for(xx = 0; xx < ic; xx++)
+      {
+        /* Outer product in three dimensions (between input voxel and the mask) */
+        real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc;
+        real *pw_ = k_ + kt*kr*kc -1;  /* last kernel element; walked backwards */
+        long kz, kx, ky;
+        for(kz = 0; kz < kt; kz++)
+        {
+          for(ky = 0; ky < kr; ky++)
+          {
+            real z = *t_ * alpha;
+            for(kx = 0; kx < kc; kx++) {
+              po_[kx] += z * pw_[-kx];
+            }
+            po_ += oc; /* next input line */
+            pw_ -= kc; /* next mask line */
+          }
+          po_ += (or-kr)*oc; /* next output slice */
+        }
+        t_++;
+      }
+    }
+  }
+}
+
+/*
+  3D Input, 3D kernel : valid cross-correlation, "reversed" loop order.
+  For sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is
+  useful for calculating derivatives wrt a kernel that is applied with
+  stride sr,sc != 1: the outer loops run over kernel elements, each
+  scattering its alpha-scaled weight over the whole output volume.
+*/
+void THTensor_(validXCorr3DRevptr)(real *r_,
+                                          real alpha,
+                                          real *t_, long it, long ir, long ic,
+                                          real *k_, long kt, long kr, long kc,
+                                          long st, long sr, long sc)
+{
+  long ot = it - (kt - 1) * st;  /* output depth */
+  long or = ir - (kr - 1) * sr;  /* output rows */
+  long oc = ic - (kc - 1) * sc;  /* output cols */
+
+  long zz, xx, yy;
+  for(zz = 0; zz < kt; zz++)
+  {
+    for(yy = 0; yy < kr; yy++)
+    {
+      for(xx = 0; xx < kc; xx++)
+      {
+        real *po_ = r_;
+        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
+        real z = *k_++ * alpha;  /* current kernel weight, pre-scaled */
+        long kz, kx, ky;
+        for(kz = 0; kz < ot; kz++)
+        {
+          for(ky = 0; ky < or; ky++)
+          {
+            for(kx = 0; kx < oc; kx++)
+              po_[kx] += z * pi_[kx];
+            pi_ += ic;
+            po_ += oc;
+          }
+          pi_ += (ir-or)*ic; /* next input slice */
+        }
+      }
+    }
+  }
+}
+
+/*
+  Dispatch a single-plane 2D convolution/correlation on raw pointers.
+  output_data is accumulated into: output += alpha * op(input, weight).
+  vf : 'V'alid or 'F'ull output size; xc : 'X'corr or true 'C'onvolution.
+*/
+void THTensor_(conv2d)(real* output_data,
+                       real alpha,
+                       real* ptr_input, long nInputRows, long nInputCols,
+                       real* ptr_weight, long nKernelRows, long nKernelCols,
+                       long srow, long scol,
+                       const char *vf, const char *xc)
+{
+  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
+  /* braces make the nested dispatch explicit (no dangling-else ambiguity) */
+  if (*vf == 'F') {
+    if (*xc == 'X') {
+      THTensor_(fullXCorr2Dptr)(output_data,
+                                alpha,
+                                ptr_input,  nInputRows,  nInputCols,
+                                ptr_weight, nKernelRows, nKernelCols,
+                                srow, scol);
+    } else {
+      THTensor_(fullConv2Dptr)(output_data,
+                               alpha,
+                               ptr_input,  nInputRows,  nInputCols,
+                               ptr_weight, nKernelRows, nKernelCols,
+                               srow, scol);
+    }
+  } else {
+    if (*xc == 'X') {
+      THTensor_(validXCorr2Dptr)(output_data,
+                                 alpha,
+                                 ptr_input,  nInputRows,  nInputCols,
+                                 ptr_weight, nKernelRows, nKernelCols,
+                                 srow, scol);
+    } else {
+      THTensor_(validConv2Dptr)(output_data,
+                                alpha,
+                                ptr_input,  nInputRows,  nInputCols,
+                                ptr_weight, nKernelRows, nKernelCols,
+                                srow, scol);
+    }
+  }
+}
+
+/*
+  Dispatch a single-volume 3D convolution/correlation on raw pointers.
+  output_data is accumulated into: output += alpha * op(input, weight).
+  vf : 'V'alid or 'F'ull output size; xc : 'X'corr or true 'C'onvolution.
+*/
+void THTensor_(conv3d)(real* output_data,
+                       real alpha,
+                       real* ptr_input, long nInputDepth, long nInputRows, long nInputCols,
+                       real* ptr_weight, long nKernelDepth, long nKernelRows, long nKernelCols,
+                       long sdepth, long srow, long scol,
+                       const char *vf, const char *xc)
+{
+  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
+  /* braces make the nested dispatch explicit (no dangling-else ambiguity) */
+  if (*vf == 'F') {
+    if (*xc == 'X') {
+      THTensor_(fullXCorr3Dptr)(output_data,
+                                alpha,
+                                ptr_input, nInputDepth, nInputRows,  nInputCols,
+                                ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                                sdepth, srow, scol);
+    } else {
+      THTensor_(fullConv3Dptr)(output_data,
+                               alpha,
+                               ptr_input, nInputDepth, nInputRows,  nInputCols,
+                               ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                               sdepth, srow, scol);
+    }
+  } else {
+    if (*xc == 'X') {
+      THTensor_(validXCorr3Dptr)(output_data,
+                                 alpha,
+                                 ptr_input, nInputDepth, nInputRows,  nInputCols,
+                                 ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                                 sdepth, srow, scol);
+    } else {
+      THTensor_(validConv3Dptr)(output_data,
+                                alpha,
+                                ptr_input, nInputDepth, nInputRows,  nInputCols,
+                                ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                                sdepth, srow, scol);
+    }
+  }
+}
+
+/*
+  Output length of a convolution along one dimension: input length x,
+  kernel length k, stride s; *vf selects 'V'alid or 'F'ull mode.
+*/
+long THTensor_(convsize)(long x, long k, long s, const char* vf)
+{
+  THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'");
+  return (*vf == 'V') ? (x-k)/s + 1 : (x-1)*s + k;
+}
+
+
+/*
+  3D input, 3D kernel, 4D output
+  like rank1 update
+  A <- xx' + beta*A
+  for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
+  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
+
+  r_ is resized to (nKernelPlane, nInputPlane, outRows, outCols); previous
+  contents are scaled by beta (zeroed when beta == 0 or the size changed),
+  then each (kernel plane, input plane) pair accumulates an alpha-scaled
+  strided reverse cross-correlation.
+*/
+void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelPlane, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  /* work on contiguous copies (no-ops when already contiguous) */
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  kstride0 = kernel->stride[0];
+  nKernelPlane = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+  nOutputPlane = nInputPlane * kernel->size[0];
+
+  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel");
+
+  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
+  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
+
+  /* element count before resize: tells us below whether the previous
+     contents survived the resize (beta-scaling is only valid if they did) */
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    /* zero the output by hand (instead of THTensor_(zero)) so the loop
+       can run in parallel */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] = 0.0;
+    }
+  }
+  else if (beta != 1)
+  {
+    /* scale the existing output by beta (instead of THTensor_(mul)) */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] *= beta;
+    }
+  }
+
+  /* each kernel plane writes a disjoint output slab, so this is race-free */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nKernelPlane; k++)
+  {
+    long i;
+    /* get kernel */
+    real *ptr_weight = weight_data+k*kstride0;
+
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get output */
+      real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
+      /* get input */
+      real *ptr_input = input_data+i*istride0;
+
+      /* do image, kernel convolution */
+      THTensor_(validXCorr2DRevptr)(ptr_output,
+                                    alpha,
+                                    ptr_input,  nInputRows,  nInputCols,
+                                    ptr_weight, nKernelRows, nKernelCols,
+                                    srow, scol);
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  4D input, 4D kernel, 4D output
+  like rank1 update
+  A <- xx' + beta*A
+  Batched variant of conv2DRevger: input and kernel carry a leading batch
+  dimension of equal size, and the per-batch reverse cross-correlations are
+  accumulated into the same (nKernelPlane, nInputPlane, outRows, outCols)
+  output. Useful for calculating derivatives wrt a kernel that is applied
+  with stride srow,scol != 1.
+*/
+void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
+{
+  long nbatch, nInputPlane, nInputRows, nInputCols;
+  long nKernelPlane, nKernelRows, nKernelCols;
+  long nOutputRows, nOutputCols;
+  long istride0, kstride0, istride1, kstride1;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  /* work on contiguous copies (no-ops when already contiguous) */
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  istride0    = input->stride[0];
+  istride1    = input->stride[1];
+  nbatch      = input->size[0];
+  nInputPlane = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0 = kernel->stride[0];
+  kstride1 = kernel->stride[1];
+  nKernelPlane = kernel->size[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+
+  /* error messages fixed to name this function (previously "conv2DRevger") */
+  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevgerm : Input image is smaller than kernel");
+  THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevgerm : Input batch and kernel batch is not same size");
+
+  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
+  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
+
+  /* element count before resize: tells us below whether the previous
+     contents survived the resize (beta-scaling is only valid if they did) */
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    /* zero the output by hand (instead of THTensor_(zero)) so the loop
+       can run in parallel */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] = 0.0;
+    }
+  }
+  else if (beta != 1)
+  {
+    /* scale the existing output by beta (instead of THTensor_(mul)) */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] *= beta;
+    }
+  }
+
+  /* each kernel plane writes a disjoint output slab, so this is race-free;
+     the batch accumulation runs sequentially inside each (k,i) pair */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nKernelPlane; k++)
+  {
+    long i;
+    for(i = 0; i < nInputPlane; i++)
+    {
+      long p;
+      for(p = 0; p < nbatch; p++)
+      {
+        /* get kernel */
+        real *ptr_weight = weight_data + p*kstride0 + k*kstride1;
+        /* get output */
+        real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
+        /* get input */
+        real *ptr_input = input_data + p*istride0 + i*istride1;
+
+        /* do image, kernel convolution */
+        THTensor_(validXCorr2DRevptr)(ptr_output,
+                                      alpha,
+                                      ptr_input,  nInputRows,  nInputCols,
+                                      ptr_weight, nKernelRows, nKernelCols,
+                                      srow, scol);
+      }
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  3D input, 3D kernel, 4D output
+  like rank1 update
+  A <- xx' + beta*A
+
+  r_ is resized to (nKernelPlane, nInputPlane, outRows, outCols); previous
+  contents are scaled by beta (zeroed when beta == 0 or the size changed),
+  then every (kernel plane, input plane) pair accumulates an alpha-scaled
+  2D convolution/correlation selected by vf ('V'alid/'F'ull) and
+  xc ('X'corr/'C'onvolution).
+*/
+void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelPlane, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+  /* messages fixed for consistency with conv2d/conv3d ("can be") */
+  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
+
+  /* work on contiguous copies (no-ops when already contiguous) */
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  kstride0 = kernel->stride[0];
+  nKernelPlane = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+  nOutputPlane = nInputPlane * kernel->size[0];
+
+  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel");
+
+  if (*vf == 'F') {
+    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
+    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
+  } else { /* valid */
+    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
+    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
+  }
+
+  /* element count before resize: tells us below whether the previous
+     contents survived the resize (beta-scaling is only valid if they did) */
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    /* zero the output by hand (instead of THTensor_(zero)) so the loop
+       can run in parallel */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] = 0.0;
+    }
+  }
+  else if (beta != 1)
+  {
+    /* scale the existing output by beta (instead of THTensor_(mul)) */
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] *= beta;
+    }
+  }
+
+  /* each kernel plane writes a disjoint output slab, so this is race-free */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nKernelPlane; k++)
+  {
+    long i;
+    /* get kernel */
+    real *ptr_weight = weight_data+k*kstride0;
+
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get output */
+      real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
+      /* get input */
+      real *ptr_input = input_data+i*istride0;
+
+      /* do image, kernel convolution; braces make the dispatch explicit */
+      if (*vf == 'F') {
+        if (*xc == 'X') {
+          THTensor_(fullXCorr2Dptr)(ptr_output,
+                                    alpha,
+                                    ptr_input,  nInputRows,  nInputCols,
+                                    ptr_weight, nKernelRows, nKernelCols,
+                                    srow, scol);
+        } else {
+          THTensor_(fullConv2Dptr)(ptr_output,
+                                   alpha,
+                                   ptr_input,  nInputRows,  nInputCols,
+                                   ptr_weight, nKernelRows, nKernelCols,
+                                   srow, scol);
+        }
+      } else {
+        if (*xc == 'X') {
+          THTensor_(validXCorr2Dptr)(ptr_output,
+                                     alpha,
+                                     ptr_input,  nInputRows,  nInputCols,
+                                     ptr_weight, nKernelRows, nKernelCols,
+                                     srow, scol);
+        } else {
+          THTensor_(validConv2Dptr)(ptr_output,
+                                    alpha,
+                                    ptr_input,  nInputRows,  nInputCols,
+                                    ptr_weight, nKernelRows, nKernelCols,
+                                    srow, scol);
+        }
+      }
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  3D input, 4D kernel, 3D output
+  matrix vector product like
+  y <- Ax + beta*y
+*/
+/* conv2Dmv: matrix-vector-like bank of 2D convolutions.
+   For each output plane o: r_[o] <- beta*r_[o] + alpha * sum_i conv2(t_[i], k_[o][i]).
+   t_ : 3D input   (nInputPlane x nInputRows x nInputCols)
+   k_ : 4D kernels (nOutputPlane x nInputPlane x nKernelRows x nKernelCols)
+   r_ : resized here to 3D output (nOutputPlane x nOutputRows x nOutputCols)
+   srow/scol : strides (>= 1); *vf : 'V'alid or 'F'ull; *xc : 'X'corr or 'C'onv. */
+void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long istride0, kstride0, kstride1;
+  THTensor *input;
+  THTensor* kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  /* only the two innermost kernel dims need to be contiguous; copy otherwise */
+  if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) {
+    kernel = THTensor_(newContiguous)(k_);
+  } else {
+    THTensor_(retain)(k_);
+    kernel = k_;
+  }
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  kstride0    = kernel->stride[0];
+  kstride1    = kernel->stride[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+  nOutputPlane = kernel->size[0];
+  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel");
+
+  /* output geometry: 'F'ull grows the image, 'V'alid shrinks it */
+  if (*vf == 'F') {
+    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
+    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
+  } else { /* valid */
+    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
+    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
+  }
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  /* clear the output when it was empty, beta is 0, or the resize changed its
+     geometry (old contents are then meaningless); else pre-scale by beta */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    /*THTensor_(zero)(r_);*/
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] = 0.0;
+    }
+  }
+  else if (beta != 1)
+  {
+    /*THTensor_(mul)(r_, beta);*/
+#pragma omp parallel for private(k)
+    for (k = 0; k < r_->size[0]; k++)
+    {
+      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
+      long l;
+      for (l = 0; l < nOutputRows*nOutputCols; l++)
+        ptr_output[l] *= beta;
+    }
+  }
+
+  /* output planes are independent, so the accumulation parallelizes over k */
+#pragma omp parallel for private(k)
+  for(k = 0; k < nOutputPlane; k++)
+  {
+    long i;
+    /* get output */
+    real *ptr_output = output_data + k*nOutputCols*nOutputRows;
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get kernel */
+      real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
+      /* get input */
+      real *ptr_input = input_data + i*istride0;
+
+      /* do image, kernel convolution */
+      if (*vf == 'F')
+        if (*xc == 'X')
+          THTensor_(fullXCorr2Dptr)(ptr_output,
+                                    alpha,
+                                    ptr_input,  nInputRows,  nInputCols,
+                                    ptr_weight, nKernelRows, nKernelCols,
+                                    srow, scol);
+        else
+          THTensor_(fullConv2Dptr)(ptr_output,
+                                   alpha,
+                                   ptr_input,  nInputRows,  nInputCols,
+                                   ptr_weight, nKernelRows, nKernelCols,
+                                   srow, scol);
+      else
+        if (*xc == 'X')
+          THTensor_(validXCorr2Dptr)(ptr_output,
+                                     alpha,
+                                     ptr_input,  nInputRows,  nInputCols,
+                                     ptr_weight, nKernelRows, nKernelCols,
+                                     srow, scol);
+        else
+          THTensor_(validConv2Dptr)(ptr_output,
+                                    alpha,
+                                    ptr_input,  nInputRows,  nInputCols,
+                                    ptr_weight, nKernelRows, nKernelCols,
+                                    srow, scol);
+    }
+    /* Next output plane */
+    /* output_data += nOutputCols*nOutputRows;*/
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  4D (batch) input, 4D kernel, 4D (batch) output
+  matrix matrix product like: one conv2Dmv per batch sample
+  y <- Ax + beta*y
+*/
+/* conv2Dmm: batched variant of conv2Dmv.
+   For each batch sample p and output plane o:
+     r_[p][o] <- beta*r_[p][o] + alpha * sum_i conv2(t_[p][i], k_[o][i]).
+   t_ : 4D input   (nbatch x nInputPlane x nInputRows x nInputCols)
+   k_ : 4D kernels (nOutputPlane x nInputPlane x nKernelRows x nKernelCols)
+   r_ : resized here to (nbatch x nOutputPlane x nOutputRows x nOutputCols)
+   srow/scol : strides (>= 1); *vf : 'V'alid or 'F'ull; *xc : 'X'corr or 'C'onv. */
+void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long kstride0, kstride1;
+  THTensor *input;
+  THTensor* kernel;
+  long nbatch;
+  long nelem;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long p;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  /* only the two innermost kernel dims need to be contiguous; copy otherwise */
+  if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) {
+    kernel = THTensor_(newContiguous)(k_);
+  } else {
+    THTensor_(retain)(k_);
+    kernel = k_;
+  }
+
+  nbatch = input->size[0];
+  nInputPlane = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0    = kernel->stride[0];
+  kstride1    = kernel->stride[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+  nOutputPlane = kernel->size[0];
+  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  /* BUGFIX: message said "conv2Dmv" (copy-paste from the non-batched routine) */
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmm : Input image is smaller than kernel");
+
+  /* output geometry: 'F'ull grows the image, 'V'alid shrinks it */
+  if (*vf == 'F') {
+    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
+    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
+  } else { /* valid */
+    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
+    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
+  }
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  /* clear the output when it was empty, beta is 0, or the resize changed its
+     geometry (old contents are then meaningless); else pre-scale by beta */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    /*THTensor_(zero)(r_);*/
+#pragma omp parallel for private(p)
+    for (p=0; p < r_->size[0]; p++)
+    {
+      long k;
+      for (k = 0; k < r_->size[1]; k++)
+      {
+        real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
+        long l;
+        for (l = 0; l < nOutputRows*nOutputCols; l++)
+          ptr_output[l] = 0.0;
+      }
+    }
+  }
+  else if (beta != 1)
+  {
+    /*THTensor_(mul)(r_, beta);*/
+#pragma omp parallel for private(p)
+    for(p=0; p < r_->size[0]; p++)
+    {
+      long k;
+      for (k = 0; k < r_->size[1]; k++)
+      {
+        real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
+        long l;
+        for (l = 0; l < nOutputRows*nOutputCols; l++)
+          ptr_output[l] *= beta;
+      }
+    }
+  }
+
+  /* batch samples are independent, so the accumulation parallelizes over p */
+#pragma omp parallel for private(p)
+  for(p=0; p < nbatch; p++)
+  {
+    long k;
+    for(k = 0; k < nOutputPlane; k++)
+    {
+      long i;
+      /* get output */
+      real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows;
+      for(i = 0; i < nInputPlane; i++)
+      {
+        /* get kernel */
+        real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
+        /* get input */
+        real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;
+
+        /* do image, kernel convolution */
+        if (*vf == 'F')
+          if (*xc == 'X')
+            THTensor_(fullXCorr2Dptr)(ptr_output,
+                                      alpha,
+                                      ptr_input,  nInputRows,  nInputCols,
+                                      ptr_weight, nKernelRows, nKernelCols,
+                                      srow, scol);
+          else
+            THTensor_(fullConv2Dptr)(ptr_output,
+                                     alpha,
+                                     ptr_input,  nInputRows,  nInputCols,
+                                     ptr_weight, nKernelRows, nKernelCols,
+                                     srow, scol);
+        else
+          if (*xc == 'X')
+            THTensor_(validXCorr2Dptr)(ptr_output,
+                                       alpha,
+                                       ptr_input,  nInputRows,  nInputCols,
+                                       ptr_weight, nKernelRows, nKernelCols,
+                                       srow, scol);
+          else
+            THTensor_(validConv2Dptr)(ptr_output,
+                                      alpha,
+                                      ptr_input,  nInputRows,  nInputCols,
+                                      ptr_weight, nKernelRows, nKernelCols,
+                                      srow, scol);
+      }
+      /* Next output plane */
+      /* output_data += nOutputCols*nOutputRows;*/
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  2D input, 2D kernel, 2D output
+  scalar multiplication like
+  y <- x*y + beta*y
+*/
+/* conv2Dmul: single-plane 2D convolution.
+   r_ <- beta*r_ + alpha * conv2(t_, k_), both operands 2D.
+   Output size is delegated to THTensor_(convsize) using stride and vf.
+   NOTE(review): unlike the sibling routines, *vf/*xc are not validated here
+   before use — presumably THTensor_(conv2d) handles them; verify. */
+void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
+{
+  THTensor *input;
+  THTensor* kernel;
+  long nInputRows;
+  long nInputCols;
+  long nKernelRows;
+  long nKernelCols;
+  long nOutputRows, nOutputCols;
+  real *ptr_input;
+  real *ptr_weight;
+  real *output_data;
+  long nelem;
+
+  THArgCheck(t_->nDimension == 2 , 3, "input: 2D Tensor expected");
+  THArgCheck(k_->nDimension == 2 , 4, "kernel: 2D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputRows  = input->size[0];
+  nInputCols  = input->size[1];
+  nKernelRows = kernel->size[0];
+  nKernelCols = kernel->size[1];
+
+  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel");
+
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize2d)(r_, nOutputRows, nOutputCols);
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+    THTensor_(zero)(r_);
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  ptr_input = THTensor_(data)(input);
+  ptr_weight = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+
+  /* do image, kernel convolution */
+  THTensor_(conv2d)(output_data,
+                    alpha,
+                    ptr_input, nInputRows, nInputCols,
+                    ptr_weight, nKernelRows, nKernelCols,
+                    srow, scol, vf, xc);
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  3D input, 3D kernel, 3D output
+  component wise multiplication like
+  y <- y.*x + beta*y
+*/
+/* conv2Dcmul: per-plane ("component-wise") 2D convolution.
+   Plane k of the 3D input is convolved with plane k of the 3D kernel and
+   accumulated into plane k of the 3D output:
+     r_[k] <- beta*r_[k] + alpha * conv2(t_[k], k_[k]).
+   Requires the same number of input and kernel planes. */
+void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  istride0    = input->stride[0];
+  nInputPlane = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  kstride0    = kernel->stride[0];
+  nOutputPlane = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+
+  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel");
+
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  for(k = 0; k < nOutputPlane; k++)
+  {
+    /* get kernel */
+    real *ptr_weight = weight_data + k*kstride0;
+    /* get input */
+    real *ptr_input = input_data + k*istride0;
+
+    /* do image, kernel convolution */
+    THTensor_(conv2d)(output_data,
+                      alpha,
+                      ptr_input, nInputRows, nInputCols,
+                      ptr_weight, nKernelRows, nKernelCols,
+                      srow, scol, vf, xc);
+    /* Next output plane */
+    output_data += nOutputCols*nOutputRows;
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  3D input, 3D kernel, 3D output
+  component wise multiplication like with a permutation map
+  y <- y.*x + beta*y
+*/
+/* conv2Dmap: per-plane 2D convolution routed through a connection map.
+   Row k of `map` holds a 1-based (from, to) pair: input plane `from` is
+   convolved with kernel plane k and accumulated into output plane `to`.
+   r_[to] <- beta*r_[to] + alpha * conv2(t_[from], k_[k]).
+   NOTE(review): *vf/*xc are not validated here, unlike most siblings —
+   presumably THTensor_(conv2d) handles them; verify. */
+void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputRows, nInputCols;
+  long nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+  THTensor *input;
+  THTensor* kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nmaps;
+  long nelem;
+  long k;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  /* BUGFIX: map is argument #5; it previously reported #4 (duplicate of kernel) */
+  THArgCheck(map->nDimension == 2 , 5, "map: 2D Tensor expected");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  istride0    = input->stride[0];
+  nInputPlane = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+
+  kstride0    = kernel->stride[0];
+  nOutputPlane = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+
+  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
+  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols)
+              || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel");
+
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  nmaps = map->size[0];
+
+  for(k = 0; k < nmaps; k++)
+  {
+    /* get indices (map entries are 1-based Lua indices) */
+    long from = (long)THTensor_(get2d)(map,k,0)-1;
+    long to   = (long)THTensor_(get2d)(map,k,1)-1;
+
+    /* get kernel */
+    real *ptr_weight = weight_data + k*kstride0;
+    /* get input */
+    real *ptr_input = input_data + from*istride0;
+    /* get output */
+    real *ptr_output = output_data + to*nOutputRows*nOutputCols;
+
+    /* do image, kernel convolution */
+    THTensor_(conv2d)(ptr_output,
+                      alpha,
+                      ptr_input, nInputRows, nInputCols,
+                      ptr_weight, nKernelRows, nKernelCols,
+                      srow, scol, vf, xc);
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  4D input, 4D kernel, 5D output
+  like rank1 update
+  A <- xx' + beta*A
+  for sd,sr,sc=1 this is equivalent to xcorr3Dger, but otherwise it is useful
+  for calculating derivatives wrt a kernel that is applied with stride sd,sr,sc != 1
+*/
+void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
+                             long sdepth, long srow, long scol)
+{
+  long nInputPlane, nInputDepth, nInputRows, nInputCols;
+  long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k, i;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputDepth = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0 = kernel->stride[0];
+  nKernelPlane = kernel->size[0];
+  nKernelDepth= kernel->size[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+  nOutputPlane = nInputPlane * kernel->size[0];
+
+  THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel");
+
+  /* reverse-correlation output geometry: the stride dilates the kernel,
+     so the "valid" extent is input - (kernel-1)*stride per dimension */
+  nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth;
+  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
+  nOutputCols = nInputCols - (nKernelCols - 1) * scol;
+
+  nelem = THTensor_(nElement)(r_);
+  /* outer-product layout: one output volume per (kernel plane, input plane) pair */
+  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  for(k = 0; k < nKernelPlane; k++)
+  {
+    /* get kernel */
+    real *ptr_weight = weight_data+k*kstride0;
+
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get input */
+      real *ptr_input = input_data+i*istride0;
+
+      /* do image, kernel convolution */
+      THTensor_(validXCorr3DRevptr)(output_data,
+                                    alpha,
+                                    ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                                    ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                                    sdepth, srow, scol);
+      /* Next output plane */
+      output_data += nOutputDepth*nOutputCols*nOutputRows;
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+
+/*
+  4D input, 4D kernel, 5D output
+  like rank1 update
+  A <- xx' + beta*A
+*/
+/* conv3Dger: outer-product-like 3D convolution.
+   Every kernel plane is convolved with every input plane:
+     r_[k][i] <- beta*r_[k][i] + alpha * conv3(t_[i], k_[k]).
+   t_ : 4D input (nInputPlane x depth x rows x cols)
+   k_ : 4D kernels (nKernelPlane x kd x kr x kc)
+   r_ : resized to 5D (nKernelPlane x nInputPlane x od x or x oc)
+   sdepth/srow/scol : strides; *vf : 'V'/'F'; *xc : 'X'/'C'. */
+void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
+                          long sdepth, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputDepth, nInputRows, nInputCols;
+  long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k, i;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputDepth = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0     = kernel->stride[0];
+  nKernelPlane = kernel->size[0];
+  nKernelDepth = kernel->size[1];
+  nKernelRows  = kernel->size[2];
+  nKernelCols  = kernel->size[3];
+  nOutputPlane = nInputPlane * kernel->size[0];
+
+  THArgCheck((nInputDepth >= nKernelDepth
+              && nInputRows >= nKernelRows
+              && nInputCols >= nKernelCols)
+             || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel");
+
+  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  for(k = 0; k < nKernelPlane; k++)
+  {
+    /* get kernel */
+    real *ptr_weight = weight_data+k*kstride0;
+
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get input */
+      real *ptr_input = input_data+i*istride0;
+
+      /* do image, kernel convolution */
+      THTensor_(conv3d)(output_data,
+                        alpha,
+                        ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                        ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                        sdepth, srow, scol, vf, xc);
+
+      /* Next output plane */
+      output_data += nOutputDepth*nOutputCols*nOutputRows;
+    }
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  4D input, 5D kernel, 4D output
+  matrix vector product like
+  y <- Ax + beta*y
+*/
+/* conv3Dmv: matrix-vector-like bank of 3D convolutions.
+   For each output plane o: r_[o] <- beta*r_[o] + alpha * sum_i conv3(t_[i], k_[o][i]).
+   t_ : 4D input   (nInputPlane x depth x rows x cols)
+   k_ : 5D kernels (nOutputPlane x nInputPlane x kd x kr x kc)
+   r_ : resized to 4D output (nOutputPlane x od x or x oc). */
+void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
+                         long sdepth, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputDepth, nInputRows, nInputCols;
+  long nKernelDepth, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
+  long istride0, kstride0, kstride1;
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k, i;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 5 , 4, "kernel: 5D Tensor expected");
+  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  /* only the two innermost kernel dims need to be contiguous; copy otherwise */
+  if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) {
+    kernel = THTensor_(newContiguous)(k_);
+  } else {
+    THTensor_(retain)(k_);
+    kernel = k_;
+  }
+
+  nInputPlane = input->size[0];
+  istride0    = input->stride[0];
+  nInputDepth = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0    = kernel->stride[0];
+  kstride1    = kernel->stride[1];
+  nKernelDepth = kernel->size[2];
+  nKernelRows = kernel->size[3];
+  nKernelCols = kernel->size[4];
+  nOutputPlane = kernel->size[0];
+  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");
+
+  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel");
+
+  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  for(k = 0; k < nOutputPlane; k++)
+  {
+    for(i = 0; i < nInputPlane; i++)
+    {
+      /* get kernel */
+      real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
+      /* get input */
+      real *ptr_input = input_data + i*istride0;
+
+      /* do image, kernel convolution */
+      THTensor_(conv3d)(output_data,
+                        alpha,
+                        ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                        ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                        sdepth, srow, scol, vf, xc);
+    }
+    /* Next output plane */
+    output_data += nOutputDepth*nOutputCols*nOutputRows;
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  3D input, 3D kernel, 3D output
+  scalar multiplication like
+  y <- x*y + beta*y
+*/
+/* conv3Dmul: single-volume 3D convolution.
+   r_ <- beta*r_ + alpha * conv3(t_, k_), both operands 3D. */
+void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
+                          long sdepth, long srow, long scol, const char *vf, const char *xc)
+{
+  THTensor *input;
+  THTensor* kernel;
+  long nInputDepth;
+  long nInputRows;
+  long nInputCols;
+  long nKernelDepth;
+  long nKernelRows;
+  long nKernelCols;
+  long nOutputDepth, nOutputRows, nOutputCols;
+  real *ptr_input;
+  real *ptr_weight;
+  real *output_data;
+  long nelem;
+
+  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
+  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
+  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  nInputDepth = input->size[0];
+  nInputRows  = input->size[1];
+  nInputCols  = input->size[2];
+  nKernelDepth = kernel->size[0];
+  nKernelRows = kernel->size[1];
+  nKernelCols = kernel->size[2];
+
+  THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel");
+
+  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols);
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+    THTensor_(zero)(r_);
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  ptr_input = THTensor_(data)(input);
+  ptr_weight = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+
+  /* do image, kernel convolution */
+  THTensor_(conv3d)(output_data,
+                    alpha,
+                    ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                    ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                    sdepth, srow, scol, vf, xc);
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  4D input, 4D kernel, 4D output
+  component wise multiplication like
+  y <- y.*x + beta*y
+*/
+/* conv3Dcmul: per-plane ("component-wise") 3D convolution.
+   Plane k of the 4D input is convolved with plane k of the 4D kernel:
+     r_[k] <- beta*r_[k] + alpha * conv3(t_[k], k_[k]).
+   Requires the same number of input and kernel planes. */
+void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
+                           long sdepth, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputDepth, nInputRows, nInputCols;
+  long nKernelDepth, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+
+  THTensor *input;
+  THTensor *kernel;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nelem;
+  long k;
+
+  /* BUGFIX: messages said "3D Tensor expected" while checking for 4D tensors;
+     also validate sdepth and use the same argument numbering as conv3Dger */
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  istride0    = input->stride[0];
+  nInputPlane = input->size[0];
+  nInputDepth = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0    = kernel->stride[0];
+  nOutputPlane = kernel->size[0];
+  nKernelDepth = kernel->size[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+
+  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
+  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel");
+
+  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  for(k = 0; k < nOutputPlane; k++)
+  {
+    /* get kernel */
+    real *ptr_weight = weight_data + k*kstride0;
+    /* get input */
+    real *ptr_input = input_data + k*istride0;
+
+    /* do image, kernel convolution */
+    THTensor_(conv3d)(output_data,
+                      alpha,
+                      ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                      ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                      sdepth, srow, scol, vf, xc);
+
+    /* Next output plane */
+    output_data += nOutputDepth*nOutputCols*nOutputRows;
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+/*
+  4D input, 4D kernel, 4D output
+  component wise multiplication like with a permutation map
+  y <- y.*x + beta*y
+*/
+/* conv3Dmap: per-plane 3D convolution routed through a connection map.
+   Row k of `map` holds a 1-based (from, to) pair: input plane `from` is
+   convolved with kernel plane k and accumulated into output plane `to`.
+   r_[to] <- beta*r_[to] + alpha * conv3(t_[from], k_[k]). */
+void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map,
+                          long sdepth, long srow, long scol, const char *vf, const char *xc)
+{
+  long nInputPlane, nInputDepth, nInputRows, nInputCols;
+  long nKernelDepth, nKernelRows, nKernelCols;
+  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
+  long istride0, kstride0;
+
+  THTensor *input;
+  THTensor *kernel;
+  long nelem;
+  real *input_data;
+  real *weight_data;
+  real *output_data;
+  long nmaps;
+  long k;
+
+  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
+  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
+  /* BUGFIX: map is argument #5 (it previously reported #4, duplicating the
+     kernel); also validate sdepth and number the remaining args sequentially */
+  THArgCheck(map->nDimension == 2 , 5, "map: 2D Tensor expected");
+  THArgCheck(sdepth >= 1, 6, "Stride should be a positive integer");
+  THArgCheck(srow >= 1, 7, "Stride should be a positive integer");
+  THArgCheck(scol >= 1, 8, "Stride should be a positive integer");
+  THArgCheck(*vf == 'V' || *vf == 'F', 9, "type of convolution can 'V' or 'F'");
+  THArgCheck(*xc == 'C' || *xc == 'X', 9, "type of convolution can 'X' or 'C'");
+
+  input = THTensor_(newContiguous)(t_);
+  kernel = THTensor_(newContiguous)(k_);
+
+  istride0    = input->stride[0];
+  nInputPlane = input->size[0];
+  nInputDepth = input->size[1];
+  nInputRows  = input->size[2];
+  nInputCols  = input->size[3];
+
+  kstride0    = kernel->stride[0];
+  nOutputPlane = kernel->size[0];
+  nKernelDepth = kernel->size[1];
+  nKernelRows = kernel->size[2];
+  nKernelCols = kernel->size[3];
+
+  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
+  THArgCheck((nInputDepth >= nKernelDepth
+              && nInputRows >= nKernelRows
+              && nInputCols >= nKernelCols) || *vf == 'F',
+             2, "conv3Dmap : Input image is smaller than kernel");
+
+  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
+  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
+  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
+
+  nelem = THTensor_(nElement)(r_);
+  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);
+
+  /* clear stale contents if r_ was empty, beta==0, or the resize changed its geometry */
+  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
+  {
+    THTensor_(zero)(r_);
+  }
+  else if (beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  input_data = THTensor_(data)(input);
+  weight_data = THTensor_(data)(kernel);
+  output_data = THTensor_(data)(r_);
+
+  nmaps = map->size[0];
+
+  for(k = 0; k < nmaps; k++)
+  {
+    /* get indices (map entries are 1-based Lua indices) */
+    long from = (long)THTensor_(get2d)(map,k,0)-1;
+    long to   = (long)THTensor_(get2d)(map,k,1)-1;
+
+    /* get kernel */
+    real *ptr_weight = weight_data + k*kstride0;
+    /* get input */
+    real *ptr_input = input_data + from*istride0;
+    /* get output */
+    real *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols;
+
+    /* do image, kernel convolution */
+    THTensor_(conv3d)(ptr_output,
+                      alpha,
+                      ptr_input,  nInputDepth, nInputRows,  nInputCols,
+                      ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
+                      sdepth, srow, scol, vf, xc);
+  }
+  THTensor_(free)(input);
+  THTensor_(free)(kernel);
+}
+
+#endif
diff --git a/lib/TH/generic/THTensorConv.h b/lib/TH/generic/THTensorConv.h
new file mode 100644
index 0000000..d215fcd
--- /dev/null
+++ b/lib/TH/generic/THTensorConv.h
@@ -0,0 +1,80 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorConv.h"
+#else
+
+/* Low-level 2D convolution/cross-correlation kernels operating on raw
+   pointers.  Naming: valid/full = output size convention, XCorr = kernel not
+   flipped, Conv = kernel flipped; ir/ic = input rows/cols, kr/kc = kernel
+   rows/cols, sr/sc = row/col strides; alpha scales the accumulated result. */
+
+TH_API void THTensor_(validXCorr2Dptr)(real *r_,
+                                    real alpha,
+                                    real *t_, long ir, long ic,
+                                    real *k_, long kr, long kc,
+                                    long sr, long sc);
+
+TH_API void THTensor_(validConv2Dptr)(real *r_,
+                                   real alpha,
+                                   real *t_, long ir, long ic,
+                                   real *k_, long kr, long kc,
+                                   long sr, long sc);
+
+TH_API void THTensor_(fullXCorr2Dptr)(real *r_,
+                                   real alpha,
+                                   real *t_, long ir, long ic,
+                                   real *k_, long kr, long kc,
+                                   long sr, long sc);
+
+TH_API void THTensor_(fullConv2Dptr)(real *r_,
+                                  real alpha,
+                                  real *t_, long ir, long ic,
+                                  real *k_, long kr, long kc,
+                                  long sr, long sc);
+
+TH_API void THTensor_(validXCorr2DRevptr)(real *r_,
+                                       real alpha,
+                                       real *t_, long ir, long ic,
+                                       real *k_, long kr, long kc,
+                                       long sr, long sc);
+
+/* Tensor-level 2D convolution entry points.  r_ = beta*r_ + alpha*conv(t_, k_);
+   vf selects "V"alid or "F"ull output, xc selects "X"corr or "C"onvolution. */
+TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol);
+TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol);
+TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc);
+
+/* 3D (volumetric) variants; it/kt/st add a depth (time) dimension. */
+TH_API void THTensor_(validXCorr3Dptr)(real *r_,
+                                    real alpha,
+                                    real *t_, long it, long ir, long ic,
+                                    real *k_, long kt, long kr, long kc,
+                                    long st, long sr, long sc);
+
+TH_API void THTensor_(validConv3Dptr)(real *r_,
+                                   real alpha,
+                                   real *t_, long it, long ir, long ic,
+                                   real *k_, long kt, long kr, long kc,
+                                   long st, long sr, long sc);
+
+TH_API void THTensor_(fullXCorr3Dptr)(real *r_,
+                                   real alpha,
+                                   real *t_, long it, long ir, long ic,
+                                   real *k_, long kt, long kr, long kc,
+                                   long st, long sr, long sc);
+
+TH_API void THTensor_(fullConv3Dptr)(real *r_,
+                                  real alpha,
+                                  real *t_, long it, long ir, long ic,
+                                  real *k_, long kt, long kr, long kc,
+                                  long st, long sr, long sc);
+
+TH_API void THTensor_(validXCorr3DRevptr)(real *r_,
+                                       real alpha, 
+                                       real *t_, long it, long ir, long ic,
+                                       real *k_, long kt, long kr, long kc,
+                                       long st, long sr, long sc);
+
+TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol);
+TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc);
+TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc);
+
+#endif
diff --git a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c
new file mode 100644
index 0000000..ea6d6f1
--- /dev/null
+++ b/lib/TH/generic/THTensorCopy.c
@@ -0,0 +1,24 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorCopy.c"
+#else
+
+/* Element-wise copy from src into tensor, casting each element to this
+   tensor type's `real`.  Traversal is handled by TH_TENSOR_APPLY2. */
+void THTensor_(copy)(THTensor *tensor, THTensor *src)
+{
+  TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = (real)(*src_data);)
+}
+
+/* Generates THTensor_(copy<Type>): copy from a tensor of another element
+   type, casting each source element to this tensor's `real` type. */
+#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \
+void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \
+{ \
+  TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)(*src_data);) \
+}
+
+/* One converting copy per source element type. */
+IMPLEMENT_THTensor_COPY(Byte, unsigned char)
+IMPLEMENT_THTensor_COPY(Char, char)
+IMPLEMENT_THTensor_COPY(Short, short)
+IMPLEMENT_THTensor_COPY(Int, int)
+IMPLEMENT_THTensor_COPY(Long, long)
+IMPLEMENT_THTensor_COPY(Float, float)
+IMPLEMENT_THTensor_COPY(Double, double)
+
+#endif
diff --git a/lib/TH/generic/THTensorCopy.h b/lib/TH/generic/THTensorCopy.h
new file mode 100644
index 0000000..8d03b22
--- /dev/null
+++ b/lib/TH/generic/THTensorCopy.h
@@ -0,0 +1,16 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorCopy.h"
+#else
+
+/* Support for copy between different Tensor types */
+
+/* Same-type element-wise copy (casts through `real`). */
+TH_API void THTensor_(copy)(THTensor *tensor, THTensor *src);
+/* Converting copies: read src in its native element type, store as `real`. */
+TH_API void THTensor_(copyByte)(THTensor *tensor, struct THByteTensor *src);
+TH_API void THTensor_(copyChar)(THTensor *tensor, struct THCharTensor *src);
+TH_API void THTensor_(copyShort)(THTensor *tensor, struct THShortTensor *src);
+TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src);
+TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src);
+TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor *src);
+TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src);
+
+#endif
diff --git a/lib/TH/generic/THTensorLapack.c b/lib/TH/generic/THTensorLapack.c
new file mode 100644
index 0000000..62d730a
--- /dev/null
+++ b/lib/TH/generic/THTensorLapack.c
@@ -0,0 +1,884 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorLapack.c"
+#else
+
+/*
+Check if self is transpose of a contiguous matrix
+*/
+static int THTensor_(isTransposedContiguous)(THTensor *self)
+{
+  /* True when self is the transpose of a contiguous matrix: rows are unit
+     stride and consecutive columns are size[0] elements apart. */
+  int unitRowStride = (self->stride[0] == 1);
+  int columnMajorCols = (self->stride[1] == self->size[0]);
+  return unitRowStride && columnMajorCols;
+}
+/*
+If a matrix is a regular contiguous matrix, make sure it is transposed
+because this is what we return from Lapack calls.
+*/
+static void THTensor_(checkTransposed)(THTensor *self)
+{
+  /* Lapack returns column-major data; a plain contiguous matrix is put into
+     that layout by swapping its two dimensions in place.  No-op otherwise. */
+  if(!THTensor_(isContiguous)(self))
+    return;
+  THTensor_(transpose)(self, NULL, 0, 1);
+}
+/*
+newContiguous followed by transpose
+Similar to (newContiguous), but checks if the transpose of the matrix
+is contiguous and also limited to 2D matrices.
+*/
+static THTensor *THTensor_(newTransposedContiguous)(THTensor *self)
+{
+  /* Like newContiguous, but for the transposed (column-major) layout and
+     limited to 2D matrices.  If self already has that layout it is retained
+     and returned directly; otherwise a contiguous copy is transposed.
+     The caller owns (and must free) the returned reference. */
+  if(THTensor_(isTransposedContiguous)(self))
+  {
+    THTensor_(retain)(self);
+    return self;
+  }
+
+  THTensor *clone = THTensor_(newContiguous)(self);
+  THTensor_(transpose)(clone, NULL, 0, 1);
+  return clone;
+}
+
+/*
+Given the result tensor and src tensor, decide if the lapack call should use the
+provided result tensor or should allocate a new space to put the result in.
+
+The returned tensor have to be freed by the calling function.
+
+nrows is required, because some lapack calls, require output space smaller than
+input space, like underdetermined gels.
+*/
+static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows)
+{
+  /* Decide whether the caller-supplied result tensor can be handed to Lapack
+     as-is, or whether a fresh scratch tensor is needed.  nrows is the row
+     count Lapack requires for the output.  The returned tensor must be freed
+     by the caller. */
+  int reusable = (src == result)
+    && THTensor_(isTransposedContiguous)(src)
+    && (src->size[1] == nrows);
+
+  if (reusable)
+  {
+    THTensor_(retain)(result);
+    return result;
+  }
+
+  /* Caller asked for in-place reuse (or gave no result), but the layout or
+     shape is unsuitable: allocate a scratch tensor instead. */
+  if (src == result || result == NULL)
+    return THTensor_(new)();
+
+  THTensor_(retain)(result);
+  return result;
+}
+
+/*
+Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require
+the resulting tensor to be larger than src.
+*/
+static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows)
+{
+  THTensor *result;
+  THTensor *view;
+
+  if (src == NULL)
+    src = self;
+  result = THTensor_(checkLapackClone)(self, src, nrows);
+  /* checkLapackClone retained src itself: it is already usable as-is. */
+  if (src == result)
+    return result;
+
+  /* Shape as (cols, nrows) then transpose in place, producing a
+     column-major nrows x cols buffer for Lapack. */
+  THTensor_(resize2d)(result, src->size[1], nrows);
+  THTensor_(checkTransposed)(result);
+
+  if (src->size[0] == nrows)
+    THTensor_(copy)(result, src);
+  else
+  {
+    /* result is taller than src (e.g. underdetermined gels): copy src into
+       the top src->size[0] rows only. */
+    view = THTensor_(newNarrow)(result, 0, 0, src->size[0]);
+    THTensor_(copy)(view, src);
+    THTensor_(free)(view);
+  }
+  return result;
+}
+
+/*
+Create a clone of src in self column major order for use with Lapack.
+If src == self, a new tensor is allocated, in any case, the return tensor should be
+freed by calling function.
+*/
+static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src)
+{
+  /* Column-major clone keeping src's own row count.  The returned tensor
+     must be freed by the caller. */
+  long nrows = src->size[0];
+  return THTensor_(cloneColumnMajorNrows)(self, src, nrows);
+}
+
+void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
+{
+  /* Solve A * X = B via Lapack GESV (LU with partial pivoting).
+     NULL a/b mean "use ra_/rb_ as input".  On return ra_ holds the LU
+     factorization and rb_ holds the solution X. */
+  if (a == NULL) a = ra_;
+  if (b == NULL) b = rb_;
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+  THArgCheck(a->size[0] == b->size[0], 2, "A,b size incompatible");
+
+  int n, nrhs, lda, ldb, info;
+  THIntTensor *ipiv;
+  THTensor *ra__;  // working version of A matrix to be passed into lapack GESV
+  THTensor *rb__;  // working version of B matrix to be passed into lapack GESV
+
+  /* column-major working copies, as Fortran Lapack requires */
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+  rb__ = THTensor_(cloneColumnMajor)(rb_, b);
+
+  n    = (int)ra__->size[0];
+  nrhs = (int)rb__->size[1];
+  lda  = n;
+  ldb  = n;
+
+  ipiv = THIntTensor_newWithSize1d((long)n);
+  THLapack_(gesv)(n, nrhs,
+		  THTensor_(data)(ra__), lda, THIntTensor_data(ipiv),
+		  THTensor_(data)(rb__), ldb, &info);
+
+  /* On info != 0, free the temporaries and raise a TH error. */
+  THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(rb__);
+                               THIntTensor_free(ipiv);),
+                           "gesv", info, info);
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(freeCopyTo)(rb__, rb_);
+  THIntTensor_free(ipiv);
+}
+
+void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a,
+                      const char *uplo, const char *trans, const char *diag)
+{
+  /* Solve the triangular system op(A) * X = B via Lapack TRTRS.
+     uplo: "U"pper/"L"ower triangle of A; trans: "N"/"T"/"C" for op(A);
+     diag: "U"nit/"N"on-unit diagonal.  NULL a/b mean "use ra_/rb_ as input";
+     on return rb_ holds the solution X. */
+  if (a == NULL) a = ra_;
+  if (b == NULL) b = rb_;
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  /* Bug fix: this check validates B, but the message previously said "A". */
+  THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+  THArgCheck(b->size[0] == a->size[0], 2, "A,b size incompatible");
+
+  int n, nrhs, lda, ldb, info;
+  THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS
+  THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+  rb__ = THTensor_(cloneColumnMajor)(rb_, b);
+
+  n    = (int)ra__->size[0];
+  nrhs = (int)rb__->size[1];
+  lda  = n;
+  ldb  = n;
+
+  THLapack_(trtrs)(uplo[0], trans[0], diag[0], n, nrhs,
+                   THTensor_(data)(ra__), lda,
+                   THTensor_(data)(rb__), ldb, &info);
+
+  /* On info != 0, free the temporaries and raise a TH error. */
+  THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
+                           THCleanup(THTensor_(free)(ra__); THTensor_(free)(rb__);),
+                           "trtrs", info, info);
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(freeCopyTo)(rb__, rb_);
+}
+
+void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a)
+{
+  /* Least-squares solve of A * X = B via Lapack GELS (QR/LQ based).
+     On return rb_ holds the n x nrhs solution. */
+  // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_.
+  if (a == NULL) a = ra_;
+  if (b == NULL) b = rb_;
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
+  THArgCheck(a->size[0] == b->size[0], 2, "size incompatible A,b");
+
+  int m, n, nrhs, lda, ldb, info, lwork;
+  THTensor *work = NULL;
+  real wkopt = 0;
+
+  THTensor *ra__ = NULL;  // working version of A matrix to be passed into lapack GELS
+  THTensor *rb__ = NULL;  // working version of B matrix to be passed into lapack GELS
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  m = ra__->size[0];
+  n = ra__->size[1];
+  lda = m;
+  /* B must be max(m, n) tall: GELS writes the solution (n rows) and the
+     residual info into the same buffer. */
+  ldb = (m > n) ? m : n;
+
+  rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb);
+
+  nrhs = rb__->size[1];
+  info = 0;
+
+
+  /* get optimal workspace size */
+  THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda,
+		  THTensor_(data)(rb__), ldb,
+		  &wkopt, -1, &info);
+  lwork = (int)wkopt;
+  work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda,
+		  THTensor_(data)(rb__), ldb,
+		  THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero",
+                           THCleanup(THTensor_(free)(ra__);
+                                     THTensor_(free)(rb__);
+                                     THTensor_(free)(work);),
+                           "gels", info,"");
+
+  /* rb__ is currently ldb by nrhs; resize it to n by nrhs */
+  rb__->size[0] = n;
+  if (rb__ != rb_)
+    THTensor_(resize2d)(rb_, n, nrhs);
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(freeCopyTo)(rb__, rb_);
+  THTensor_(free)(work);
+}
+
+void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr)
+{
+  /* General (non-symmetric) eigendecomposition via Lapack GEEV.
+     re_ receives an n x 2 tensor of (real, imaginary) eigenvalue pairs;
+     rv_ receives the right eigenvectors when jobvr is "V". */
+  int n, lda, lwork, info, ldvr;
+  THTensor *work, *wi, *wr, *a;
+  real wkopt;
+  real *rv_data;
+  long i;
+
+  THTensor *re__ = NULL;
+  THTensor *rv__ = NULL;
+
+  THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square");
+
+  /* we want to definitely clone a_ for geev*/
+  a = THTensor_(cloneColumnMajor)(NULL, a_);
+
+  n = a->size[0];
+  lda = n;
+
+  /* separate buffers for the imaginary and real eigenvalue parts */
+  wi = THTensor_(newWithSize1d)(n);
+  wr = THTensor_(newWithSize1d)(n);
+
+  rv_data = NULL;
+  ldvr = 1;
+  if (*jobvr == 'V')
+  {
+    THTensor_(resize2d)(rv_,n,n);
+    /* guard against someone passing a correct size, but wrong stride */
+    rv__ = THTensor_(newTransposedContiguous)(rv_);
+    rv_data = THTensor_(data)(rv__);
+    ldvr = n;
+  }
+  THTensor_(resize2d)(re_,n,2);
+  re__ = THTensor_(newContiguous)(re_);
+
+  /* get optimal workspace size */
+  THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
+      NULL, 1, rv_data, ldvr, &wkopt, -1, &info);
+
+  lwork = (int)wkopt;
+  work = THTensor_(newWithSize1d)(lwork);
+
+  THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi),
+      NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero",
+                           THCleanup(THTensor_(free)(re__);
+                                     THTensor_(free)(rv__);
+                                     THTensor_(free)(a);
+                                     THTensor_(free)(wi);
+                                     THTensor_(free)(wr);
+                                     THTensor_(free)(work);),
+                           "geev", info,"");
+
+  /* interleave (wr, wi) into re__'s rows: re[i] = (real, imag) */
+  {
+    real *re_data = THTensor_(data)(re__);
+    real *wi_data = THTensor_(data)(wi);
+    real *wr_data = THTensor_(data)(wr);
+    for (i=0; i<n; i++)
+    {
+      re_data[2*i] = wr_data[i];
+      re_data[2*i+1] = wi_data[i];
+    }
+  }
+
+  if (*jobvr == 'V')
+  {
+    THTensor_(checkTransposed)(rv_);
+    THTensor_(freeCopyTo)(rv__, rv_);
+  }
+  THTensor_(freeCopyTo)(re__, re_);
+  THTensor_(free)(a);
+  THTensor_(free)(wi);
+  THTensor_(free)(wr);
+  THTensor_(free)(work);
+}
+
+void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo)
+{
+  /* Symmetric eigendecomposition via Lapack SYEV.  re_ receives the n
+     eigenvalues; rv_ receives the eigenvectors when jobz is "V".
+     NULL a means "use rv_ as input". */
+  if (a == NULL) a = rv_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+
+  int n, lda, lwork, info;
+  THTensor *work;
+  real wkopt;
+
+  THTensor *rv__ = NULL;
+  THTensor *re__ = NULL;
+
+  rv__ = THTensor_(cloneColumnMajor)(rv_, a);
+
+  n = rv__->size[0];
+  lda = n;
+
+  THTensor_(resize1d)(re_,n);
+  re__ = THTensor_(newContiguous)(re_);
+
+  /* Bug fix: Lapack must write eigenvalues into the contiguous working copy
+     re__, not re_ directly.  Previously, when re__ != re_, the results were
+     written to re_ and then clobbered by freeCopyTo copying stale re__ data
+     back over them. */
+
+  /* get optimal workspace size */
+  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
+                  THTensor_(data)(re__), &wkopt, -1, &info);
+  lwork = (int)wkopt;
+  work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda,
+                  THTensor_(data)(re__), THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero",
+                           THCleanup(THTensor_(free)(rv__);
+                                     THTensor_(free)(re__);
+                                     THTensor_(free)(work);),
+                           "syev", info,"");
+
+  THTensor_(freeCopyTo)(rv__, rv_);
+  THTensor_(freeCopyTo)(re__, re_);
+  THTensor_(free)(work);
+}
+
+void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu)
+{
+  /* Convenience wrapper around gesvd2 that discards the overwritten copy
+     of the input matrix. */
+  THTensor *scratch = THTensor_(new)();
+  THTensor_(gesvd2)(ru_, rs_, rv_, scratch, a, jobu);
+  THTensor_(free)(scratch);
+}
+
+void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu)
+{
+  /* Singular value decomposition A = U * S * V^T via Lapack GESVD.
+     jobu: "A"ll or "S"ome singular vectors.  ru_/rs_/rv_ receive U, the
+     singular values, and V; ra_ receives the overwritten working copy of A.
+     NULL a means "use ra_ as input". */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+
+  int k,m, n, lda, ldu, ldvt, lwork, info;
+  THTensor *work;
+  THTensor *rvf_ = THTensor_(new)();
+  real wkopt;
+
+  THTensor *ra__ = NULL;
+  THTensor *ru__ = NULL;
+  THTensor *rs__ = NULL;
+  THTensor *rv__ = NULL;
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  m = ra__->size[0];
+  n = ra__->size[1];
+  k = (m < n ? m : n);
+
+  lda = m;
+  ldu = m;
+  ldvt = n;
+
+  THTensor_(resize1d)(rs_,k);
+  THTensor_(resize2d)(rvf_,ldvt,n);
+  if (*jobu == 'A')
+    THTensor_(resize2d)(ru_,m,ldu);
+  else
+    THTensor_(resize2d)(ru_,k,ldu);
+
+  THTensor_(checkTransposed)(ru_);
+
+  /* guard against someone passing a correct size, but wrong stride */
+  ru__ = THTensor_(newTransposedContiguous)(ru_);
+  rs__ = THTensor_(newContiguous)(rs_);
+  rv__ = THTensor_(newContiguous)(rvf_);
+
+  /* workspace query (lwork = -1), then the actual decomposition */
+  THLapack_(gesvd)(jobu[0],jobu[0],
+		   m,n,THTensor_(data)(ra__),lda,
+		   THTensor_(data)(rs__),
+		   THTensor_(data)(ru__),
+		   ldu,
+		   THTensor_(data)(rv__), ldvt,
+		   &wkopt, -1, &info);
+  lwork = (int)wkopt;
+  work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(gesvd)(jobu[0],jobu[0],
+		   m,n,THTensor_(data)(ra__),lda,
+		   THTensor_(data)(rs__),
+		   THTensor_(data)(ru__),
+		   ldu,
+		   THTensor_(data)(rv__), ldvt,
+		   THTensor_(data)(work),lwork, &info);
+
+  THLapackCheckWithCleanup(" Lapack Error %s : %d superdiagonals failed to converge.",
+                           THCleanup(
+                               THTensor_(free)(ru__);
+                               THTensor_(free)(rs__);
+                               THTensor_(free)(rv__);
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);),
+                           "gesvd", info,"");
+
+  /* with jobu == 'S' only the first k columns of V are valid */
+  if (*jobu == 'S')
+    THTensor_(narrow)(rv__,NULL,1,0,k);
+
+  THTensor_(freeCopyTo)(ru__, ru_);
+  THTensor_(freeCopyTo)(rs__, rs_);
+  THTensor_(freeCopyTo)(rv__, rvf_);
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+
+  if (*jobu == 'S') {
+    THTensor_(narrow)(rvf_,NULL,1,0,k);
+  }
+  THTensor_(resizeAs)(rv_, rvf_);
+  THTensor_(copy)(rv_, rvf_);
+  THTensor_(free)(rvf_);
+}
+
+void THTensor_(getri)(THTensor *ra_, THTensor *a)
+{
+  /* Matrix inverse via LU factorization: Lapack GETRF followed by GETRI.
+     NULL a means "use ra_ as input"; ra_ receives the inverse. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int m, n, lda, info, lwork;
+  real wkopt;
+  THIntTensor *ipiv;
+  THTensor *work;
+  THTensor *ra__ = NULL;
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  m = ra__->size[0];
+  n = ra__->size[1];
+  lda = m;
+  ipiv = THIntTensor_newWithSize1d((long)m);
+
+  /* Run LU */
+  THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info);
+  THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THIntTensor_free(ipiv);),
+                           "getrf", info, info);
+
+  /* Run inverse (workspace query with lwork = -1, then the real call) */
+  THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info);
+  lwork = (int)wkopt;
+  work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info);
+  THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);
+                               THIntTensor_free(ipiv);),
+                           "getri", info, info);
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+  THIntTensor_free(ipiv);
+}
+
+void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo)
+{
+  /* Zero out the off-triangle of a square matrix, keeping only the triangle
+     named by uplo ("U" keeps upper, "L" keeps lower) plus the diagonal.
+     NOTE(review): indexing p[n*i + j] assumes the column-major layout the
+     Lapack wrappers in this file produce — confirm before calling elsewhere. */
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int n = a->size[0];
+
+  /* Build full matrix */
+  real *p = THTensor_(data)(a);
+  long i, j;
+
+  /* Upper Triangular Case */
+  if (uplo[0] == 'U')
+  {
+    /* Clear lower triangle (excluding diagonals) */
+    for (i=0; i<n; i++) {
+     for (j=i+1; j<n; j++) {
+        p[n*i + j] = 0;
+      }
+    }
+  }
+  /* Lower Triangular Case */
+  else if (uplo[0] == 'L')
+  {
+    /* Clear upper triangle (excluding diagonals) */
+    for (i=0; i<n; i++) {
+      for (j=0; j<i; j++) {
+        p[n*i + j] = 0;
+      }
+    }
+  }
+}
+
+void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo)
+{
+  /* Symmetrize a square matrix by mirroring the triangle named by uplo onto
+     the other half: p[n*i+j] = p[n*j+i] over the off-triangle elements.
+     (The original inline comments said "Clear", but this function copies.)
+     NOTE(review): indexing assumes the same layout as clearUpLoTriangle. */
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int n = a->size[0];
+
+  /* Build full matrix */
+  real *p = THTensor_(data)(a);
+  long i, j;
+
+  /* Upper Triangular Case */
+  if (uplo[0] == 'U')
+  {
+    /* Mirror one triangle into the other (excluding diagonals) */
+    for (i=0; i<n; i++) {
+     for (j=i+1; j<n; j++) {
+        p[n*i + j] = p[n*j+i];
+      }
+    }
+  }
+  /* Lower Triangular Case */
+  else if (uplo[0] == 'L')
+  {
+    /* Mirror one triangle into the other (excluding diagonals) */
+    for (i=0; i<n; i++) {
+      for (j=0; j<i; j++) {
+        p[n*i + j] = p[n*j+i];
+      }
+    }
+  }
+}
+
+void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo)
+{
+  /* Cholesky factorization of a symmetric positive-definite matrix via
+     Lapack POTRF.  uplo selects the "U"pper or "L"ower factor; NULL a means
+     "use ra_ as input".  The unused triangle of ra_ is zeroed on return. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int n, lda, info;
+  THTensor *ra__ = NULL;
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  n = ra__->size[0];
+  lda = n;
+
+  /* Run Factorization */
+  THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
+  THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
+                           THCleanup(THTensor_(free)(ra__);),
+                           "potrf", info, info);
+
+  /* POTRF only touches one triangle; zero the other so the result is clean */
+  THTensor_(clearUpLoTriangle)(ra__, uplo);
+  THTensor_(freeCopyTo)(ra__, ra_);
+}
+
+void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo)
+{
+  /* Solve A * X = B given the Cholesky factor of A (as produced by potrf),
+     via Lapack POTRS.  NULL b means "use rb_ as input"; rb_ receives X. */
+  if (b == NULL) b = rb_;
+
+  /* Consistency fix: validate dimensionality and compatibility up front,
+     matching the checks performed by the sibling solvers (gesv, trtrs). */
+  THArgCheck(a->nDimension == 2, 2, "A should be 2 dimensional");
+  THArgCheck(b->nDimension == 2, 1, "B should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 2, "A should be square");
+  THArgCheck(b->size[0] == a->size[0], 2, "A,b size incompatible");
+
+  int n, nrhs, lda, ldb, info;
+  THTensor *ra__; // working version of A matrix to be passed into lapack POTRS
+  THTensor *rb__; // working version of B matrix to be passed into lapack POTRS
+
+  /* a is read-only here, so clone into a scratch tensor (NULL result) */
+  ra__ = THTensor_(cloneColumnMajor)(NULL, a);
+  rb__ = THTensor_(cloneColumnMajor)(rb_, b);
+
+  n    = (int)ra__->size[0];
+  nrhs = (int)rb__->size[1];
+  lda  = n;
+  ldb  = n;
+
+  THLapack_(potrs)(uplo[0], n, nrhs, THTensor_(data)(ra__),
+                   lda, THTensor_(data)(rb__), ldb, &info);
+
+  /* On info != 0, free the temporaries and raise a TH error. */
+  THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(rb__);),
+                           "potrs", info, info);
+
+  THTensor_(free)(ra__);
+  THTensor_(freeCopyTo)(rb__, rb_);
+}
+
+void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo)
+{
+  /* Inverse of a symmetric positive-definite matrix from its Cholesky
+     factor, via Lapack POTRI.  NULL a means "use ra_ as input".  The full
+     (symmetric) inverse is reconstructed by mirroring the computed triangle. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int n, lda, info;
+  THTensor *ra__ = NULL;
+
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  n = ra__->size[0];
+  lda = n;
+
+  /* Run inverse */
+  THLapack_(potri)(uplo[0], n, THTensor_(data)(ra__), lda, &info);
+  THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized",
+                           THCleanup(THTensor_(free)(ra__);),
+                           "potri", info, info);
+
+  /* POTRI only fills one triangle; mirror it to produce the full inverse */
+  THTensor_(copyUpLoTriangle)(ra__, uplo);
+  THTensor_(freeCopyTo)(ra__, ra_);
+}
+
+/*
+ Computes the Cholesky factorization with complete pivoting of a real symmetric
+ positive semidefinite matrix.
+
+ Args:
+ * `ra_`    - result Tensor in which to store the factor U or L from the
+              Cholesky factorization.
+ * `rpiv_`  - result IntTensor containing sparse permutation matrix P, encoded
+              as P[rpiv_[k], k] = 1.
+ * `a`      - input Tensor; the input matrix to factorize.
+ * `uplo`   - string; specifies whether the upper or lower triangular part of
+              the symmetric matrix A is stored. "U"/"L" for upper/lower
+              triangular.
+ * `tol`    - double; user defined tolerance, or < 0 for automatic choice.
+              The algorithm terminates when the pivot <= tol.
+ */
+void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) {
+  /* See the block comment above: pivoted Cholesky via Lapack PSTRF. */
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+  THArgCheck(a->size[0] == a->size[1], 1, "A should be square");
+
+  int n = a->size[0];
+
+  THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+  THIntTensor_resize1d(rpiv_, n);
+
+  // Allocate working tensor (PSTRF requires a workspace of size 2*n)
+  THTensor *work = THTensor_(newWithSize1d)(2 * n);
+
+  // Run Cholesky factorization
+  int lda = n;
+  int rank, info;
+
+  THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda,
+                   THIntTensor_data(rpiv_), &rank, tol,
+                   THTensor_(data)(work), &info);
+
+  THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);),
+                           "pstrf", info,"");
+
+  /* keep only the triangle named by uplo, like potrf does */
+  THTensor_(clearUpLoTriangle)(ra__, uplo);
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+}
+
+/*
+  Perform a QR decomposition of a matrix.
+
+  In LAPACK, two parts of the QR decomposition are implemented as two separate
+  functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped
+  directly, below - but to make the common usage convenient, we also provide
+  this function, which calls them both and returns the results in a more
+  intuitive form.
+
+  Args:
+  * `rq_` - result Tensor in which to store the Q part of the decomposition.
+  * `rr_` - result Tensor in which to store the R part of the decomposition.
+  * `a`   - input Tensor; the matrix to decompose.
+
+*/
+void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a)
+{
+  /* See the block comment above: QR decomposition built from geqrf + orgqr.
+     rq_ receives the m x k matrix Q, rr_ the k x n upper-triangular R,
+     where k = min(m, n). */
+  int m = a->size[0];
+  int n = a->size[1];
+  int k = (m < n ? m : n);
+  THTensor *ra_ = THTensor_(new)();
+  THTensor *rtau_ = THTensor_(new)();
+  THTensor *rr__ = THTensor_(new)();
+  /* factor A into reflectors (ra_) + magnitudes (rtau_) */
+  THTensor_(geqrf)(ra_, rtau_, a);
+  /* R = upper triangle of the first k rows of the factored matrix */
+  THTensor_(resize2d)(rr__, k, ra_->size[1]);
+  THTensor_(narrow)(rr__, ra_, 0, 0, k);
+  THTensor_(triu)(rr_, rr__, 0);
+  /* Q = product of the elementary reflectors, truncated to k columns */
+  THTensor_(resize2d)(rq_, ra_->size[0], k);
+  THTensor_(orgqr)(rq_, ra_, rtau_);
+  THTensor_(narrow)(rq_, rq_, 1, 0, k);
+  THTensor_(free)(ra_);
+  THTensor_(free)(rtau_);
+  THTensor_(free)(rr__);
+}
+
+/*
+  The geqrf function does the main work of QR-decomposing a matrix.
+  However, rather than producing a Q matrix directly, it produces a sequence of
+  elementary reflectors which may later be composed to construct Q - for example
+  with the orgqr function, below.
+
+  Args:
+  * `ra_`   - Result matrix which will contain:
+              i)  The elements of R, on and above the diagonal.
+              ii) Directions of the reflectors implicitly defining Q.
+  * `rtau_` - Result tensor which will contain the magnitudes of the reflectors
+              implicitly defining Q.
+  * `a`     - Input matrix, to decompose. If NULL, `ra_` is used as input.
+
+  For further details, please see the LAPACK documentation.
+
+*/
+void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a)
+{
+  /* See the block comment above: QR factorization via Lapack GEQRF.
+     Bug fix: the original read "if (a == NULL) ra_ = a;", which nulled out
+     the result tensor instead of defaulting the input to ra_ — the intent
+     documented above and the pattern used by the sibling orgqr/ormqr. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+
+  THTensor *ra__ = NULL;
+
+  /* Prepare the input for LAPACK, making a copy if necessary. */
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  int m = ra__->size[0];
+  int n = ra__->size[1];
+  int k = (m < n ? m : n);
+  int lda = m;
+  THTensor_(resize1d)(rtau_, k);
+
+  /* Dry-run to query the suggested size of the workspace. */
+  int info = 0;
+  real wkopt = 0;
+  THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
+                   THTensor_(data)(rtau_),
+                   &wkopt, -1, &info);
+
+  /* Allocate the workspace and call LAPACK to do the real work. */
+  int lwork = (int)wkopt;
+  THTensor *work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda,
+                   THTensor_(data)(rtau_),
+                   THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);),
+                           "geqrf", info,"");
+
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+}
+
+/*
+  The orgqr function allows reconstruction of a matrix Q with orthogonal
+  columns, from a sequence of elementary reflectors, such as is produced by the
+  geqrf function.
+
+  Args:
+  * `ra_` - result Tensor, which will contain the matrix Q.
+  * `a`   - input Tensor, which should be a matrix with the directions of the
+            elementary reflectors below the diagonal. If NULL, `ra_` is used as
+            input.
+  * `tau` - input Tensor, containing the magnitudes of the elementary
+            reflectors.
+
+  For further details, please see the LAPACK documentation.
+
+*/
+void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau)
+{
+  /* See the block comment above: materialize Q from geqrf's reflectors
+     via Lapack ORGQR. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+
+  THTensor *ra__ = NULL;
+  ra__ = THTensor_(cloneColumnMajor)(ra_, a);
+
+  int m = ra__->size[0];
+  int n = ra__->size[1];
+  int k = tau->size[0];
+  int lda = m;
+
+  /* Dry-run to query the suggested size of the workspace. */
+  int info = 0;
+  real wkopt = 0;
+  THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
+                   THTensor_(data)(tau),
+                   &wkopt, -1, &info);
+
+  /* Allocate the workspace and call LAPACK to do the real work. */
+  int lwork = (int)wkopt;
+  THTensor *work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda,
+                   THTensor_(data)(tau),
+                   THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);),
+                           "orgqr", info,"");
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+}
+
+/*
+  The ormqr function multiplies Q with another matrix from a sequence of
+  elementary reflectors, such as is produced by the geqrf function.
+
+  Args:
+  * `ra_`   - result Tensor, which will contain the matrix Q' c.
+  * `a`     - input Tensor, which should be a matrix with the directions of the
+              elementary reflectors below the diagonal. If NULL, `ra_` is used as
+              input.
+  * `tau`   - input Tensor, containing the magnitudes of the elementary
+              reflectors.
+  * `c`     - input Tensor, containing the matrix to be multiplied.
+  * `side`  - char, determining whether c is left- or right-multiplied with Q.
+  * `trans` - char, determining whether to transpose Q before multiplying.
+
+  For further details, please see the LAPACK documentation.
+
+*/
+void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans)
+{
+  /* See the block comment above: multiply Q (from geqrf's reflectors) with
+     c via Lapack ORMQR; ra_ receives the product.
+     NOTE(review): c is cloned column-major into ra__, but a's data is passed
+     to Lapack directly without a column-major clone — presumably callers
+     supply a already in Lapack layout (as geqrf's output is); verify. */
+  if (a == NULL) a = ra_;
+  THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional");
+
+  THTensor *ra__ = NULL;
+  ra__ = THTensor_(cloneColumnMajor)(ra_, c);
+
+  int m = c->size[0];
+  int n = c->size[1];
+  int k = tau->size[0];
+  int lda;
+  /* ORMQR's lda depends on which side Q is applied from */
+  if (*side == 'L')
+  {
+    lda = m;
+  }
+  else
+  {
+    lda = n;
+  }
+  int ldc = m;
+
+  /* Dry-run to query the suggested size of the workspace. */
+  int info = 0;
+  real wkopt = 0;
+  THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
+                   THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
+                   &wkopt, -1, &info);
+
+  /* Allocate the workspace and call LAPACK to do the real work. */
+  int lwork = (int)wkopt;
+  THTensor *work = THTensor_(newWithSize1d)(lwork);
+  THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda,
+                   THTensor_(data)(tau), THTensor_(data)(ra__), ldc,
+                   THTensor_(data)(work), lwork, &info);
+
+  THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. info = %i",
+                           THCleanup(
+                               THTensor_(free)(ra__);
+                               THTensor_(free)(work);),
+                           "ormqr", info,"");
+  THTensor_(freeCopyTo)(ra__, ra_);
+  THTensor_(free)(work);
+}
+
+#endif
diff --git a/lib/TH/generic/THTensorLapack.h b/lib/TH/generic/THTensorLapack.h
new file mode 100644
index 0000000..1a19977
--- /dev/null
+++ b/lib/TH/generic/THTensorLapack.h
@@ -0,0 +1,22 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorLapack.h"
+#else
+
+/* LAPACK-backed linear-algebra routines, instantiated once per real type.
+   The leading r*_ arguments receive results; `const char *` parameters are
+   single-character LAPACK mode flags (e.g. uplo, trans, jobz, side). */
+TH_API void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_);
+TH_API void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_, const char *uplo, const char *trans, const char *diag);
+TH_API void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_);
+TH_API void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobz, const char *uplo);
+TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr);
+TH_API void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobu);
+TH_API void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *jobu);
+TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a);
+TH_API void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo);
+TH_API void THTensor_(potrs)(THTensor *rb_, THTensor *b_, THTensor *a_,  const char *uplo);
+TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo);
+TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a);
+TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a);
+TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau);
+TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans);
+TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol);
+
+#endif
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
new file mode 100644
index 0000000..19843a8
--- /dev/null
+++ b/lib/TH/generic/THTensorMath.c
@@ -0,0 +1,2509 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorMath.c"
+#else
+
+#define TH_OMP_OVERHEAD_THRESHOLD 100000
+
+/* Sets every element of r_ to `value`.  TH_TENSOR_APPLY iterates in
+   contiguous runs; each run is filled in one vectorized call and the
+   trailing `break` skips the macro's per-element inner loop for that run. */
+void THTensor_(fill)(THTensor *r_, real value)
+{
+  TH_TENSOR_APPLY(real, r_,
+                  THVector_(fill)(r__data, value, r__size); break;);
+}
+
+/* Zeroes every element of r_.  Same vectorized chunk-at-a-time pattern
+   as THTensor_(fill), with 0 as the fill value. */
+void THTensor_(zero)(THTensor *r_)
+{
+  TH_TENSOR_APPLY(real, r_,
+                  THVector_(fill)(r__data, 0, r__size); break;);
+}
+
+/* Writes `value` into the elements of `tensor` whose corresponding `mask`
+   entry is 1.  Mask entries must be 0 or 1; anything larger raises an error.
+   The macro's iterator state is freed explicitly before THError (which does
+   not return) so it is not leaked. */
+void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value)
+{
+  TH_TENSOR_APPLY2(real, tensor, unsigned char, mask,
+                   if (*mask_data > 1)
+                   {
+                     THFree(mask_counter);
+                     THFree(tensor_counter);
+                     THError("Mask tensor can take 0 and 1 values only");
+                   }
+                   else if (*mask_data == 1)
+                   {
+                     *tensor_data = value;
+                   });
+}
+
+/* Copies consecutive elements of `src` (in contiguous order) into the
+   positions of `tensor` where `mask` is 1.  Errors if sizes of tensor and
+   mask differ, if the mask holds values other than 0/1, or if the mask
+   selects more elements than src provides.  All error paths free the
+   contiguous copy of src and the macro iterator state first. */
+void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src )
+{
+  THTensor *srct = THTensor_(newContiguous)(src);
+  real *src_data = THTensor_(data)(srct);
+  long cntr = 0;                          /* elements consumed from src */
+  long nelem = THTensor_(nElement)(srct);
+  if (THTensor_(nElement)(tensor) != THByteTensor_nElement(mask))
+  {
+    THTensor_(free)(srct);
+    THError("Number of elements of destination tensor != Number of elements in mask");
+  }
+  TH_TENSOR_APPLY2(real, tensor, unsigned char, mask,
+                   if (*mask_data > 1)
+                   {
+                     THTensor_(free)(srct);
+                     THFree(mask_counter);
+                     THFree(tensor_counter);
+                     THError("Mask tensor can take 0 and 1 values only");
+                   }
+                   else if (*mask_data == 1)
+                   {
+                     if (cntr == nelem)
+                     {
+                       THTensor_(free)(srct);
+                       THFree(mask_counter);
+                       THFree(tensor_counter);
+                       THError("Number of elements of src < number of ones in mask");
+                     }
+                     *tensor_data = *src_data;
+                     src_data++;
+                     cntr++;
+                   });
+  THTensor_(free)(srct);
+}
+
+/* Gathers the elements of `src` where `mask` is 1 into `tensor`, resized to
+   a 1-D tensor of length sum(mask).  Mask entries other than 0/1 raise an
+   error (after freeing the macro iterator state). */
+void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask)
+{
+  long numel = THByteTensor_sumall(mask);  /* count of selected elements */
+  real *tensor_data;
+
+  THTensor_(resize1d)(tensor,numel);
+  tensor_data = THTensor_(data)(tensor);
+  TH_TENSOR_APPLY2(real, src, unsigned char, mask,
+                   if (*mask_data > 1)
+                   {
+                     THFree(mask_counter);
+                     THFree(src_counter);
+                     THError("Mask tensor can take 0 and 1 values only");
+                   }
+                   else if (*mask_data == 1)
+                   {
+                     *tensor_data = *src_data;
+                     tensor_data++;
+                   });
+}
+
+// Finds non-zero elements of a tensor and returns their subscripts.
+// subscript is resized to (numel x nDimension); each row holds the
+// per-dimension coordinates of one non-zero element, recovered from the
+// running linear index `i` by repeated div/mod against the sizes.
+void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor)
+{
+  long numel = 0;
+  long *subscript_data;
+  long i = 0;     /* linear element index, advanced once per element */
+  long dim;
+  long div = 1;
+
+  /* First Pass to determine size of subscripts */
+  TH_TENSOR_APPLY(real, tensor,
+                  if (*tensor_data != 0) {
+                    ++numel;
+                  });
+  THLongTensor_resize2d(subscript, numel, tensor->nDimension);
+
+  /* Second pass populates subscripts */
+  subscript_data = THLongTensor_data(subscript);
+  TH_TENSOR_APPLY(real, tensor,
+                  if (*tensor_data != 0) {
+                    div = 1;
+
+                    /* decompose linear index i, innermost dimension first */
+                    for (dim = tensor->nDimension - 1; dim >= 0; dim--) {
+                      *(subscript_data + dim) = (i/div) % tensor->size[dim];
+                      div *= tensor->size[dim];
+                    }
+
+                    subscript_data += tensor->nDimension;
+                  }
+                  ++i;);
+}
+
+/* tensor = src selected along dimension `dim` by the 1-based indices in
+   `index` (a vector).  Output matches src's shape except size[dim] becomes
+   index:nElement().  A contiguous dim==0 fast path copies whole rows (with
+   OpenMP above TH_OMP_OVERHEAD_THRESHOLD); otherwise slices are copied one
+   at a time. */
+void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index)
+{
+  long i, numel;
+  THLongStorage *newSize;
+  THTensor *tSlice, *sSlice;
+  long *index_data;
+  real *tensor_data, *src_data;
+
+  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
+  THArgCheck(dim < src->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim+1);
+  THArgCheck(src->nDimension > 0,2,"Source tensor is empty");
+
+  numel = THLongTensor_nElement(index);
+
+  /* Resize output: same as src but with `numel` entries along dim. */
+  newSize = THLongStorage_newWithSize(src->nDimension);
+  THLongStorage_rawCopy(newSize,src->size);
+  newSize->data[dim] = numel;
+  THTensor_(resize)(tensor,newSize,NULL);
+  THLongStorage_free(newSize);
+
+  index = THLongTensor_newContiguous(index);
+  index_data = THLongTensor_data(index);
+
+  if (dim == 0 && THTensor_(isContiguous)(src) && THTensor_(isContiguous)(tensor))
+  {
+    tensor_data = THTensor_(data)(tensor);
+    src_data = THTensor_(data)(src);
+    long rowsize = THTensor_(nElement)(src) / src->size[0];
+
+    // check that the indices are within range
+    long max = src->size[0];
+    for (i=0; i<numel; i++) {
+      if (index_data[i] < 1 || index_data[i] > max) {
+        THLongTensor_free(index);
+        THError("index out of range");
+      }
+    }
+
+    if (src->nDimension == 1) {
+      #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<numel; i++)
+        tensor_data[i] = src_data[index_data[i]-1];
+    } else {
+      #pragma omp parallel for if(numel*rowsize > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<numel; i++)
+        memcpy(tensor_data + i*rowsize, src_data + (index_data[i]-1)*rowsize, rowsize*sizeof(real));
+    }
+  }
+  else if (src->nDimension == 1)
+  {
+    for (i=0; i<numel; i++)
+      THTensor_(set1d)(tensor,i,THTensor_(get1d)(src,index_data[i]-1));
+  }
+  else
+  {
+    /* General case: copy each selected slice individually. */
+    for (i=0; i<numel; i++)
+    {
+      tSlice = THTensor_(new)();
+      sSlice = THTensor_(new)();
+      THTensor_(select)(tSlice, tensor, dim, i);
+      THTensor_(select)(sSlice, src, dim, index_data[i]-1);
+      THTensor_(copy)(tSlice, sSlice);
+      THTensor_(free)(tSlice);
+      THTensor_(free)(sSlice);
+    }
+  }
+
+  THLongTensor_free(index);
+}
+
+/* Copies slice i of `src` (along dim) into slice index[i] of `tensor`,
+   using 1-based indices.  index must be a vector with exactly
+   src:size(dim) entries. */
+void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src)
+{
+  long i, numel;
+  THTensor *tSlice, *sSlice;
+  long *index_data;
+
+  numel = THLongTensor_nElement(index);
+  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
+  THArgCheck(dim < src->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim+1);
+  THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)");
+
+  index = THLongTensor_newContiguous(index);
+  index_data = THLongTensor_data(index);
+
+  if (tensor->nDimension > 1 )
+  {
+    /* Reuse the same two slice views across iterations; select() rebinds them. */
+    tSlice = THTensor_(new)();
+    sSlice = THTensor_(new)();
+
+    for (i=0; i<numel; i++)
+    {
+      THTensor_(select)(tSlice, tensor, dim, index_data[i]-1);
+      THTensor_(select)(sSlice, src, dim, i);
+      THTensor_(copy)(tSlice, sSlice);
+    }
+
+    THTensor_(free)(tSlice);
+    THTensor_(free)(sSlice);
+  }
+  else
+  {
+    /* 1-D destination: scalar element copies. */
+    for (i=0; i<numel; i++)
+    {
+      THTensor_(set1d)(tensor,index_data[i]-1,THTensor_(get1d)(src,i));
+    }
+  }
+  THLongTensor_free(index);
+}
+
+/* Accumulates slice i of `src` (along dim) into slice index[i] of `tensor`
+   (1-based indices): tensor[index[i]] += src[i].  index must be a vector
+   with exactly src:size(dim) entries. */
+void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src)
+{
+  long i, numel;
+  THTensor *tSlice, *sSlice;
+  long *index_data;
+
+  numel = THLongTensor_nElement(index);
+  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
+  THArgCheck(dim < src->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim+1);
+  THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)");
+
+  index = THLongTensor_newContiguous(index);
+  index_data = THLongTensor_data(index);
+
+  if (tensor->nDimension > 1 )
+  {
+    tSlice = THTensor_(new)();
+    sSlice = THTensor_(new)();
+
+    for (i=0; i<numel; i++)
+    {
+      THTensor_(select)(tSlice, tensor, dim, index_data[i]-1);
+      THTensor_(select)(sSlice, src, dim, i);
+      /* tSlice += 1.0 * sSlice */
+      THTensor_(cadd)(tSlice, tSlice, 1.0, sSlice);
+    }
+
+    THTensor_(free)(tSlice);
+    THTensor_(free)(sSlice);
+  }
+  else
+  {
+    for (i=0; i<numel; i++)
+    {
+      THTensor_(set1d)(tensor,index_data[i]-1,THTensor_(get1d)(src,i) + THTensor_(get1d)(tensor,index_data[i]-1));
+    }
+  }
+  THLongTensor_free(index);
+}
+
+/* Fills each slice of `tensor` selected (1-based) by `index` along dim
+   with the constant `val`. */
+void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val)
+{
+  long i, numel;
+  THTensor *tSlice;
+  long *index_data;
+
+  numel = THLongTensor_nElement(index);
+  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
+  THArgCheck(dim < tensor->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim+1);
+
+  index = THLongTensor_newContiguous(index);
+  index_data = THLongTensor_data(index);
+
+  for (i=0; i<numel; i++)
+  {
+    if (tensor->nDimension > 1 )
+    {
+      tSlice = THTensor_(new)();
+      THTensor_(select)(tSlice, tensor,dim,index_data[i]-1);
+      THTensor_(fill)(tSlice, val);
+      THTensor_(free)(tSlice);
+    }
+    else
+    {
+      THTensor_(set1d)(tensor,index_data[i]-1,val);
+    }
+  }
+  THLongTensor_free(index);
+}
+
+/* Gathers values along `dim`: tensor[...][i][...] = src[...][index[...][i][...]-1][...],
+   with 1-based indices.  tensor, src and index must all have the same number
+   of dimensions; out-of-range indices raise an error after freeing the
+   macro's counter state. */
+void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index)
+{
+  long elems_per_row, i, idx;
+
+  THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2,
+             "Input tensor must have same dimensions as output tensor");
+  THArgCheck(dim < THTensor_(nDimension)(tensor), 3, "Index dimension is out of bounds");
+  THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4,
+             "Index tensor must have same dimensions as input tensor");
+
+  elems_per_row = THLongTensor_size(index, dim);
+
+  TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim,
+                       for (i = 0; i < elems_per_row; ++i)
+                       {
+                         idx = *(index_data + i*index_stride);
+                         if (idx < 1 || idx > src_size)
+                         {
+                           THFree(TH_TENSOR_DIM_APPLY_counter);
+                           THError("Invalid index in gather");
+                         }
+                         *(tensor_data + i*tensor_stride) = src_data[(idx - 1) * src_stride];
+                       })
+}
+
+/* Scatters values along `dim` (inverse of gather):
+   tensor[...][index[...][i][...]-1][...] = src[...][i][...], 1-based indices.
+   tensor, index and src must have equal dimensionality; bad indices raise an
+   error after freeing the macro's counter state. */
+void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src)
+{
+  long elems_per_row, i, idx;
+
+  THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds");
+  THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3,
+             "Index tensor must have same dimensions as output tensor");
+  THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4,
+             "Input tensor must have same dimensions as output tensor");
+
+  elems_per_row = THLongTensor_size(index, dim);
+
+  TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim,
+                       for (i = 0; i < elems_per_row; ++i)
+                       {
+                         idx = *(index_data + i*index_stride);
+                         if (idx < 1 || idx > tensor_size)
+                         {
+                           THFree(TH_TENSOR_DIM_APPLY_counter);
+                           THError("Invalid index in scatter");
+                         }
+                         tensor_data[(idx - 1) * tensor_stride] = *(src_data + i*src_stride);
+                       })
+}
+
+/* Like scatter, but writes the constant `val` instead of values from a
+   source tensor: tensor[...][index[...][i][...]-1][...] = val. */
+void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val)
+{
+  long elems_per_row, i, idx;
+
+  THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds");
+  THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3,
+             "Index tensor must have same dimensions as output tensor");
+
+  elems_per_row = THLongTensor_size(index, dim);
+
+  TH_TENSOR_DIM_APPLY2(real, tensor, long, index, dim,
+                       for (i = 0; i < elems_per_row; ++i)
+                       {
+                         idx = *(index_data + i*index_stride);
+                         if (idx < 1 || idx > tensor_size)
+                         {
+                           THFree(TH_TENSOR_DIM_APPLY_counter);
+                           THError("Invalid index in scatter");
+                         }
+                         tensor_data[(idx - 1) * tensor_stride] = val;
+                       })
+}
+
+/* Inner product of tensor and src, accumulated in accreal.  Instead of the
+   macro's per-element walk, each iteration hands the longest run available
+   in both tensors to BLAS dot, then manually advances the macro's counters
+   (tensor_i/src_i and data pointers) past that run before breaking out of
+   the inner loop. */
+accreal THTensor_(dot)(THTensor *tensor, THTensor *src)
+{
+  accreal sum = 0;
+  /* we use a trick here. careful with that. */
+  TH_TENSOR_APPLY2(real, tensor, real, src,
+                   long sz = (tensor_size-tensor_i < src_size-src_i ? tensor_size-tensor_i : src_size-src_i);
+                   sum += THBlas_(dot)(sz, src_data, src_stride, tensor_data, tensor_stride);
+                   tensor_i += sz;
+                   src_i += sz;
+                   tensor_data += sz*tensor_stride;
+                   src_data += sz*src_stride;
+                   break;);
+  return sum;
+}
+
+#undef th_isnan
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+/* Early-exit helper for the min/max reductions below: once a NaN is seen
+   the result cannot change, so break out of the TH_TENSOR_APPLY inner loop.
+   Only floating-point types can hold NaN; for integral types this expands
+   to nothing.  Fix: the macro previously ignored its parameter and
+   referenced a variable literally named `value` from the caller's scope
+   (it only worked because both callers happen to pass a local called
+   `value`); use the macro argument instead. */
+#define th_isnan(val) \
+if (isnan(val)) break;
+#else
+#define th_isnan(val)
+#endif
+
+/* Returns the minimum element.  The comparison is written as
+   !(value >= theMin) rather than (value < theMin) so that a NaN (for which
+   all comparisons are false) is adopted as the running result; th_isnan then
+   breaks out of the inner loop since the answer can no longer change. */
+real THTensor_(minall)(THTensor *tensor)
+{
+  real theMin;
+  real value;
+
+  THArgCheck(tensor->nDimension > 0, 1, "tensor must have one dimension");
+  theMin = THTensor_(data)(tensor)[0];
+  TH_TENSOR_APPLY(real, tensor,
+                  value = *tensor_data;
+                  /* This is not the same as value<theMin in the case of NaNs */
+                  if(!(value >= theMin))
+                  {
+                    theMin = value;
+                    th_isnan(value)
+                  });
+  return theMin;
+}
+
+/* Returns the maximum element.  Mirror image of minall: !(value <= theMax)
+   is true for NaN, so a NaN is adopted and th_isnan short-circuits the
+   remaining scan. */
+real THTensor_(maxall)(THTensor *tensor)
+{
+  real theMax;
+  real value;
+
+  THArgCheck(tensor->nDimension > 0, 1, "tensor must have one dimension");
+  theMax = THTensor_(data)(tensor)[0];
+  TH_TENSOR_APPLY(real, tensor,
+                  value = *tensor_data;
+                  /* This is not the same as value>theMax in the case of NaNs */
+                  if(!(value <= theMax))
+                  {
+                    theMax = value;
+                    th_isnan(value)
+                  });
+  return theMax;
+}
+
+/* Sum of all elements, accumulated in the wider accreal type. */
+accreal THTensor_(sumall)(THTensor *tensor)
+{
+  accreal sum = 0;
+  TH_TENSOR_APPLY(real, tensor, sum += *tensor_data;);
+  return sum;
+}
+
+/* Product of all elements, accumulated in the wider accreal type. */
+accreal THTensor_(prodall)(THTensor *tensor)
+{
+  accreal prod = 1;
+  TH_TENSOR_APPLY(real, tensor, prod *= *tensor_data;);
+  return prod;
+}
+
+/* r_ = t + value (elementwise).  When both tensors are contiguous the loop
+   runs over raw pointers, parallelized with OpenMP above the size threshold;
+   otherwise it falls back to the generic strided apply. */
+void THTensor_(add)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = tp[i] + value;
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+  }
+}
+
+/* r_ = t - value, implemented as add with the negated scalar. */
+void THTensor_(sub)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(add)(r_, t, -value);
+}
+
+/* r_ = t * value (elementwise).  Contiguous fast path with optional OpenMP,
+   generic strided apply otherwise. */
+void THTensor_(mul)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = tp[i] * value;
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+  }
+}
+
+/* r_ = t / value (elementwise).  Contiguous fast path with optional OpenMP,
+   generic strided apply otherwise. */
+void THTensor_(div)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = tp[i] / value;
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
+  }
+}
+
+/* r_ = fmod(t, value): C-style remainder, whose sign follows the dividend. */
+void THTensor_(fmod)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = fmod(tp[i], value);
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = fmod(*t_data, value););
+  }
+}
+
+/* r_ = t mod value using floored division (t - value*floor(t/value)), so the
+   result's sign follows the divisor; division by zero yields NaN instead of
+   trapping. */
+void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = (value == 0)? NAN : tp[i] - value * floor(tp[i] / value);
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
+  }
+}
+
+/* r_ = t clamped elementwise into [min_value, max_value]. */
+void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      real t_val;
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
+  }
+}
+
+/* r_ = t + value * src (elementwise).  The in-place contiguous case
+   (r_ == t) is delegated to BLAS axpy; the out-of-place contiguous case uses
+   an OpenMP loop; anything else falls back to the strided apply. */
+void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+    if(r_ == t) {
+      THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1);
+    } else {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i< sz; i++)
+          rp[i] = tp[i] + value * sp[i];
+    }
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;);
+  }
+}
+
+/* r_ = t - value * src, implemented as cadd with the negated scalar. */
+void THTensor_(csub)(THTensor *r_, THTensor *t, real value,THTensor *src)
+{
+  THTensor_(cadd)(r_, t, -value, src);
+}
+
+/* r_ = t * src (elementwise).  Contiguous fast path with optional OpenMP,
+   generic strided apply otherwise. */
+void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+        rp[i] = tp[i] * sp[i];
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;);
+  }
+}
+
+/* r_ = t ^ src (elementwise pow). */
+void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+        rp[i] = pow(tp[i], sp[i]);
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data););
+  }
+}
+
+/* r_ = t / src (elementwise). */
+void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+        rp[i] = tp[i] / sp[i];
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;);
+  }
+}
+
+/* r_ = fmod(t, src) elementwise: C-style remainder, sign follows dividend. */
+void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+        rp[i] = fmod(tp[i], sp[i]);
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = fmod(*t_data, *src_data););
+  }
+}
+
+/* r_ = t mod src elementwise with floored division (sign follows divisor);
+   a zero divisor yields NaN for that element. */
+void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) {
+      real *tp = THTensor_(data)(t);
+      real *sp = THTensor_(data)(src);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+          rp[i] = (sp[i] == 0)? NAN : tp[i] - sp[i] * floor(tp[i] / sp[i]);
+  } else {
+      TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = (*src_data == 0)? NAN : *t_data - *src_data * floor(*t_data / *src_data););
+  }
+}
+
+/* r_ = value ^ t (elementwise, scalar base raised to tensor exponents). */
+void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+      real *tp = THTensor_(data)(t);
+      real *rp = THTensor_(data)(r_);
+      long sz = THTensor_(nElement)(t);
+      long i;
+      #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
+      for (i=0; i<sz; i++)
+        rp[i] = pow(value, tp[i]);
+  } else {
+      TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
+  }
+}
+
+/* r_ = t + value * src1 * src2 (elementwise).  When r_ != t, t is copied
+   into r_ first and the product term is then accumulated in place. */
+void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2)
+{
+  if(r_ != t)
+  {
+    THTensor_(resizeAs)(r_, t);
+    THTensor_(copy)(r_, t);
+  }
+
+  TH_TENSOR_APPLY3(real, r_, real, src1, real, src2, *r__data += value * *src1_data * *src2_data;);
+}
+
+
+/* r_ = t + value * src1 / src2 (elementwise), same copy-then-accumulate
+   structure as addcmul. */
+void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2)
+{
+  if(r_ != t)
+  {
+    THTensor_(resizeAs)(r_, t);
+    THTensor_(copy)(r_, t);
+  }
+
+  TH_TENSOR_APPLY3(real, r_, real, src1, real, src2, *r__data += value * *src1_data / *src2_data;);
+}
+
+/* r_ = beta * t + alpha * mat * vec (matrix-vector).  Dispatches to BLAS
+   gemv directly when mat is column-major (stride[0]==1) or row-major
+   (stride[1]==1, handled via the transpose flag); otherwise makes a
+   contiguous copy of mat first. */
+void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec)
+{
+  if( (mat->nDimension != 2) || (vec->nDimension != 1) )
+    THError("matrix and vector expected, got %dD, %dD",
+      mat->nDimension, vec->nDimension);
+
+  if( mat->size[1] != vec->size[0] ) {
+    THDescBuff bm = THTensor_(sizeDesc)(mat);
+    THDescBuff bv = THTensor_(sizeDesc)(vec);
+    THError("size mismatch, %s, %s", bm.str, bv.str);
+  }
+
+  if(t->nDimension != 1)
+    THError("vector expected, got t: %dD", t->nDimension);
+
+  if(t->size[0] != mat->size[0]) {
+    THDescBuff bt = THTensor_(sizeDesc)(t);
+    THDescBuff bm = THTensor_(sizeDesc)(mat);
+    THError("size mismatch, t: %s, mat: %s", bt.str, bm.str);
+  }
+
+  if(r_ != t)
+  {
+    THTensor_(resizeAs)(r_, t);
+    THTensor_(copy)(r_, t);
+  }
+
+  if(mat->stride[0] == 1)
+  {
+    /* column-major: gemv can consume mat as-is */
+    THBlas_(gemv)('n', mat->size[0], mat->size[1],
+                  alpha, THTensor_(data)(mat), mat->stride[1],
+                  THTensor_(data)(vec), vec->stride[0],
+                  beta, THTensor_(data)(r_), r_->stride[0]);
+  }
+  else if(mat->stride[1] == 1)
+  {
+    /* row-major: treat as the transpose of a column-major matrix */
+    THBlas_(gemv)('t',  mat->size[1], mat->size[0],
+                  alpha, THTensor_(data)(mat), mat->stride[0],
+                  THTensor_(data)(vec), vec->stride[0],
+                  beta, THTensor_(data)(r_), r_->stride[0]);
+  }
+  else
+  {
+    /* arbitrary strides: fall back to a contiguous (row-major) copy */
+    THTensor *cmat = THTensor_(newContiguous)(mat);
+
+    THBlas_(gemv)('t',  mat->size[1], mat->size[0],
+                  alpha, THTensor_(data)(cmat), cmat->stride[0],
+                  THTensor_(data)(vec), vec->stride[0],
+                  beta, THTensor_(data)(r_), r_->stride[0]);
+
+    THTensor_(free)(cmat);
+  }
+}
+
+/* Pairwise squared-distance matrix: r_[i][j] = gain * ||m1[i] - m2[j]||^2,
+   where rows of m1 and m2 are flattened to vectors of equal length.  m1 and
+   m2 are copied contiguous and reshaped to 2-D working views; the outer loop
+   over rows of m1 is parallelized with OpenMP. */
+void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain)
+{
+  long N1 = m1->size[0];
+  long N2 = m2->size[0];
+  long dim;
+  real *m1_p;
+  real *m2_p;
+  real *r_p;
+  long i;
+
+  THTensor_(resize2d)(r_, N1, N2);
+
+  /* Work on contiguous copies; the originals are left untouched. */
+  m1 = THTensor_(newContiguous)(m1);
+  m2 = THTensor_(newContiguous)(m2);
+
+  THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1);
+  THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2);
+
+  dim = m1->size[1];
+  THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim");
+
+  m1_p = THTensor_(data)(m1);
+  m2_p = THTensor_(data)(m2);
+  r_p = THTensor_(data)(r_);
+
+#pragma omp parallel for private(i)
+  for (i=0; i<N1; i++) {
+    long j,k;
+    for (j=0; j<N2; j++) {
+      real sum = 0;
+      for (k=0; k<dim; k++) {
+        real term = m1_p[ i*dim + k ] - m2_p[ j*dim + k ];
+        sum += term*term;
+      }
+      r_p[ i*N2 + j ] = gain * sum;
+    }
+  }
+
+  THTensor_(free)(m1);
+  THTensor_(free)(m2);
+}
+
+/* r_ = beta * t + alpha * m1 * m2 (matrix-matrix).  The body picks
+   transpose flags so BLAS gemm (which expects column-major operands) can run
+   directly on the existing strided data wherever possible; operands with
+   incompatible strides are copied, and a non-BLAS-compatible r_ gets a
+   column-major scratch result that is copied back at the end. */
+void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *m1, THTensor *m2)
+{
+  char transpose_r, transpose_m1, transpose_m2;
+  THTensor *r__, *m1_, *m2_;
+
+  if( (m1->nDimension != 2) || (m2->nDimension != 2))
+    THError("matrices expected, got %dD, %dD tensors", m1->nDimension, m2->nDimension);
+
+  if(m1->size[1] != m2->size[0]) {
+    THDescBuff bm1 = THTensor_(sizeDesc)(m1);
+    THDescBuff bm2 = THTensor_(sizeDesc)(m2);
+    THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str);
+  }
+
+  if( t->nDimension != 2 )
+    THError("matrix expected, got %dD tensor for t", t->nDimension);
+
+  if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) {
+    THDescBuff bt  = THTensor_(sizeDesc)(t);
+    THDescBuff bm1 = THTensor_(sizeDesc)(m1);
+    THDescBuff bm2 = THTensor_(sizeDesc)(m2);
+    THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str);
+  }
+
+  if(t != r_)
+  {
+    THTensor_(resizeAs)(r_, t);
+    THTensor_(copy)(r_, t);
+  }
+
+/*  printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */
+
+  /* r_ */
+  if(r_->stride[0] == 1 &&
+     r_->stride[1] != 0)
+  {
+    /* r_ already column-major: use it directly */
+    transpose_r = 'n';
+    r__ = r_;
+  }
+  else if(r_->stride[1] == 1 &&
+          r_->stride[0] != 0)
+  {
+    /* r_ row-major: compute the transposed product (m2^T * m1^T), so
+       swap the operands and mark the result transposed */
+    THTensor *swap = m2;
+    m2 = m1;
+    m1 = swap;
+    transpose_r = 't';
+    r__ = r_;
+  }
+  else
+  {
+    /* arbitrary strides: compute into a column-major scratch copy */
+    transpose_r = 'n';
+
+    THTensor *transp_r_ = THTensor_(newTranspose)(r_, 0, 1);
+    r__ = THTensor_(newClone)(transp_r_);
+    THTensor_(free)(transp_r_);
+    THTensor_(transpose)(r__, NULL, 0, 1);
+  }
+
+  /* m1: pick 'n'/'t' from its layout relative to the result orientation,
+     copying only if neither dimension is unit-stride */
+  if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+     m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+  {
+    transpose_m1 = 'n';
+    m1_ = m1;
+  }
+  else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+          m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+  {
+    transpose_m1 = 't';
+    m1_ = m1;
+  }
+  else
+  {
+    transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
+    m1_ = THTensor_(newContiguous)(m1);
+  }
+
+  /* m2: same layout analysis as m1 */
+  if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+     m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+  {
+    transpose_m2 = 'n';
+    m2_ = m2;
+  }
+  else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+          m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+  {
+    transpose_m2 = 't';
+    m2_ = m2;
+  }
+  else
+  {
+    transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
+    m2_ = THTensor_(newContiguous)(m2);
+  }
+
+  /* do the operation */
+  THBlas_(gemm)(transpose_m1,
+                transpose_m2,
+                r__->size[(transpose_r == 'n' ? 0 : 1)],
+                r__->size[(transpose_r == 'n' ? 1 : 0)],
+                m1_->size[(transpose_r == 'n' ? 1 : 0)],
+                alpha,
+                THTensor_(data)(m1_),
+                (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                THTensor_(data)(m2_),
+                (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+                beta,
+                THTensor_(data)(r__),
+                r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+
+  /* free intermediate variables */
+  if(m1_ != m1)
+    THTensor_(free)(m1_);
+
+  if(m2_ != m2)
+    THTensor_(free)(m2_);
+
+  if(r__ != r_)
+    THTensor_(freeCopyTo)(r__, r_);  /* copy scratch result back into r_ */
+}
+
+/* r_ = beta*t + alpha * (vec1 outer vec2).
+   vec1 and vec2 must be 1D; t must be a vec1->size[0] x vec2->size[0] matrix.
+   If r_ != t, t is first copied into r_.  The rank-1 update is delegated to
+   BLAS ger; which call shape is used depends on which stride of r_ equals 1
+   (column-major vs row-major layout).  If neither stride is 1, the update is
+   applied to a contiguous clone and copied back. */
+void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2)
+{
+  if( (vec1->nDimension != 1) || (vec2->nDimension != 1) )
+    THError("vector and vector expected, got %dD, %dD tensors",
+        vec1->nDimension, vec2->nDimension);
+
+  if(t->nDimension != 2)
+    THError("expected matrix, got %dD tensor for t", t->nDimension);
+
+  if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) {
+    THDescBuff bt  = THTensor_(sizeDesc)(t);
+    THDescBuff bv1 = THTensor_(sizeDesc)(vec1);
+    THDescBuff bv2 = THTensor_(sizeDesc)(vec2);
+    THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str);
+  }
+
+  if(r_ != t)
+  {
+    THTensor_(resizeAs)(r_, t);
+    THTensor_(copy)(r_, t);
+  }
+
+  /* ger only scales by alpha, so apply beta to the accumulator up front */
+  if(beta != 1)
+    THTensor_(mul)(r_, r_, beta);
+
+  if(r_->stride[0] == 1)
+  {
+    /* column-major r_: A += alpha * vec1 * vec2' directly */
+    THBlas_(ger)(vec1->size[0], vec2->size[0],
+                 alpha, THTensor_(data)(vec1), vec1->stride[0],
+                 THTensor_(data)(vec2), vec2->stride[0],
+                 THTensor_(data)(r_), r_->stride[1]);
+  }
+  else if(r_->stride[1] == 1)
+  {
+    /* row-major r_: compute the transposed update by swapping the vectors */
+    THBlas_(ger)(vec2->size[0], vec1->size[0],
+                 alpha, THTensor_(data)(vec2), vec2->stride[0],
+                 THTensor_(data)(vec1), vec1->stride[0],
+                 THTensor_(data)(r_), r_->stride[0]);
+  }
+  else
+  {
+    /* general strides: update a contiguous clone, then copy back into r_ */
+    THTensor *cr = THTensor_(newClone)(r_);
+
+    THBlas_(ger)(vec2->size[0], vec1->size[0],
+                 alpha, THTensor_(data)(vec2), vec2->stride[0],
+                 THTensor_(data)(vec1), vec1->stride[0],
+                 THTensor_(data)(cr), cr->stride[0]);
+
+    THTensor_(freeCopyTo)(cr, r_);
+  }
+}
+
+/* result = beta*t + alpha * sum_b (batch1[b] * batch2[b])
+   batch1: b x n x m, batch2: b x m x p, t/result: n x p.
+   Every per-batch matrix product is accumulated into the single 2D result. */
+void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2)
+{
+  long batch;
+
+  THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor");
+  THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor");
+  THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2,
+             "equal number of batches expected, got %d, %d",
+             THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0));
+  THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2,
+             "wrong matrix size, batch1: %dx%d, batch2: %dx%d",
+             THTensor_(size)(batch1, 1), THTensor_(size)(batch1,2),
+             THTensor_(size)(batch2, 1), THTensor_(size)(batch2,2));
+
+  long dim1 = THTensor_(size)(batch1, 1);
+  long dim2 = THTensor_(size)(batch2, 2);
+  THArgCheck(THTensor_(size)(t, 0) == dim1, 1, "output tensor of incorrect size");
+  THArgCheck(THTensor_(size)(t, 1) == dim2, 1, "output tensor of incorrect size");
+
+  if (t != result) {
+    THTensor_(resizeAs)(result, t);
+    THTensor_(copy)(result, t);
+  }
+
+  /* reusable views onto one batch slice each */
+  THTensor *matrix1 = THTensor_(new)();
+  THTensor *matrix2 = THTensor_(new)();
+
+  for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) {
+    THTensor_(select)(matrix1, batch1, 0, batch);
+    THTensor_(select)(matrix2, batch2, 0, batch);
+
+    /* beta applies only on the first product; later ones add on top */
+    THTensor_(addmm)(result, beta, result, alpha, matrix1, matrix2);
+    beta = 1; // accumulate output once
+  }
+
+  THTensor_(free)(matrix1);
+  THTensor_(free)(matrix2);
+}
+
+/* result[b] = beta*t[b] + alpha * (batch1[b] * batch2[b])  for every batch b.
+   batch1: b x n x m, batch2: b x m x p, t/result: b x n x p. */
+void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2)
+{
+  long batch;
+
+  THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1));
+  THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2));
+  THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2,
+             "equal number of batches expected, got %d, %d",
+             THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0));
+  THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2,
+             "wrong matrix size, batch1: %dx%d, batch2: %dx%d",
+             THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2),
+             THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2));
+
+  long bs = THTensor_(size)(batch1, 0);
+  long dim1 = THTensor_(size)(batch1, 1);
+  long dim2 = THTensor_(size)(batch2, 2);
+  THArgCheck(THTensor_(size)(t, 0) == bs, 1,   "output tensor of incorrect size");
+  THArgCheck(THTensor_(size)(t, 1) == dim1, 1, "output tensor of incorrect size");
+  THArgCheck(THTensor_(size)(t, 2) == dim2, 1, "output tensor of incorrect size");
+
+  if (t != result) {
+    THTensor_(resizeAs)(result, t);
+    THTensor_(copy)(result, t);
+  }
+
+  /* reusable views onto one batch slice of each operand and the output */
+  THTensor *matrix1 = THTensor_(new)();
+  THTensor *matrix2 = THTensor_(new)();
+  THTensor *result_matrix = THTensor_(new)();
+
+  for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) {
+    THTensor_(select)(matrix1, batch1, 0, batch);
+    THTensor_(select)(matrix2, batch2, 0, batch);
+    THTensor_(select)(result_matrix, result, 0, batch);
+
+    THTensor_(addmm)(result_matrix, beta, result_matrix, alpha, matrix1, matrix2);
+  }
+
+  THTensor_(free)(matrix1);
+  THTensor_(free)(matrix2);
+  THTensor_(free)(result_matrix);
+}
+
+/* Total number of elements in t (thin alias over nElement). */
+long THTensor_(numel)(THTensor *t)
+{
+  long count = THTensor_(nElement)(t);
+  return count;
+}
+
+/* Along `dimension`, writes each slice's maximum into values_ and its
+   0-based position into indices_; both outputs are resized to t's shape with
+   that dimension reduced to 1.  The `!(value <= theMax)` test deliberately
+   lets a NaN replace the running max (NaN compares false with everything). */
+void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+  real theMax;
+  real value;
+  long theIndex;
+  long i;
+
+  /* error message reports 1-based dimensions for the Lua-facing API */
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(values_, dim, NULL);
+  THLongTensor_resize(indices_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                       theMax = t_data[0];
+                       theIndex = 0;
+
+                       for(i = 0; i < t_size; i++)
+                       {
+                         value = t_data[i*t_stride];
+                         /* This is not the same as value>theMax in the case of NaNs */
+                         if(!(value <= theMax))
+                         {
+                           theIndex = i;
+                           theMax = value;
+                           /* NOTE(review): th_isnan(value) is a macro defined
+                              elsewhere (note: no trailing ';' here); presumably
+                              it stops the scan once a NaN is found -- confirm
+                              against its definition. */
+                           th_isnan(value)
+                         }
+                       }
+                       *indices__data = theIndex;
+                       *values__data = theMax;);
+}
+
+/* Along `dimension`, writes each slice's minimum into values_ and its
+   0-based position into indices_; mirror image of THTensor_(max) above.
+   The `!(value >= theMin)` test lets a NaN replace the running min. */
+void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+  real theMin;
+  real value;
+  long theIndex;
+  long i;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(values_, dim, NULL);
+  THLongTensor_resize(indices_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                       theMin = t_data[0];
+                       theIndex = 0;
+
+                       for(i = 0; i < t_size; i++)
+                       {
+                         value = t_data[i*t_stride];
+                         /* This is not the same as value<theMin in the case of NaNs */
+                         if(!(value >= theMin))
+                         {
+                           theIndex = i;
+                           theMin = value;
+                           /* NOTE(review): see th_isnan remark in THTensor_(max) */
+                           th_isnan(value)
+                         }
+                       }
+                       *indices__data = theIndex;
+                       *values__data = theMin;);
+}
+
+
+/* Reduces t along `dimension` by summation into r_, which is resized to t's
+   shape with that dimension set to 1.  Accumulates in accreal (the wider
+   accumulator type), then narrows back to real on store. */
+void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal sum = 0;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                         sum += t_data[i*t_stride];
+                       *r__data = (real)sum;);
+}
+
+/* Reduces t along `dimension` by multiplication into r_, which is resized to
+   t's shape with that dimension set to 1 (empty slices yield 1, the
+   multiplicative identity).  Accumulates in accreal. */
+void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal prod = 1;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                         prod *= t_data[i*t_stride];
+                       *r__data = (real)prod;);
+
+}
+
+/* Running (inclusive) sum of t along `dimension`; r_ keeps t's full shape.
+   Each slice accumulates in accreal and stores the partial sums. */
+void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension)
+{
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  THTensor_(resizeAs)(r_, t);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal cumsum = 0;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                       {
+                         cumsum += t_data[i*t_stride];
+                         r__data[i*r__stride] = (real)cumsum;
+                       });
+}
+
+/* Running (inclusive) product of t along `dimension`; r_ keeps t's full
+   shape.  Each slice accumulates in accreal starting from 1. */
+void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension)
+{
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range",
+      dimension+1);
+
+  THTensor_(resizeAs)(r_, t);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal cumprod = 1;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                       {
+                         cumprod *= t_data[i*t_stride];
+                         r__data[i*r__stride] = (real)cumprod;
+                       });
+}
+
+
+/* Elementwise sign of t into r_: 1 for positive, -1 for negative, 0
+   otherwise.  For the unsigned byte type negative values cannot occur, so
+   only 0/1 are produced (and -1 would not be representable anyway). */
+void THTensor_(sign)(THTensor *r_, THTensor *t)
+{
+  THTensor_(resizeAs)(r_, t);
+
+#if defined (TH_REAL_IS_BYTE)
+  TH_TENSOR_APPLY2(real, r_, real, t,
+		   if (*t_data > 0) *r__data = 1;
+		   else *r__data = 0;);
+#else
+  TH_TENSOR_APPLY2(real, r_, real, t,
+		   if (*t_data > 0) *r__data = 1;
+		   else if (*t_data < 0) *r__data = -1;
+		   else *r__data = 0;);
+#endif
+}
+
+
+/* Sum of the main-diagonal entries of a 2D tensor, accumulated in accreal.
+   For a non-square matrix the diagonal length is min(rows, cols). */
+accreal THTensor_(trace)(THTensor *t)
+{
+  accreal diag_sum = 0;
+  real *data;
+  long step, n, k;
+
+  THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix");
+
+  data = THTensor_(data)(t);
+  /* moving one row down and one column right advances by both strides */
+  step = THTensor_(stride)(t, 0) + THTensor_(stride)(t, 1);
+  n = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1));
+
+  for(k = 0; k < n; k++)
+    diag_sum += data[k*step];
+
+  return diag_sum;
+}
+
+/* 3-component cross product of a and b along `dimension`, written to r_.
+   a and b must have identical shapes and size 3 in the chosen dimension.
+   If dimension < 0, the first dimension of size 3 is used. */
+void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension)
+{
+  int i;
+
+  if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b))
+    THError("inconsistent tensor dimension %dD, %dD",
+        THTensor_(nDimension)(a), THTensor_(nDimension)(b));
+
+  for(i = 0; i < THTensor_(nDimension)(a); i++)
+  {
+    if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) {
+        THDescBuff ba = THTensor_(sizeDesc)(a);
+        THDescBuff bb = THTensor_(sizeDesc)(b);
+        THError("inconsistent tensor sizes %s, %s", ba.str, bb.str);
+    }
+  }
+
+  /* auto-select the first size-3 dimension when none was given */
+  if(dimension < 0)
+  {
+    for(i = 0; i < THTensor_(nDimension)(a); i++)
+    {
+      if(THTensor_(size)(a, i) == 3)
+      {
+        dimension = i;
+        break;
+      }
+    }
+    if(dimension < 0) {
+      THDescBuff ba = THTensor_(sizeDesc)(a);
+      THError("no dimension of size 3 in a: %s", ba.str);
+    }
+  }
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(a), 3, "dimension %d out of range",
+      dimension+1);
+  THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3",
+      dimension+1);
+
+  THTensor_(resizeAs)(r_, a);
+
+  /* standard component formulas: r = a x b */
+  TH_TENSOR_DIM_APPLY3(real, a, real, b, real, r_, dimension,
+                       r__data[0*r__stride] = a_data[1*a_stride]*b_data[2*b_stride] - a_data[2*a_stride]*b_data[1*b_stride];
+                       r__data[1*r__stride] = a_data[2*a_stride]*b_data[0*b_stride] - a_data[0*a_stride]*b_data[2*b_stride];
+                       r__data[2*r__stride] = a_data[0*a_stride]*b_data[1*b_stride] - a_data[1*a_stride]*b_data[0*b_stride];);
+}
+
+/* Elementwise maximum of t and src into r (r is resized to t's shape). */
+void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src) {
+  THTensor_(resizeAs)(r, t);
+  TH_TENSOR_APPLY3(real, r, real, t, real, src,
+                   *r_data = *t_data > *src_data ? *t_data : *src_data;);
+}
+
+/* Elementwise minimum of t and src into r (r is resized to t's shape). */
+void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) {
+  THTensor_(resizeAs)(r, t);
+  TH_TENSOR_APPLY3(real, r, real, t, real, src,
+                   *r_data = *t_data < *src_data ? *t_data : *src_data;);
+}
+
+/* Elementwise maximum of t and a scalar `value` into r (clamp from below). */
+void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value) {
+  THTensor_(resizeAs)(r, t);
+  TH_TENSOR_APPLY2(real, r, real, t,
+                   *r_data = *t_data > value ? *t_data : value;);
+}
+
+/* Elementwise minimum of t and a scalar `value` into r (clamp from above). */
+void THTensor_(cminValue)(THTensor *r, THTensor *t, real value) {
+  THTensor_(resizeAs)(r, t);
+  TH_TENSOR_APPLY2(real, r, real, t,
+                   *r_data = *t_data < value ? *t_data : value;);
+}
+
+/* Resizes r_ to `size` and fills it with zeros. */
+void THTensor_(zeros)(THTensor *r_, THLongStorage *size)
+{
+  THTensor_(resize)(r_, size, NULL);
+  THTensor_(zero)(r_);
+}
+
+/* Resizes r_ to `size` and fills it with ones. */
+void THTensor_(ones)(THTensor *r_, THLongStorage *size)
+{
+  THTensor_(resize)(r_, size, NULL);
+  THTensor_(fill)(r_, 1);
+}
+
+/* Diagonal construction/extraction, selected by t's dimensionality:
+   - 1D t: r_ becomes a square zero matrix with t placed on the k-th
+     diagonal (k > 0 above the main diagonal, k < 0 below).
+   - 2D t: r_ becomes a vector holding t's k-th diagonal. */
+void THTensor_(diag)(THTensor *r_, THTensor *t, int k)
+{
+  THArgCheck(THTensor_(nDimension)(t) == 1 || THTensor_(nDimension)(t) == 2, 1, "matrix or a vector expected");
+
+  if(THTensor_(nDimension)(t) == 1)
+  {
+    real *t_data = THTensor_(data)(t);
+    long t_stride_0 = THTensor_(stride)(t, 0);
+    long t_size = THTensor_(size)(t, 0);
+    /* the matrix must grow by |k| to fit the shifted diagonal */
+    long sz = t_size + (k >= 0 ? k : -k);
+    real *r__data;
+    long r__stride_0;
+    long r__stride_1;
+    long i;
+
+    THTensor_(resize2d)(r_, sz, sz);
+    THTensor_(zero)(r_);
+    r__data = THTensor_(data)(r_);
+    r__stride_0 = THTensor_(stride)(r_, 0);
+    r__stride_1 = THTensor_(stride)(r_, 1);
+    /* offset to (0,k) for k>=0, or (-k,0) for k<0 */
+    r__data += (k >= 0 ? k*r__stride_1 : -k*r__stride_0);
+
+    for(i = 0; i < t_size; i++)
+      r__data[i*(r__stride_0+r__stride_1)] = t_data[i*t_stride_0];
+  }
+  else
+  {
+    real *t_data = THTensor_(data)(t);
+    long t_stride_0 = THTensor_(stride)(t, 0);
+    long t_stride_1 = THTensor_(stride)(t, 1);
+    long sz;
+    real *r__data;
+    long r__stride_0;
+    long i;
+
+    /* length of the k-th diagonal of a (possibly non-square) matrix */
+    if(k >= 0)
+      sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k);
+    else
+      sz = THMin(THTensor_(size)(t, 0)+k, THTensor_(size)(t, 1));
+    THTensor_(resize1d)(r_, sz);
+    r__data = THTensor_(data)(r_);
+    r__stride_0 = THTensor_(stride)(r_, 0);
+
+    t_data += (k >= 0 ? k*t_stride_1 : -k*t_stride_0);
+    for(i = 0; i < sz; i++)
+      r__data[i*r__stride_0] = t_data[i*(t_stride_0+t_stride_1)];
+  }
+}
+
+/* Fills r_ (resized to n x m) with the identity pattern: ones on the main
+   diagonal, zeros elsewhere.  If m <= 0 it defaults to n (square matrix). */
+void THTensor_(eye)(THTensor *r_, long n, long m)
+{
+  real *r__data;
+  long i, sz;
+
+  THArgCheck(n > 0, 1, "invalid argument");
+
+  if(m <= 0)
+    m = n;
+
+  THTensor_(resize2d)(r_, n, m);
+  THTensor_(zero)(r_);
+
+  r__data = THTensor_(data)(r_);
+  /* walk the main diagonal: one row plus one column step per element */
+  sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1));
+  for(i = 0; i < sz; i++)
+    r__data[i*(r_->stride[0]+r_->stride[1])] = 1;
+}
+
+
+/* Fills r_ (resized to 1D) with the arithmetic sequence
+   xmin, xmin+step, ..., including xmax when it is reachable exactly.
+   step may be negative for a descending range, but must be non-zero and
+   must point from xmin towards xmax. */
+void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step)
+{
+  long size;
+  real i = 0;
+
+  THArgCheck(step > 0 || step < 0, 3, "step must be a non-null number");
+  THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin))
+              , 2, "upper and lower bounds incoherent with step sign");
+
+  size = (long) (((xmax - xmin) / step) + 1);
+
+  if (THTensor_(nElement)(r_) != size) {
+    THTensor_(resize1d)(r_, size);
+  }
+
+  /* NOTE(review): the counter `i` has type real, so very long ranges on
+     float tensors may lose precision once i exceeds the mantissa range. */
+  TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;);
+}
+
+/* Fills r_ (resized to length n) with a random permutation of 0..n-1,
+   stored as real, using a Fisher-Yates shuffle driven by _generator.
+   NOTE(review): `THRandom_random(_generator) % (n-i)` carries a slight
+   modulo bias when (n-i) does not divide the generator's range -- presumably
+   acceptable for this use; confirm if uniformity matters. */
+void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n)
+{
+  real *r__data;
+  long r__stride_0;
+  long i;
+
+  THArgCheck(n > 0, 1, "must be strictly positive");
+
+  THTensor_(resize1d)(r_, n);
+  r__data = THTensor_(data)(r_);
+  r__stride_0 = THTensor_(stride)(r_,0);
+
+  /* identity permutation ... */
+  for(i = 0; i < n; i++)
+    r__data[i*r__stride_0] = (real)(i);
+
+  /* ... then swap each slot with a random later (or same) slot */
+  for(i = 0; i < n-1; i++)
+  {
+    long z = THRandom_random(_generator) % (n-i);
+    real sav = r__data[i*r__stride_0];
+    r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0];
+    r__data[(z+i)*r__stride_0] = sav;
+  }
+}
+
+/* Resizes r_ to `size` and copies t's elements into it in iteration order.
+   Element-count agreement between t and size is presumably enforced by the
+   copy itself -- confirm against THTensor_(copy). */
+void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size)
+{
+  THTensor_(resize)(r_, size, NULL);
+  THTensor_(copy)(r_, t);
+}
+
+/* I cut and pasted (slightly adapted) the quicksort code from
+   Sedgewick's 1978 "Implementing Quicksort Programs" article
+   http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf
+
+   It is the state of the art existing implementation. The macros
+   are here to make as close a match as possible to the pseudocode of
+   Program 2 p.851
+
+   Note that other partition schemes exist, and are typically presented
+   in textbooks, but those are less efficient. See e.g.
+   http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto
+
+   Julien, November 12th 2013
+*/
+#define MAX_LEVELS  300
+#define M_SMALL 10 /* Limit for small subfiles */
+
+/* Strided element access for the sort/select helpers below: element III of
+   the value array (arr) and of its companion index array (idx). */
+#define ARR(III) arr[(III)*stride]
+#define IDX(III) idx[(III)*stride]
+
+/* Swap helpers; a `long swap` and a `real rswap` temporary must be in scope
+   at the expansion site. */
+#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap
+#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap
+
+/* Swap both a value and its companion index in lockstep. */
+#define BOTH_SWAP(III, JJJ) \
+  REAL_SWAP(ARR(III), ARR(JJJ)); \
+  LONG_SWAP(IDX(III), IDX(JJJ))
+
+/* In-place ascending quicksort of `elements` strided values in arr,
+   permuting idx in lockstep.  Explicit-stack Sedgewick variant:
+   median-of-three pivot, larger subfile pushed so stack depth stays
+   O(log n), subfiles of size <= M_SMALL left for the final
+   insertion-sort pass over the whole array. */
+static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long stride)
+{
+  long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
+  real rswap, piv;
+  unsigned char done = 0;
+
+  /* beg[0]=0; end[0]=elements; */
+  stack = 0;
+  L = 0; R = elements-1;
+  done = elements-1 <= M_SMALL;
+
+  while(!done) {
+      /* Use median of three for pivot choice */
+      P=(L+R)>>1;
+      BOTH_SWAP(P, L+1);
+      if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
+      if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
+      if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+
+      i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+
+      /* Hoare-style partition around piv */
+      do {
+          do { i = i+1; } while(ARR(i) < piv);
+          do { j = j-1; } while(ARR(j) > piv);
+          if (j < i)
+              break;
+          BOTH_SWAP(i, j);
+      } while(1);
+      BOTH_SWAP(L, j);
+      /* Left subfile is (L, j-1) */
+      /* Right subfile is (i, R) */
+      sz_left = j-L;
+      sz_right = R-i+1;
+      if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+          /* both subfiles are small */
+          /* if stack empty */
+          if (stack == 0) {
+              done = 1;
+          } else {
+              stack--;
+              L = beg[stack];
+              R = end[stack];
+          }
+      } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+              /* exactly one of the subfiles is small */
+              /* (L,R) = large subfile */
+              if (sz_left > sz_right) {
+                  /* Implicit: L = L; */
+                  R = j-1;
+              } else {
+                  L = i;
+                  /* Implicit: R = R; */
+              }
+      } else {
+          /* none of the subfiles is small */
+          /* push large subfile */
+          /* (L,R) = small subfile */
+          if (sz_left > sz_right) {
+              beg[stack] = L;
+              end[stack] = j-1;
+              stack++;
+              L = i;
+              /* Implicit: R = R */
+          } else {
+              beg[stack] = i;
+              end[stack] = R;
+              stack++;
+              /* Implicit: L = L; */
+              R = j-1;
+          }
+      }
+  } /* while not done */
+  /* Now insertion sort on the concatenation of subfiles */
+  for(i=elements-2; i>=0; i--) {
+    if (ARR(i) > ARR(i+1)) {
+          piv = ARR(i);
+      pid = IDX(i);
+      j = i+1;
+      do {
+          ARR(j-1) = ARR(j);
+          IDX(j-1) = IDX(j);
+          j = j+1;
+      } while(j < elements && ARR(j) < piv);
+      ARR(j-1) = piv;
+      IDX(j-1) = pid;
+     }
+  }
+}
+
+/* Mirror image of quicksortascend: in-place DESCENDING quicksort of
+   `elements` strided values in arr, permuting idx in lockstep.  All
+   comparisons are reversed relative to the ascending variant; structure
+   (explicit stack, median-of-three, insertion-sort finish) is identical. */
+static void THTensor_(quicksortdescend)(real *arr, long *idx, long elements, long stride)
+{
+  long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left;
+  real rswap, piv;
+  unsigned char done = 0;
+
+  /* beg[0]=0; end[0]=elements; */
+  stack = 0;
+  L = 0; R = elements-1;
+  done = elements-1 <= M_SMALL;
+
+  while(!done) {
+      /* Use median of three for pivot choice */
+      P=(L+R)>>1;
+      BOTH_SWAP(P, L+1);
+      if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); }
+      if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); }
+      if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); }
+
+      i = L+1; j = R; piv = ARR(L); pid = IDX(L);
+
+      do {
+          do { i = i+1; } while(ARR(i) > piv);
+          do { j = j-1; } while(ARR(j) < piv);
+          if (j < i)
+              break;
+          BOTH_SWAP(i, j);
+      } while(1);
+      BOTH_SWAP(L, j);
+      /* Left subfile is (L, j-1) */
+      /* Right subfile is (i, R) */
+      sz_left = j-L;
+      sz_right = R-i+1;
+      if (sz_left <= M_SMALL && sz_right <= M_SMALL) {
+          /* both subfiles are small */
+          /* if stack empty */
+          if (stack == 0) {
+              done = 1;
+          } else {
+              stack--;
+              L = beg[stack];
+              R = end[stack];
+          }
+      } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) {
+              /* exactly one of the subfiles is small */
+              /* (L,R) = large subfile */
+              if (sz_left > sz_right) {
+                  /* Implicit: L = L; */
+                  R = j-1;
+              } else {
+                  L = i;
+                  /* Implicit: R = R; */
+              }
+      } else {
+          /* none of the subfiles is small */
+          /* push large subfile */
+          /* (L,R) = small subfile */
+          if (sz_left > sz_right) {
+              beg[stack] = L;
+              end[stack] = j-1;
+              stack++;
+              L = i;
+              /* Implicit: R = R */
+          } else {
+              beg[stack] = i;
+              end[stack] = R;
+              stack++;
+              /* Implicit: L = L; */
+              R = j-1;
+          }
+      }
+  } /* while not done */
+  /* Now insertion sort on the concatenation of subfiles */
+  for(i=elements-2; i>=0; i--) {
+    if (ARR(i) < ARR(i+1)) {
+          piv = ARR(i);
+      pid = IDX(i);
+      j = i+1;
+      do {
+          ARR(j-1) = ARR(j);
+          IDX(j-1) = IDX(j);
+          j = j+1;
+      } while(j < elements && ARR(j) > piv);
+      ARR(j-1) = piv;
+      IDX(j-1) = pid;
+     }
+  }
+}
+
+#undef MAX_LEVELS
+#undef M_SMALL
+
+/* Sorts t along `dimension` into rt_ (rt_ starts as a copy of t and is
+   sorted in place), with the originating 0-based positions written to ri_.
+   descendingOrder selects the comparison direction.  The underlying
+   quicksort is not stable, so equal elements may permute their indices. */
+void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder)
+{
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
+      dimension+1);
+
+  THTensor_(resizeAs)(rt_, t);
+  THTensor_(copy)(rt_, t);
+
+  {
+    THLongStorage *size = THTensor_(newSizeOf)(t);
+    THLongTensor_resize(ri_, size, NULL);
+    THLongStorage_free(size);
+  }
+
+  if(descendingOrder)
+  {
+    /* seed each index slice with the identity, then co-sort it */
+    TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension,
+                         long i;
+                         for(i = 0; i < ri__size; i++)
+                           ri__data[i*ri__stride] = i;
+                         THTensor_(quicksortdescend)(rt__data, ri__data, rt__size, rt__stride);)
+      }
+  else
+  {
+    TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension,
+                         long i;
+                         for(i = 0; i < ri__size; i++)
+                           ri__data[i*ri__stride] = i;
+                         THTensor_(quicksortascend)(rt__data, ri__data, rt__size, rt__stride);)
+      }
+}
+
+/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's
+public domain implementation at http://ndevilla.free.fr/median/median/
+Adapted similarly to the above Quicksort algorithm. */
+/* Partial sort (quickselect): after the call, ARR(k) holds the element of
+   0-based rank k, everything before position k is <= it and everything
+   after is >= it.  idx is permuted in lockstep with arr. */
+static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements, long stride)
+{
+  long P, L, R, i, j, swap, pid;
+  real rswap, piv;
+  L = 0;
+  R = elements-1;
+
+  do {
+    if (R <= L) /* One element only */
+      return;
+
+    if (R == L+1) {  /* Two elements only */
+      if (ARR(L) > ARR(R)) {
+        BOTH_SWAP(L, R);
+      }
+      return;
+    }
+
+    /* Use median of three for pivot choice */
+    P=(L+R)>>1;
+    BOTH_SWAP(P, L+1);
+    if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); }
+    if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); }
+    if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); }
+
+    i = L+1;
+    j = R;
+    piv = ARR(L);
+    pid = IDX(L);
+    /* Hoare partition; pivot lands at j */
+    do {
+      do i++; while(ARR(i) < piv);
+      do j--; while(ARR(j) > piv);
+      if (j < i)
+        break;
+      BOTH_SWAP(i, j);
+    } while(1);
+    BOTH_SWAP(L, j);
+
+    /* Re-set active partition: recurse only into the side containing k */
+    if (j <= k) L=i;
+    if (j >= k) R=j-1;
+  } while(1);
+}
+
+#undef ARR
+#undef IDX
+#undef LONG_SWAP
+#undef REAL_SWAP
+#undef BOTH_SWAP
+
+/* Along `dimension`, writes each slice's most frequent value into values_
+   and an original 0-based position of one of its occurrences into indices_;
+   outputs are resized with that dimension reduced to 1.  Each slice is
+   copied into scratch buffers, sorted, and scanned for the longest run of
+   equal values (ties keep the first/smallest value, since only a strictly
+   greater count replaces the current mode). */
+void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+  THTensor *temp_;
+  THLongTensor *tempi_;
+  real *temp__data;
+  long *tempi__data;
+  long t_size_dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range");
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(values_, dim, NULL);
+  THLongTensor_resize(indices_, dim, NULL);
+  THLongStorage_free(dim);
+
+  t_size_dim = THTensor_(size)(t, dimension);
+
+  /* scratch value/index buffers, reused for every slice */
+  temp_ = THTensor_(new)();
+  THTensor_(resize1d)(temp_, t_size_dim);
+  temp__data = THTensor_(data)(temp_);
+
+  tempi_ = THLongTensor_new();
+  THLongTensor_resize1d(tempi_, t_size_dim);
+  tempi__data = THLongTensor_data(tempi_);
+
+  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                       long i;
+                       long mode = 0;
+                       long modei = 0;
+                       long temp_freq = 0;
+                       long max_freq = 0;
+                       for(i = 0; i < t_size_dim; i++)
+                          temp__data[i] = t_data[i*t_stride];
+                       for(i = 0; i < t_size_dim; i++)
+                          tempi__data[i] = i;
+                       THTensor_(quicksortascend)(temp__data, tempi__data, t_size_dim, 1);
+
+                       /* count run lengths of equal values in sorted order */
+                       for(i = 0; i < t_size_dim; i++)
+                       {
+                          temp_freq++;
+                          if ((i == t_size_dim - 1) || (temp__data[i] != temp__data[i+1]))
+                          {
+                              if (temp_freq > max_freq)
+                              {
+                                 mode = temp__data[i];
+                                 modei = tempi__data[i];
+                                 max_freq = temp_freq;
+                              }
+                              temp_freq = 0;
+                          }
+                       }
+                       *values__data = mode;
+                       *indices__data = modei;);
+
+  THTensor_(free)(temp_);
+  THLongTensor_free(tempi_);
+}
+
+/* Along `dimension`, writes each slice's k-th smallest value (k is 0-based
+   here) into values_ and its original 0-based position into indices_;
+   outputs are resized with that dimension reduced to 1.  Each slice is
+   copied into scratch buffers and quickselect places rank k in position k. */
+void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension)
+{
+  THLongStorage *dim;
+  THTensor *temp_;
+  THLongTensor *tempi_;
+  real *temp__data;
+  long *tempi__data;
+  long t_size_dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range");
+  THArgCheck(k >= 0 && k < t->size[dimension], 2, "selected index out of range");
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(values_, dim, NULL);
+  THLongTensor_resize(indices_, dim, NULL);
+  THLongStorage_free(dim);
+
+  t_size_dim = THTensor_(size)(t, dimension);
+
+  /* scratch value/index buffers, reused for every slice */
+  temp_ = THTensor_(new)();
+  THTensor_(resize1d)(temp_, t_size_dim);
+  temp__data = THTensor_(data)(temp_);
+
+  tempi_ = THLongTensor_new();
+  THLongTensor_resize1d(tempi_, t_size_dim);
+  tempi__data = THLongTensor_data(tempi_);
+
+  TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension,
+                       long i;
+                       for(i = 0; i < t_size_dim; i++)
+                          temp__data[i] = t_data[i*t_stride];
+                       for(i = 0; i < t_size_dim; i++)
+                          tempi__data[i] = i;
+                       THTensor_(quickselect)(temp__data, tempi__data, k, t_size_dim, 1);
+                       *values__data = temp__data[k];
+                       *indices__data = tempi__data[k];);
+
+  THTensor_(free)(temp_);
+  THLongTensor_free(tempi_);
+}
+
+/* Median of t along `dimension`, delegated to kthvalue.  For even-sized
+   dimensions this returns the LOWER of the two middle elements (0-based
+   rank (size-1)/2 after sorting). */
+void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension)
+{
+  long mid;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range");
+
+  mid = (THTensor_(size)(t, dimension) - 1) >> 1;
+  THTensor_(kthvalue)(values_, indices_, t, mid, dimension);
+}
+
+/* Along `dim`, writes the k largest (dir != 0) or k smallest (dir == 0)
+   elements of each slice into rt_, with original 0-based positions in ri_;
+   both outputs get size k in that dimension.  If `sorted` is set, results
+   are emitted in descending (dir) or ascending (!dir) order; otherwise
+   their order within the top-k set is unspecified. */
+void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted)
+{
+  int numDims = THTensor_(nDimension)(t);
+  THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range");
+
+  long sliceSize = THTensor_(size)(t, dim);
+  THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension");
+
+  /* scratch value/index buffers, reused for every slice */
+  THTensor *tmpResults = THTensor_(new)();
+  THTensor_(resize1d)(tmpResults, sliceSize);
+  real *tmp__data = THTensor_(data)(tmpResults);
+
+  THLongTensor *tmpIndices = THLongTensor_new();
+  THLongTensor_resize1d(tmpIndices, sliceSize);
+  long *tmpi__data = THLongTensor_data(tmpIndices);
+
+  THLongStorage *topKSize = THTensor_(newSizeOf)(t);
+  THLongStorage_set(topKSize, dim, k);
+  THTensor_(resize)(rt_, topKSize, NULL);
+  THLongTensor_resize(ri_, topKSize, NULL);
+  THLongStorage_free(topKSize);
+
+  if (dir) {
+    /* k largest elements, descending order (optional: see sorted) */
+    /* selecting rank K-1 leaves the k largest in positions K..sliceSize-1 */
+    long K = sliceSize - k;
+    TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim,
+                         long i;
+                         for(i = 0; i < sliceSize; i++)
+                         {
+                           tmp__data[i] = t_data[i*t_stride];
+                           tmpi__data[i] = i;
+                         }
+                         if (K > 0)
+                           THTensor_(quickselect)(tmp__data, tmpi__data, K - 1, sliceSize, 1);
+                         if (sorted)
+                           THTensor_(quicksortdescend)(tmp__data + K, tmpi__data + K, k, 1);
+                         for(i = 0; i < k; i++)
+                         {
+                           rt__data[i*rt__stride] = tmp__data[i + K];
+                           ri__data[i*ri__stride] = tmpi__data[i + K];
+                         })
+  }
+  else {
+    /* k smallest elements, ascending order (optional: see sorted) */
+    /* quickselect on rank k-1 already fixes position k-1 as the largest of
+        the first k, so only the first k-1 entries still need sorting */
+    TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim,
+                         long i;
+                         for(i = 0; i < sliceSize; i++)
+                         {
+                           tmp__data[i] = t_data[i*t_stride];
+                           tmpi__data[i] = i;
+                         }
+                         THTensor_(quickselect)(tmp__data, tmpi__data, k - 1, sliceSize, 1);
+                         if (sorted)
+                           THTensor_(quicksortascend)(tmp__data, tmpi__data, k - 1, 1);
+                         for(i = 0; i < k; i++)
+                         {
+                           rt__data[i*rt__stride] = tmp__data[i];
+                           ri__data[i*ri__stride] = tmpi__data[i];
+                         })
+  }
+
+  THTensor_(free)(tmpResults);
+  THLongTensor_free(tmpIndices);
+}
+
+/* tril: write the lower-triangular part of matrix t into r_.
+ * Elements on and below the k-th diagonal are kept (k > 0 reaches
+ * above the main diagonal, k < 0 stays below it); all other elements
+ * are zeroed.  r_ is resized to match t. */
+void THTensor_(tril)(THTensor *r_, THTensor *t, long k)
+{
+  long t_size_0, t_size_1;
+  long t_stride_0, t_stride_1;
+  long r__stride_0, r__stride_1;
+  real *t_data, *r__data;
+  long r, c;
+
+  THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix");
+
+  THTensor_(resizeAs)(r_, t);
+
+  t_size_0 = THTensor_(size)(t, 0);
+  t_size_1 = THTensor_(size)(t, 1);
+  t_stride_0 = THTensor_(stride)(t, 0);
+  t_stride_1 = THTensor_(stride)(t, 1);
+  r__stride_0 = THTensor_(stride)(r_, 0);
+  r__stride_1 = THTensor_(stride)(r_, 1);
+  r__data = THTensor_(data)(r_);
+  t_data = THTensor_(data)(t);
+
+  for(r = 0; r < t_size_0; r++)
+  {
+    long sz = THMin(r+k+1, t_size_1);
+    /* Zero strictly above the kept band.  Starting at r+k+1 (instead of
+       r+k, which is inside the kept range and would be zeroed and then
+       re-copied) keeps the result correct even if r_ aliases t. */
+    for(c = THMax(0, r+k+1); c < t_size_1; c++)
+      r__data[r*r__stride_0+c*r__stride_1] = 0;
+    /* copy the kept prefix [0, sz) of this row */
+    for(c = 0; c < sz; c++)
+      r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
+  }
+}
+
+/* triu: write the upper-triangular part of matrix t into r_.
+ * Elements on and above the k-th diagonal are kept (k > 0 moves the
+ * band above the main diagonal, k < 0 below); the rest are zeroed.
+ * r_ is resized to match t.  The copy range [r+k, t_size_1) and the
+ * zero range [0, min(r+k, t_size_1)) do not overlap. */
+void THTensor_(triu)(THTensor *r_, THTensor *t, long k)
+{
+  long t_size_0, t_size_1;
+  long t_stride_0, t_stride_1;
+  long r__stride_0, r__stride_1;
+  real *t_data, *r__data;
+  long r, c;
+
+  THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix");
+
+  THTensor_(resizeAs)(r_, t);
+
+  t_size_0 = THTensor_(size)(t, 0);
+  t_size_1 = THTensor_(size)(t, 1);
+  t_stride_0 = THTensor_(stride)(t, 0);
+  t_stride_1 = THTensor_(stride)(t, 1);
+  r__stride_0 = THTensor_(stride)(r_, 0);
+  r__stride_1 = THTensor_(stride)(r_, 1);
+  r__data = THTensor_(data)(r_);
+  t_data = THTensor_(data)(t);
+
+  for(r = 0; r < t_size_0; r++)
+  {
+    long sz = THMin(r+k, t_size_1);
+    for(c = THMax(0, r+k); c < t_size_1; c++)
+      r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1];
+    for(c = 0; c < sz; c++)
+      r__data[r*r__stride_0+c*r__stride_1] = 0;
+  }
+}
+
+/* cat: concatenate ta and tb along the given dimension into r_;
+ * thin wrapper over THTensor_(catArray). */
+void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension)
+{
+  THTensor *pair[2];
+  pair[0] = ta;
+  pair[1] = tb;
+  THTensor_(catArray)(r_, pair, 2, dimension);
+}
+
+/* catArray: concatenate numInputs tensors along dimension into result.
+ * All inputs must agree on every dimension except the cat dimension;
+ * an input with fewer dimensions than the result contributes size 1
+ * for each dimension it lacks. */
+void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension)
+{
+  THLongStorage *size;
+  int i, j;
+  long offset;
+  int ndim;
+
+  /* validate arguments before iterating over inputs[] */
+  THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
+  THArgCheck(dimension >= 0, 4, "invalid dimension %d", dimension+1);
+
+  /* result rank: at least dimension+1, and at least each input's rank */
+  ndim = dimension + 1;
+  for (i = 0; i < numInputs; i++)
+  {
+    ndim = THMax(ndim, inputs[i]->nDimension);
+  }
+
+  size = THLongStorage_newWithSize(ndim);
+  for(i = 0; i < ndim; i++)
+  {
+    long dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : 1;
+    if (i == dimension)
+    {
+      /* cat dimension: sizes add up */
+      for (j = 1; j < numInputs; j++)
+      {
+        dimSize += i < inputs[j]->nDimension ? inputs[j]->size[i] : 1;
+      }
+    }
+    else
+    {
+      /* every other dimension must match across all inputs */
+      for (j = 1; j < numInputs; j++)
+      {
+        if (dimSize != (i < inputs[j]->nDimension ? inputs[j]->size[i] : 1))
+        {
+          THLongStorage_free(size);
+          THError("inconsistent tensor sizes");
+        }
+      }
+    }
+    size->data[i] = dimSize;
+  }
+
+  THTensor_(resize)(result, size, NULL);
+  THLongStorage_free(size);
+
+  /* copy each input into the matching narrowed view of result */
+  offset = 0;
+  for (j = 0; j < numInputs; j++)
+  {
+    long dimSize = dimension < inputs[j]->nDimension ? inputs[j]->size[dimension] : 1;
+    THTensor *nt = THTensor_(newWithTensor)(result);
+    THTensor_(narrow)(nt, NULL, dimension, offset, dimSize);
+    THTensor_(copy)(nt, inputs[j]);
+    THTensor_(free)(nt);
+    offset += dimSize;
+  }
+}
+
+/* equal: return 1 if ta and tb have identical sizes and identical
+ * elements, 0 otherwise.  When both tensors are contiguous a flat
+ * element scan with early exit is used; otherwise TH_TENSOR_APPLY2
+ * is short-circuited via the macro's internal hasFinished flag. */
+int THTensor_(equal)(THTensor *ta, THTensor* tb)
+{
+  int equal = 1;
+  if(!THTensor_(isSameSizeAs)(ta, tb))
+    return 0;
+
+  if (THTensor_(isContiguous)(ta) && THTensor_(isContiguous)(tb)) {
+    real *tap = THTensor_(data)(ta);
+    real *tbp = THTensor_(data)(tb);
+    long sz = THTensor_(nElement)(ta);
+    long i;
+    for (i=0; i<sz; ++i){
+      if(tap[i] != tbp[i]) return 0;
+    }
+  } else {
+    // Short-circuit the apply function on inequality
+    TH_TENSOR_APPLY2(real, ta, real, tb,
+                     if (equal && *ta_data != *tb_data) {
+                        equal = 0;
+                        TH_TENSOR_APPLY_hasFinished = 1; break;
+                     })
+  }
+  return equal;
+}
+
+/* Element-wise comparison generators.  For each NAME/OP pair this
+ * defines four functions writing 1 where the comparison holds and 0
+ * elsewhere:
+ *   NAME##Value   - tensor OP scalar, result in a ByteTensor mask
+ *   NAME##ValueT  - tensor OP scalar, result in a tensor of real
+ *   NAME##Tensor  - tensor OP tensor, result in a ByteTensor mask
+ *   NAME##TensorT - tensor OP tensor, result in a tensor of real
+ * The result tensor is resized to the (first) operand's shape and
+ * zeroed before the comparison pass. */
+#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP)				\
+  void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value)	\
+  {									\
+    THByteTensor_rawResize(r_, t->nDimension, t->size, NULL);		\
+    THByteTensor_zero(r_);						\
+    TH_TENSOR_APPLY2(unsigned char, r_, real, t,			\
+		     if (*t_data OP value) *r__data = 1;);		\
+  }									\
+  void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value)	\
+  {									\
+    THTensor_(rawResize)(r_, t->nDimension, t->size, NULL);		\
+    THTensor_(zero)(r_);						\
+    TH_TENSOR_APPLY2(real, r_, real, t,					\
+		     if (*t_data OP value) *r__data = 1;);		\
+  }									\
+  void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \
+  {									\
+    THByteTensor_rawResize(r_, ta->nDimension, ta->size, NULL);		\
+    THByteTensor_zero(r_);						\
+    TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb,		\
+		     if(*ta_data OP *tb_data) *r__data = 1;);		\
+  }									\
+  void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \
+  {									\
+    THTensor_(rawResize)(r_, ta->nDimension, ta->size, NULL);		\
+    THTensor_(zero)(r_);						\
+    TH_TENSOR_APPLY3(real, r_, real, ta, real, tb,			\
+		     if(*ta_data OP *tb_data) *r__data = 1;);		\
+  }									\
+
+
+/* instantiate lt/gt/le/ge/eq/ne for the current real type */
+TENSOR_IMPLEMENT_LOGICAL(lt,<)
+TENSOR_IMPLEMENT_LOGICAL(gt,>)
+TENSOR_IMPLEMENT_LOGICAL(le,<=)
+TENSOR_IMPLEMENT_LOGICAL(ge,>=)
+TENSOR_IMPLEMENT_LOGICAL(eq,==)
+TENSOR_IMPLEMENT_LOGICAL(ne,!=)
+
+/* LAB_IMPLEMENT_BASIC_FUNCTION(NAME, CFUNC): define THTensor_(NAME) as
+ * the element-wise map r_[i] = CFUNC(t[i]), resizing r_ to t's shape. */
+#define LAB_IMPLEMENT_BASIC_FUNCTION(NAME, CFUNC)             \
+  void THTensor_(NAME)(THTensor *r_, THTensor *t)                \
+  {                                                           \
+    THTensor_(resizeAs)(r_, t);                               \
+    TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \
+  }                                                           \
+
+/* _VALUE variant: CFUNC additionally receives a scalar argument,
+ * r_[i] = CFUNC(t[i], value). */
+#define LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(NAME, CFUNC)                 \
+  void THTensor_(NAME)(THTensor *r_, THTensor *t, real value)              \
+  {                                                                     \
+    THTensor_(resizeAs)(r_, t);                                         \
+    TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data, value);); \
+  }                                                                     \
+
+#if defined(TH_REAL_IS_LONG)
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs)
+#endif /* long only part */
+
+#if defined(TH_REAL_IS_INT)
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs)
+#endif /* int only part */
+
+#if defined(TH_REAL_IS_BYTE)
+
+/* Reductions over a byte tensor treated as booleans: logicalall folds
+ * with && starting from 1, logicalany with || starting from 0.
+ * Note: TH_TENSOR_APPLY visits every element (no short-circuit). */
+#define TENSOR_IMPLEMENT_LOGICAL_SUM(NAME, OP, INIT_VALUE) \
+  int THTensor_(NAME)(THTensor *tensor) \
+  { \
+    THArgCheck(tensor->nDimension > 0, 1, "empty Tensor"); \
+    int sum = INIT_VALUE;                               \
+    TH_TENSOR_APPLY(real, tensor, sum = sum OP *tensor_data;); \
+    return sum; \
+  }
+
+TENSOR_IMPLEMENT_LOGICAL_SUM(logicalall, &&, 1)
+TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0)
+
+#endif /* Byte only part */
+
+/* floating point only now */
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+
+LAB_IMPLEMENT_BASIC_FUNCTION(log,log)
+LAB_IMPLEMENT_BASIC_FUNCTION(log1p,log1p)
+LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_sigmoid)
+LAB_IMPLEMENT_BASIC_FUNCTION(exp,exp)
+LAB_IMPLEMENT_BASIC_FUNCTION(cos,cos)
+LAB_IMPLEMENT_BASIC_FUNCTION(acos,acos)
+LAB_IMPLEMENT_BASIC_FUNCTION(cosh,cosh)
+LAB_IMPLEMENT_BASIC_FUNCTION(sin,sin)
+LAB_IMPLEMENT_BASIC_FUNCTION(asin,asin)
+LAB_IMPLEMENT_BASIC_FUNCTION(sinh,sinh)
+LAB_IMPLEMENT_BASIC_FUNCTION(tan,tan)
+LAB_IMPLEMENT_BASIC_FUNCTION(atan,atan)
+LAB_IMPLEMENT_BASIC_FUNCTION(tanh,tanh)
+LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,pow)
+LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,sqrt)
+LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_rsqrt)
+LAB_IMPLEMENT_BASIC_FUNCTION(ceil,ceil)
+LAB_IMPLEMENT_BASIC_FUNCTION(floor,floor)
+LAB_IMPLEMENT_BASIC_FUNCTION(round,round)
+LAB_IMPLEMENT_BASIC_FUNCTION(abs,fabs)
+LAB_IMPLEMENT_BASIC_FUNCTION(trunc,trunc)
+LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_frac)
+/* neg and cinv pass an operator/expression prefix as "CFUNC": the macro
+ * expands to *r__data = -(*t_data) and *r__data = 1.0 / (*t_data) */
+LAB_IMPLEMENT_BASIC_FUNCTION(neg,-)
+LAB_IMPLEMENT_BASIC_FUNCTION(cinv, 1.0 / )
+
+/* atan2: element-wise r_[i] = atan2(tx[i], ty[i]); r_ is resized to
+ * tx's shape. */
+void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty)
+{
+  THTensor_(resizeAs)(r_, tx);
+  TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = atan2(*tx_data,*ty_data););
+}
+
+/* lerp: element-wise linear interpolation between a and b with the
+ * given weight, r_[i] = TH_lerp(a[i], b[i], weight); a and b must have
+ * the same number of elements. */
+void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight)
+{
+  THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match");
+  THTensor_(resizeAs)(r_, a);
+  TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_lerp(*a_data, *b_data, weight););
+}
+
+/* mean: average of t along the given dimension into r_; the reduced
+ * dimension is kept with size 1.  Sums in accreal to limit rounding
+ * error before the final division. */
+void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal sum = 0;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                         sum += t_data[i*t_stride];
+                       *r__data = (real)sum/t_size;);
+}
+
+/* std: standard deviation of t along dimension into r_ (reduced dim
+ * kept with size 1).  flag != 0 normalizes by n (biased estimator);
+ * flag == 0 uses the unbiased n-1 form.  The variance is clamped to 0
+ * before sqrt to absorb negative rounding error. */
+void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "invalid dimension %d",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal sum = 0;
+                       accreal sum2 = 0;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                       {
+                         real z = t_data[i*t_stride];
+                         sum += z;
+                         sum2 += z*z;
+                       }
+
+                       if(flag)
+                       {
+                         /* biased: E[x^2] - E[x]^2 */
+                         sum /= t_size;
+                         sum2 /= t_size;
+                         sum2 -= sum*sum;
+                         sum2 = (sum2 < 0 ? 0 : sum2);
+                         *r__data = (real)sqrt(sum2);
+                       }
+                       else
+                       {
+                         /* unbiased: divide squared sum by n-1 */
+                         sum /= t_size;
+                         sum2 /= t_size-1;
+                         sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum;
+                         sum2 = (sum2 < 0 ? 0 : sum2);
+                         *r__data = (real)sqrt(sum2);
+                       });
+}
+
+/* var: variance of t along dimension into r_ (reduced dim kept with
+ * size 1).  flag != 0 normalizes by n (biased); flag == 0 uses the
+ * unbiased n-1 estimator.  Negative rounding error is clamped to 0. */
+void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "invalid dimension %d",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                       accreal sum = 0;
+                       accreal sum2 = 0;
+                       long i;
+                       for(i = 0; i < t_size; i++)
+                       {
+                         real z = t_data[i*t_stride];
+                         sum += z;
+                         sum2 += z*z;
+                       }
+
+                       if(flag)
+                       {
+                         /* biased: E[x^2] - E[x]^2 */
+                         sum /= t_size;
+                         sum2 /= t_size;
+                         sum2 -= sum*sum;
+                         sum2 = (sum2 < 0 ? 0 : sum2);
+                         *r__data = sum2;
+                       }
+                       else
+                       {
+                         /* unbiased: divide squared sum by n-1 */
+                         sum /= t_size;
+                         sum2 /= t_size-1;
+                         sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum;
+                         sum2 = (sum2 < 0 ? 0 : sum2);
+                         *r__data = (real)sum2;
+                       });
+}
+
+/* norm: p-norm of t along dimension into r_ (reduced dim kept, size 1).
+ * value == 0 counts nonzero entries ("0-norm"); otherwise computes
+ * (sum |x|^value)^(1/value). */
+void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension)
+{
+  THLongStorage *dim;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "invalid dimension %d",
+      dimension+1);
+
+  dim = THTensor_(newSizeOf)(t);
+  THLongStorage_set(dim, dimension, 1);
+  THTensor_(resize)(r_, dim, NULL);
+  THLongStorage_free(dim);
+
+  if(value == 0) {
+    TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                         accreal sum = 0;
+                         long i;
+                         for(i = 0; i < t_size; i++)
+                           sum += t_data[i*t_stride] != 0.0;
+                         *r__data = sum;)
+  } else {
+    TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension,
+                         accreal sum = 0;
+                         long i;
+                         for(i = 0; i < t_size; i++)
+                           sum += pow(fabs(t_data[i*t_stride]), value);
+                         *r__data = pow(sum, 1.0/value);)
+  }
+}
+
+/* normall: p-norm over all elements of tensor.  Fast paths: value 0
+ * counts nonzero entries, value 1 sums |x|, value 2 takes
+ * sqrt(sum x^2); any other value uses the general (sum |x|^p)^(1/p). */
+accreal THTensor_(normall)(THTensor *tensor, real value)
+{
+  accreal sum = 0;
+  if(value == 0) {
+    TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;);
+    return sum;
+  } else if(value == 1) {
+    TH_TENSOR_APPLY(real, tensor, sum += fabs(*tensor_data););
+    return sum;
+  } else if(value == 2) {
+    TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;);
+    return sqrt(sum);
+  } else {
+    TH_TENSOR_APPLY(real, tensor, sum += pow(fabs(*tensor_data), value););
+    return pow(sum, 1.0/value);
+  }
+}
+
+/* renorm: for every slice of src along dimension whose value-norm
+ * exceeds maxnorm, write a rescaled copy into res so the slice's norm
+ * becomes (approximately) maxnorm; slices already within the limit are
+ * copied unchanged.  src must have at least 2 dimensions and value > 0. */
+void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm)
+{
+  int i;
+  THTensor *rowR, *rowS;
+
+  THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d",
+      dimension+1);
+  THArgCheck(value > 0, 2, "non-positive-norm not supported");
+  THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d dimensions",
+      THTensor_(nDimension)(src));
+
+  rowR = THTensor_(new)();
+  rowS = THTensor_(new)();
+
+  THTensor_(resizeAs)(res, src);
+
+  for (i=0; i<src->size[dimension]; i++)
+  {
+    real norm = 0;
+    real new_norm;
+
+    THTensor_(select)(rowS, src, dimension, i);
+    THTensor_(select)(rowR, res, dimension, i);
+    /* accumulate |x|^value over the slice (specialized for p=1, p=2) */
+    if (value == 1) {
+      TH_TENSOR_APPLY(real, rowS, norm += fabs(*rowS_data););
+    } else if (value == 2) {
+      TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;);
+    } else {
+      TH_TENSOR_APPLY(real, rowS, norm += pow(fabs(*rowS_data), value););
+    }
+
+    norm = pow(norm, 1/value);
+
+    if (norm > maxnorm)
+    {
+      /* 1e-7 keeps the scale factor strictly below maxnorm/norm */
+      new_norm = maxnorm / (norm + 1e-7);
+
+      TH_TENSOR_APPLY2(
+        real, rowR, real, rowS,
+        *rowR_data = (*rowS_data) * new_norm;
+      )
+    }
+    else
+      THTensor_(copy)(rowR, rowS);
+  }
+
+  THTensor_(free)(rowR);
+  THTensor_(free)(rowS);
+}
+
+/* dist: p-norm distance between tensor and src,
+ * (sum_i |tensor_i - src_i|^value)^(1/value).
+ * The accumulator is accreal (not real): for float tensors a real
+ * accumulator would lose precision and could overflow while summing,
+ * and the function's return type is accreal anyway. */
+accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value)
+{
+  accreal sum = 0;
+  TH_TENSOR_APPLY2(real, tensor, real, src,
+	sum += pow(fabs(*tensor_data - *src_data), value);)
+  return pow(sum, 1.0/value);
+}
+
+/* meanall: arithmetic mean over every element of tensor. */
+accreal THTensor_(meanall)(THTensor *tensor)
+{
+  accreal total;
+  THArgCheck(tensor->nDimension > 0, 1, "empty Tensor");
+  total = THTensor_(sumall)(tensor);
+  return total / THTensor_(nElement)(tensor);
+}
+
+/* varall: unbiased variance over all elements (sum of squared
+ * deviations from the mean, divided by n-1).
+ * NOTE(review): no guard against nElement == 1, which would divide by
+ * zero here — confirm callers never pass a single-element tensor. */
+accreal THTensor_(varall)(THTensor *tensor)
+{
+  accreal mean = THTensor_(meanall)(tensor);
+  accreal sum = 0;
+  TH_TENSOR_APPLY(real, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean););
+  sum /= (THTensor_(nElement)(tensor)-1);
+  return sum;
+}
+
+/* stdall: standard deviation over all elements (unbiased, via varall). */
+accreal THTensor_(stdall)(THTensor *tensor)
+{
+  accreal variance = THTensor_(varall)(tensor);
+  return sqrt(variance);
+}
+
+/* linspace: fill r_ with n points evenly spaced from a to b inclusive.
+ * n == 1 is only legal when a == b.  The running counter i is advanced
+ * inside the APPLY macro, so points are emitted in traversal order. */
+void THTensor_(linspace)(THTensor *r_, real a, real b, long n)
+{
+  real i = 0;
+
+  THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+
+  if (THTensor_(nElement)(r_) != n) {
+    THTensor_(resize1d)(r_, n);
+  }
+
+  if(n == 1) {
+     TH_TENSOR_APPLY(real, r_,
+             *r__data = a;
+             i++;
+           );
+  } else {
+     TH_TENSOR_APPLY(real, r_,
+             *r__data = a + i*(b-a)/((real)(n-1));
+             i++;
+           );
+  }
+}
+
+/* logspace: fill r_ with n points 10^x for x evenly spaced from a to b
+ * inclusive.  n == 1 is only legal when a == b; the counter i advances
+ * inside the APPLY macro, as in linspace. */
+void THTensor_(logspace)(THTensor *r_, real a, real b, long n)
+{
+  real i = 0;
+
+  THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points");
+
+  if (THTensor_(nElement)(r_) != n) {
+    THTensor_(resize1d)(r_, n);
+  }
+
+  if(n == 1) {
+    TH_TENSOR_APPLY(real, r_,
+        *r__data = pow(10.0, a);
+        i++;
+        );
+  } else {
+    TH_TENSOR_APPLY(real, r_,
+        *r__data = pow(10.0, a + i*(b-a)/((real)(n-1)));
+        i++;
+        );
+  }
+}
+
+/* rand: resize r_ to size and fill with uniform samples in [0, 1). */
+void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size)
+{
+  THTensor_(resize)(r_, size, NULL);
+  THTensor_(uniform)(r_, _generator, 0, 1);
+}
+
+/* randn: resize r_ to size and fill with standard normal samples
+ * (mean 0, stdv 1). */
+void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size)
+{
+  THTensor_(resize)(r_, size, NULL);
+  THTensor_(normal)(r_, _generator, 0, 1);
+}
+
+/* histc: histogram of tensor into hist using nbins bins spanning
+ * [minvalue, maxvalue].  If minvalue == maxvalue the range is taken
+ * from the data's min/max, and widened by 1 on each side if still
+ * degenerate.  Each element is mapped to a 1-based bin index by
+ * scaling into [1, nbins]; elements mapping outside that range are
+ * ignored.  bins is shrunk by 1e-6 so the maximum value lands in the
+ * last bin rather than one past it. */
+void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue)
+{
+  THTensor *clone;
+  real minval;
+  real maxval;
+  real bins;
+  real *h_data;
+
+  THTensor_(resize1d)(hist, nbins);
+  THTensor_(zero)(hist);
+  minval = minvalue;
+  maxval = maxvalue;
+  if (minval == maxval)
+  {
+    minval = THTensor_(minall)(tensor);
+    maxval = THTensor_(maxall)(tensor);
+  }
+  if (minval == maxval)
+  {
+    minval = minval - 1;
+    maxval = maxval + 1;
+  }
+  bins = (real)(nbins)-1e-6;
+
+
+  /* work on a flat copy: shift, scale and floor map each value to its
+     (1-based) bin index */
+  clone = THTensor_(newWithSize1d)(THTensor_(nElement)(tensor));
+  THTensor_(copy)(clone,tensor);
+  THTensor_(add)(clone, clone, -minval);
+  THTensor_(div)(clone, clone, (maxval-minval));
+  THTensor_(mul)(clone, clone, bins);
+  THTensor_(floor)(clone, clone);
+  THTensor_(add)(clone, clone, 1);
+
+  h_data = THTensor_(data)(hist);
+
+  TH_TENSOR_APPLY(real, clone,                                         \
+                  if ((*clone_data <= nbins) && (*clone_data >= 1)) {  \
+                    *(h_data + (int)(*clone_data) - 1) += 1;           \
+                  });
+
+  THTensor_(free)(clone);
+}
+
+#endif /* floating point only part */
+#endif
diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h
new file mode 100644
index 0000000..d33406f
--- /dev/null
+++ b/lib/TH/generic/THTensorMath.h
@@ -0,0 +1,186 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorMath.h"
+#else
+
+
+
+TH_API void THTensor_(fill)(THTensor *r_, real value);
+TH_API void THTensor_(zero)(THTensor *r_);
+
+TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value);
+TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src);
+TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask);
+
+TH_API void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor);
+
+TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index);
+TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
+TH_API void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
+TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val);
+
+TH_API void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index);
+TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src);
+TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val);
+
+TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src);
+
+TH_API real THTensor_(minall)(THTensor *t);
+TH_API real THTensor_(maxall)(THTensor *t);
+TH_API accreal THTensor_(sumall)(THTensor *t);
+TH_API accreal THTensor_(prodall)(THTensor *t);
+
+TH_API void THTensor_(neg)(THTensor *self, THTensor *src);
+TH_API void THTensor_(cinv)(THTensor *self, THTensor *src);
+
+TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(sub)(THTensor *self, THTensor *src, real value);
+TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value);
+
+TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src);
+TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2);
+TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src);
+TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src);
+
+TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
+TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2);
+
+TH_API void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat,  THTensor *vec);
+TH_API void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat1, THTensor *mat2);
+TH_API void THTensor_(addr)(THTensor *r_,  real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2);
+
+TH_API void THTensor_(addbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2);
+TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2);
+
+TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain);
+
+TH_API long THTensor_(numel)(THTensor *t);
+TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
+TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
+TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension);
+TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
+TH_API void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension);
+TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(sign)(THTensor *r_, THTensor *t);
+TH_API accreal THTensor_(trace)(THTensor *t);
+TH_API void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension);
+
+TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src);
+TH_API void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src);
+TH_API void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value);
+TH_API void THTensor_(cminValue)(THTensor *r, THTensor *t, real value);
+
+TH_API void THTensor_(zeros)(THTensor *r_, THLongStorage *size);
+TH_API void THTensor_(ones)(THTensor *r_, THLongStorage *size);
+TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k);
+TH_API void THTensor_(eye)(THTensor *r_, long n, long m);
+TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step);
+TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n);
+
+TH_API void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size);
+TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder);
+TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted);
+TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, long k);
+TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, long k);
+TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension);
+TH_API void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension);
+
+TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb);
+
+TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(geValue)(THByteTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(neValue)(THByteTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(eqValue)(THByteTensor *r_, THTensor* t, real value);
+
+TH_API void THTensor_(ltValueT)(THTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(leValueT)(THTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(gtValueT)(THTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(geValueT)(THTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(neValueT)(THTensor *r_, THTensor* t, real value);
+TH_API void THTensor_(eqValueT)(THTensor *r_, THTensor* t, real value);
+
+TH_API void THTensor_(ltTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(leTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(gtTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(geTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(neTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(eqTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb);
+
+TH_API void THTensor_(ltTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(leTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(gtTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(geTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(neTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb);
+
+#if defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG)
+TH_API void THTensor_(abs)(THTensor *r_, THTensor *t);
+#endif
+
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+
+TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(log)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(exp)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(cos)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(acos)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(cosh)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(sin)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(asin)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(sinh)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(tan)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(atan)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty);
+TH_API void THTensor_(tanh)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(pow)(THTensor *r_, THTensor *t, real value);
+TH_API void THTensor_(tpow)(THTensor *r_, real value, THTensor *t);
+TH_API void THTensor_(sqrt)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(rsqrt)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(ceil)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(floor)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(round)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(abs)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(frac)(THTensor *r_, THTensor *t);
+TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight);
+
+TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension);
+TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag);
+TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag);
+TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension);
+TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm);
+TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value);
+TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue);
+
+TH_API accreal THTensor_(meanall)(THTensor *self);
+TH_API accreal THTensor_(varall)(THTensor *self);
+TH_API accreal THTensor_(stdall)(THTensor *self);
+TH_API accreal THTensor_(normall)(THTensor *t, real value);
+
+TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n);
+TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n);
+TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
+TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size);
+
+#endif
+
+#if defined(TH_REAL_IS_BYTE)
+
+TH_API int THTensor_(logicalall)(THTensor *self);
+TH_API int THTensor_(logicalany)(THTensor *self);
+
+#endif /* TH_REAL_IS_BYTE */
+
+#endif
diff --git a/lib/TH/generic/THTensorRandom.c b/lib/TH/generic/THTensorRandom.c
new file mode 100644
index 0000000..f8097c8
--- /dev/null
+++ b/lib/TH/generic/THTensorRandom.c
@@ -0,0 +1,250 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorRandom.c"
+#else
+
+/* Overwrite every element of self with a uniformly distributed random value.
+   Integer types draw from [0, TYPE_MAX] (note: signed types never produce
+   negative values, since the draw is reduced modulo TYPE_MAX+1).
+   Float/double draw integers in [0, 2^MANT_DIG], the contiguous range of
+   exactly representable integers for that type. */
+void THTensor_(random)(THTensor *self, THGenerator *_generator)
+{
+#if defined(TH_REAL_IS_BYTE)
+  TH_TENSOR_APPLY(real, self, *self_data = (unsigned char)(THRandom_random(_generator) % (UCHAR_MAX+1)););
+#elif defined(TH_REAL_IS_CHAR)
+  TH_TENSOR_APPLY(real, self, *self_data = (char)(THRandom_random(_generator) % (CHAR_MAX+1)););
+#elif defined(TH_REAL_IS_SHORT)
+  TH_TENSOR_APPLY(real, self, *self_data = (short)(THRandom_random(_generator) % (SHRT_MAX+1)););
+#elif defined(TH_REAL_IS_INT)
+  /* INT_MAX+1 would overflow int, hence the 1UL to force unsigned long math */
+  TH_TENSOR_APPLY(real, self, *self_data = (int)(THRandom_random(_generator) % (INT_MAX+1UL)););
+#elif defined(TH_REAL_IS_LONG)
+  TH_TENSOR_APPLY(real, self, *self_data = (long)(THRandom_random(_generator) % (LONG_MAX+1UL)););
+#elif defined(TH_REAL_IS_FLOAT)
+  TH_TENSOR_APPLY(real, self, *self_data = (float)(THRandom_random(_generator) % ((1UL << FLT_MANT_DIG)+1)););
+#elif defined(TH_REAL_IS_DOUBLE)
+  TH_TENSOR_APPLY(real, self, *self_data = (double)(THRandom_random(_generator) % ((1ULL << DBL_MANT_DIG)+1)););
+#else
+#error "Unknown type"
+#endif
+}
+
+/* Fill self element-wise with samples from a geometric distribution with
+   success probability p (as produced by THRandom_geometric). */
+void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_geometric(_generator, p););
+}
+
+/* Fill self element-wise with Bernoulli(p) draws (0 or 1, cast to the
+   element type), all sharing the same probability p. */
+void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p););
+}
+
+/* Per-element Bernoulli: each element of self is drawn with the probability
+   taken from the corresponding element of the float tensor p. */
+void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p)
+{
+  TH_TENSOR_APPLY2(real, self, float, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data););
+}
+
+/* Per-element Bernoulli: same as bernoulli_FloatTensor but the probability
+   tensor p holds doubles. */
+void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p)
+{
+  TH_TENSOR_APPLY2(real, self, double, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data););
+}
+
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+
+/* Fill self with uniformly distributed values over the interval from a to b
+   (endpoint handling as defined by THRandom_uniform). */
+void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_uniform(_generator, a, b););
+}
+
+/* Fill self with normal (Gaussian) samples of the given mean and standard
+   deviation stdv. */
+void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stdv););
+}
+
+/* Fill self with exponentially distributed samples with rate lambda. */
+void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_exponential(_generator, lambda););
+}
+
+/* Fill self with Cauchy-distributed samples with location `median` and
+   scale `sigma`. */
+void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_cauchy(_generator, median, sigma););
+}
+
+/* Fill self with log-normal samples; mean and stdv are interpreted by
+   THRandom_logNormal (parameters of the distribution itself, not of the
+   underlying normal -- see THRandom docs to confirm). */
+void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv)
+{
+  TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_logNormal(_generator, mean, stdv););
+}
+
+/* Sample category indices from each row of prob_dist.
+
+   self             : resized to n_dist x n_sample; receives 0-based category
+                      indices (the Lua wrapper shifts them to 1-based).
+   _generator       : RNG used for the uniform draws.
+   prob_dist        : n_dist x n_categories weights (or 1D, treated as one
+                      row).  Rows need not be normalized; each row only needs
+                      a strictly positive sum.
+   n_sample         : draws per row; must be > 0, and must not exceed
+                      n_categories when sampling without replacement.
+   with_replacement : non-zero to allow repeated categories within a row.
+
+   NOTE(review): a 1D prob_dist is temporarily reshaped to 1 x n and restored
+   at the end, so the input tensor is mutated during the call. */
+void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement)
+{
+  int start_dim = THTensor_(nDimension)(prob_dist);
+  long n_dist;
+  long n_categories;
+  THDoubleTensor* cum_dist;
+  int i,j,k;
+
+  /* view a 1D distribution as a single row */
+  if (start_dim == 1)
+  {
+    THTensor_(resize2d)(prob_dist, 1, THTensor_(size)(prob_dist, 0));
+  }
+
+  n_dist = THTensor_(size)(prob_dist, 0);
+  n_categories = THTensor_(size)(prob_dist, 1);
+
+  /* fixed: message now matches the condition (it fires when n_sample <= 0) */
+  THArgCheck(n_sample > 0, 2, "cannot sample n_sample <= 0 samples");
+
+  if (!with_replacement)
+  {
+    THArgCheck((!with_replacement) && (n_sample <= n_categories), 2, \
+    "cannot sample n_sample > prob_dist:size(1) samples without replacement");
+  }
+
+  /* cumulative probability distribution vector */
+  cum_dist = THDoubleTensor_newWithSize1d(n_categories);
+
+  /* will contain multinomial samples (category indices to be returned) */
+  THLongTensor_resize2d(self, n_dist , n_sample);
+
+  for (i=0; i<n_dist; i++)
+  {
+    /* Get normalized cumulative distribution from prob distribution */
+    double sum = 0;
+    for (j=0; j<n_categories; j++)
+    {
+      sum += THStorage_(get)( \
+        prob_dist->storage, \
+        prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \
+      );
+      THDoubleStorage_set(
+        cum_dist->storage, \
+        cum_dist->storageOffset+j*cum_dist->stride[0], \
+        sum \
+      );
+    }
+    THArgCheckWithCleanup((sum > 0), THCleanup(THDoubleTensor_free(cum_dist);), 2,
+                          "invalid multinomial distribution (sum of probabilities <= 0)");
+    /* normalize cumulative probability distribution so that last val is 1
+    i.e. doesn't assume original prob_dist row sums to one */
+    /* NOTE(review): sum > 0 always holds after the check above, so this
+       branch is always taken; the condition is kept for safety only. */
+    if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) )
+    {
+      for (j=0; j<n_categories; j++)
+      {
+        THDoubleTensor_data(cum_dist)[j*cum_dist->stride[0]] /= sum;
+      }
+    }
+
+    for (j=0; j<n_sample; j++)
+    {
+      /* sample a probability mass from a uniform distribution */
+      double uniform_sample = THRandom_uniform(_generator, 0, 1);
+      /* Do a binary search for the slot in which the prob falls
+      ie cum_dist[row][slot-1] < uniform_prob < cum_distr[row][slot] */
+      int left_pointer = 0;
+      int right_pointer = n_categories;
+      int mid_pointer;
+      double cum_prob;
+      int sample_idx;
+      /* Make sure the last cumulative distribution bucket sums to 1 */
+      THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride[0]] = 1;
+
+      while(right_pointer - left_pointer > 0)
+      {
+          mid_pointer = left_pointer + (right_pointer - left_pointer) / 2;
+          cum_prob = THDoubleStorage_get( \
+            cum_dist->storage, \
+            cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \
+          );
+          if (cum_prob < uniform_sample)
+          {
+            left_pointer = mid_pointer + 1;
+          }
+          else
+          {
+            right_pointer = mid_pointer;
+          }
+      }
+      sample_idx = left_pointer;
+
+       /* store in result tensor (will be incremented for lua compat by wrapper) */
+      THLongStorage_set( \
+        self->storage, \
+        self->storageOffset+i*self->stride[0]+j*self->stride[1], \
+        sample_idx \
+      );
+
+      /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */
+      if (!with_replacement)
+      {
+        /* update cumulative distribution so that sample cannot be drawn again */
+        double diff;
+        double new_val = 0;
+        double sum;
+
+        if (sample_idx != 0)
+        {
+          new_val = THDoubleStorage_get( \
+            cum_dist->storage, \
+            cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \
+          );
+        }
+        /* marginal cumulative mass (i.e. original probability) of sample */
+        diff = THDoubleStorage_get( \
+          cum_dist->storage, \
+          cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \
+        ) - new_val;
+        /* new sum of marginals is not one anymore... */
+        sum = 1.0 - diff;
+        for (k=0; k<n_categories; k++)
+        {
+          new_val = THDoubleStorage_get( \
+            cum_dist->storage, \
+            cum_dist->storageOffset+k*cum_dist->stride[0] \
+          );
+          if (k >= sample_idx)
+          {
+            /* remove sampled probability mass from later cumulative probabilities */
+            new_val -= diff;
+          }
+          /* make total marginals sum to one */
+          new_val /= sum;
+          THDoubleStorage_set( \
+            cum_dist->storage, \
+            cum_dist->storageOffset+k*cum_dist->stride[0], \
+            new_val \
+          );
+        }
+      }
+    }
+  }
+
+  THDoubleTensor_free(cum_dist);
+
+  /* restore the caller's 1D shape and flatten the result to match */
+  if (start_dim == 1)
+  {
+    THLongTensor_resize1d(self, n_sample);
+    THTensor_(resize1d)(prob_dist, n_categories);
+  }
+}
+
+#endif
+
+#if defined(TH_REAL_IS_BYTE)
+/* Serialize the generator state into the byte tensor self (only compiled
+   for the byte type).  self is resized to sizeof(THGenerator) bytes and the
+   state struct is copied in verbatim. */
+void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self)
+{
+  static const size_t size = sizeof(THGenerator);
+  THGenerator *rng_state;
+  THTensor_(resize1d)(self, size);
+  /* defensive: resize1d should already guarantee both properties */
+  THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size");
+  THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous");
+  rng_state = (THGenerator *)THTensor_(data)(self);
+  THGenerator_copy(rng_state, _generator);
+}
+
+/* Restore generator state previously saved by getRNGState from the byte
+   tensor self.  The tensor must hold exactly sizeof(THGenerator) contiguous
+   bytes that pass THGenerator_isValid. */
+void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self)
+{
+  static const size_t size = sizeof(THGenerator);
+  THGenerator *rng_state;
+  THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size");
+  THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous");
+  /* reinterpret the raw bytes as a generator struct and validate before use */
+  rng_state = (THGenerator *)THTensor_(data)(self);
+  THArgCheck(THGenerator_isValid(rng_state), 1, "Invalid RNG state");
+  THGenerator_copy(_generator, rng_state);
+}
+#endif
+
+#endif
diff --git a/lib/TH/generic/THTensorRandom.h b/lib/TH/generic/THTensorRandom.h
new file mode 100644
index 0000000..d205142
--- /dev/null
+++ b/lib/TH/generic/THTensorRandom.h
@@ -0,0 +1,25 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THTensorRandom.h"
+#else
+
+/* In-place random fills: every element of self is overwritten with a draw
+   from the named distribution using _generator. */
+TH_API void THTensor_(random)(THTensor *self, THGenerator *_generator);
+TH_API void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p);
+TH_API void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p);
+TH_API void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p);
+TH_API void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p);
+
+/* Continuous distributions and multinomial sampling: floating types only. */
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+TH_API void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b);
+TH_API void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv);
+TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda);
+TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma);
+TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv);
+TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement);
+#endif
+
+/* RNG state (de)serialization through a raw byte tensor: byte type only. */
+#if defined(TH_REAL_IS_BYTE)
+TH_API void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self);
+TH_API void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self);
+#endif
+
+#endif
diff --git a/lib/TH/generic/THVector.c b/lib/TH/generic/THVector.c
new file mode 100644
index 0000000..6c8a96b
--- /dev/null
+++ b/lib/TH/generic/THVector.c
@@ -0,0 +1,84 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVector.c"
+#else
+
+static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
+  /* Set all n entries of x to the constant c.  A four-way unrolled loop
+     handles the bulk; a scalar loop finishes the remainder. */
+  real *p = x;
+  real *const stop = x + n;
+
+  while (stop - p >= 4)
+  {
+    p[0] = c;
+    p[1] = c;
+    p[2] = c;
+    p[3] = c;
+    p += 4;
+  }
+
+  while (p < stop)
+    *p++ = c;
+}
+
+static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const long n)
+{
+  /* Scaled accumulate: y[i] += c * x[i] for i in [0, n). */
+  long k = 0;
+  const long unrolled = (n >= 0) ? (n & ~3l) : 0;
+
+  while (k < unrolled)
+  {
+    y[k]   += c * x[k];
+    y[k+1] += c * x[k+1];
+    y[k+2] += c * x[k+2];
+    y[k+3] += c * x[k+3];
+    k += 4;
+  }
+
+  for (; k < n; k++)
+    y[k] += c * x[k];
+}
+
+static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, const long n)
+{
+  /* Element-wise difference: z[i] = x[i] - y[i] for i in [0, n). */
+  const real *px = x;
+  const real *py = y;
+  real *pz = z;
+  long left = n;
+
+  while (left >= 4)
+  {
+    pz[0] = px[0] - py[0];
+    pz[1] = px[1] - py[1];
+    pz[2] = px[2] - py[2];
+    pz[3] = px[3] - py[3];
+    px += 4; py += 4; pz += 4;
+    left -= 4;
+  }
+
+  while (left-- > 0)
+    *pz++ = *px++ - *py++;
+}
+
+static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
+{
+  /* In-place scaling: y[i] *= c for i in [0, n). */
+  real *p = y;
+  real *const stop = y + n;
+
+  while (stop - p >= 4)
+  {
+    p[0] *= c;
+    p[1] *= c;
+    p[2] *= c;
+    p[3] *= c;
+    p += 4;
+  }
+
+  while (p < stop)
+    *p++ *= c;
+}
+
+static TH_INLINE void THVector_(mul)(real *y, const real *x, const long n)
+{
+  /* Element-wise product in place: y[i] *= x[i] for i in [0, n). */
+  long k = 0;
+  const long unrolled = (n >= 0) ? (n & ~3l) : 0;
+
+  while (k < unrolled)
+  {
+    y[k]   *= x[k];
+    y[k+1] *= x[k+1];
+    y[k+2] *= x[k+2];
+    y[k+3] *= x[k+3];
+    k += 4;
+  }
+
+  for (; k < n; k++)
+    y[k] *= x[k];
+}
+
+#endif
diff --git a/lib/TH/generic/simd/common_simd.h b/lib/TH/generic/simd/common_simd.h
new file mode 100644
index 0000000..425b4b9
--- /dev/null
+++ b/lib/TH/generic/simd/common_simd.h
@@ -0,0 +1,395 @@
+#ifndef COMMON_SIMD_H
+#define COMMON_SIMD_H
+
+/* Weights */
+#define LOAD_WEIGHT(q, simd_type, inst_var) _m ## simd_type ## inst_var(*(q))
+
+#define DECLARE_WEIGHTS(simd_type) \
+__ ## simd_type weight0; \
+__ ## simd_type weight1; \
+__ ## simd_type weight2; \
+__ ## simd_type weight3; \
+__ ## simd_type weight4;
+
+#define LOAD_WEIGHTS(k, simd_type, inst_var) \
+weight0 = LOAD_WEIGHT(weight + 5 * 0 + k, simd_type, inst_var); \
+weight1 = LOAD_WEIGHT(weight + 5 * 1 + k, simd_type, inst_var); \
+weight2 = LOAD_WEIGHT(weight + 5 * 2 + k, simd_type, inst_var); \
+weight3 = LOAD_WEIGHT(weight + 5 * 3 + k, simd_type, inst_var); \
+weight4 = LOAD_WEIGHT(weight + 5 * 4 + k, simd_type, inst_var);
+
+/* Inputs declare */
+#define DECLARE_INPUT_0(i) \
+float* input0 = image + i; \
+
+#define DECLARE_INPUT_1() \
+float* input1 = input0 + inputStride; \
+float* input2 = input1 + inputStride; \
+float* input3 = input2 + inputStride; \
+float* input4 = input3 + inputStride;
+
+#define DECLARE_INPUT_2() \
+DECLARE_INPUT_1() \
+float* input5 = input4 + inputStride;
+
+#define DECLARE_INPUT_4() \
+DECLARE_INPUT_2() \
+float* input6 = input5 + inputStride; \
+float* input7 = input6 + inputStride;
+
+#define DECLARE_INPUT_5() \
+DECLARE_INPUT_4() \
+float* input8 = input7 + inputStride;
+
+#define DECLARE_INPUT_6() \
+DECLARE_INPUT_5() \
+float* input9 = input8 + inputStride;
+
+#define DECLARE_INPUT_7() \
+DECLARE_INPUT_6() \
+float* inputA = input9 + inputStride;
+
+#define DECLARE_INPUT_8() \
+DECLARE_INPUT_7() \
+float* inputB = inputA + inputStride;
+
+
+/* Inputs increment */
+#define INC_INPUT_1()\
+input0++; \
+input1++; \
+input2++; \
+input3++; \
+input4++; \
+
+#define INC_INPUT_2()\
+INC_INPUT_1() \
+input5++;
+
+#define INC_INPUT_4()\
+INC_INPUT_2() \
+input6++; \
+input7++;
+
+#define INC_INPUT_5()\
+INC_INPUT_4() \
+input8++;
+
+#define INC_INPUT_6()\
+INC_INPUT_5() \
+input9++;
+
+#define INC_INPUT_7()\
+INC_INPUT_6() \
+inputA++;
+
+#define INC_INPUT_8()\
+INC_INPUT_7() \
+inputB++;
+
+/* Outputs declare */
+#define DECLARE_OUTPUT_1() \
+float* output0 = output;
+
+#define DECLARE_OUTPUT_2() \
+DECLARE_OUTPUT_1() \
+float* output1 = output0 + outputStride;
+
+#define DECLARE_OUTPUT_4() \
+DECLARE_OUTPUT_2() \
+float* output2 = output1 + outputStride; \
+float* output3 = output2 + outputStride;
+
+#define DECLARE_OUTPUT_5() \
+DECLARE_OUTPUT_4() \
+float* output4 = output3 + outputStride;
+
+#define DECLARE_OUTPUT_6() \
+DECLARE_OUTPUT_5() \
+float* output5 = output4 + outputStride;
+
+#define DECLARE_OUTPUT_7() \
+DECLARE_OUTPUT_6() \
+float* output6 = output5 + outputStride;
+
+#define DECLARE_OUTPUT_8() \
+DECLARE_OUTPUT_7() \
+float* output7 = output6 + outputStride;
+
+/* Outputs increment */
+#define INC_OUTPUT_1(x) \
+output0 += x;
+
+#define INC_OUTPUT_2(x) \
+INC_OUTPUT_1(x) \
+output1 += x;
+
+#define INC_OUTPUT_4(x) \
+INC_OUTPUT_2(x) \
+output2 += x; \
+output3 += x;
+
+#define INC_OUTPUT_5(x) \
+INC_OUTPUT_4(x) \
+output4 += x;
+
+#define INC_OUTPUT_6(x) \
+INC_OUTPUT_5(x) \
+output5 += x;
+
+#define INC_OUTPUT_7(x) \
+INC_OUTPUT_6(x) \
+output6 += x;
+
+#define INC_OUTPUT_8(x) \
+INC_OUTPUT_7(x) \
+output7 += x;
+
+/* Image declare */
+#define DECLARE_IMAGE_1(simd_type) \
+__ ## simd_type image0; \
+__ ## simd_type image1; \
+__ ## simd_type image2; \
+__ ## simd_type image3; \
+__ ## simd_type image4;
+
+#define DECLARE_IMAGE_2(simd_type) \
+DECLARE_IMAGE_1(simd_type) \
+__ ## simd_type image5;
+
+#define DECLARE_IMAGE_4(simd_type) \
+DECLARE_IMAGE_2(simd_type) \
+__ ## simd_type image6; \
+__ ## simd_type image7;
+
+#define DECLARE_IMAGE_5(simd_type) \
+DECLARE_IMAGE_4(simd_type) \
+__ ## simd_type image8;
+
+#define DECLARE_IMAGE_6(simd_type) \
+DECLARE_IMAGE_5(simd_type) \
+__ ## simd_type image9;
+
+#define DECLARE_IMAGE_7(simd_type) \
+DECLARE_IMAGE_6(simd_type) \
+__ ## simd_type imageA;
+
+#define DECLARE_IMAGE_8(simd_type) \
+DECLARE_IMAGE_7(simd_type) \
+__ ## simd_type imageB;
+
+/* Sums declare */
+#define DECLARE_SUM_1(simd_type) \
+__ ## simd_type sum0;
+
+#define DECLARE_SUM_2(simd_type) \
+DECLARE_SUM_1(simd_type) \
+__ ## simd_type sum1;
+
+#define DECLARE_SUM_4(simd_type) \
+DECLARE_SUM_2(simd_type) \
+__ ## simd_type sum2; \
+__ ## simd_type sum3;
+
+#define DECLARE_SUM_5(simd_type) \
+DECLARE_SUM_4(simd_type) \
+__ ## simd_type sum4;
+
+#define DECLARE_SUM_6(simd_type) \
+DECLARE_SUM_5(simd_type) \
+__ ## simd_type sum5;
+
+#define DECLARE_SUM_7(simd_type) \
+DECLARE_SUM_6(simd_type) \
+__ ## simd_type sum6;
+
+#define DECLARE_SUM_8(simd_type) \
+DECLARE_SUM_7(simd_type) \
+__ ## simd_type sum7;
+
+/* Sums load */
+#define LOAD_SUM_1(simd_type) \
+sum0 = _m ## simd_type ## _loadu_ps(output0);
+
+#define LOAD_SUM_2(simd_type) \
+LOAD_SUM_1(simd_type) \
+sum1 = _m ## simd_type ## _loadu_ps(output1);
+
+#define LOAD_SUM_4(simd_type) \
+LOAD_SUM_2(simd_type) \
+sum2 = _m ## simd_type ## _loadu_ps(output2); \
+sum3 = _m ## simd_type ## _loadu_ps(output3);
+
+#define LOAD_SUM_5(simd_type) \
+LOAD_SUM_4(simd_type) \
+sum4 = _m ## simd_type ## _loadu_ps(output4);
+
+#define LOAD_SUM_6(simd_type) \
+LOAD_SUM_5(simd_type) \
+sum5 = _m ## simd_type ## _loadu_ps(output5);
+
+#define LOAD_SUM_7(simd_type) \
+LOAD_SUM_6(simd_type) \
+sum6 = _m ## simd_type ## _loadu_ps(output6);
+
+#define LOAD_SUM_8(simd_type) \
+LOAD_SUM_7(simd_type) \
+sum7 = _m ## simd_type ## _loadu_ps(output7);
+
+/* Sums store */
+#define STORE_SUM_1(simd_type) \
+_m ## simd_type ## _storeu_ps(output0, sum0);
+
+#define STORE_SUM_2(simd_type) \
+STORE_SUM_1(simd_type) \
+_m ## simd_type ## _storeu_ps(output1, sum1);
+
+#define STORE_SUM_4(simd_type) \
+STORE_SUM_2(simd_type) \
+_m ## simd_type ## _storeu_ps(output2, sum2); \
+_m ## simd_type ## _storeu_ps(output3, sum3);
+
+#define STORE_SUM_5(simd_type) \
+STORE_SUM_4(simd_type) \
+_m ## simd_type ## _storeu_ps(output4, sum4);
+
+#define STORE_SUM_6(simd_type) \
+STORE_SUM_5(simd_type) \
+_m ## simd_type ## _storeu_ps(output5, sum5);
+
+#define STORE_SUM_7(simd_type) \
+STORE_SUM_6(simd_type) \
+_m ## simd_type ## _storeu_ps(output6, sum6);
+
+#define STORE_SUM_8(simd_type) \
+STORE_SUM_7(simd_type) \
+_m ## simd_type ## _storeu_ps(output7, sum7);
+
+/* Convolution */
+#define CONVOLVE_1ROWS(simd_type) \
+image0 = _m ## simd_type ## _loadu_ps(input0); \
+image1 = _m ## simd_type ## _loadu_ps(input1); \
+image2 = _m ## simd_type ## _loadu_ps(input2); \
+image3 = _m ## simd_type ## _loadu_ps(input3); \
+image4 = _m ## simd_type ## _loadu_ps(input4); \
+\
+sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight0, image0)); \
+sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight1, image1)); \
+sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight2, image2)); \
+sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight3, image3)); \
+sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight4, image4));
+
+#define CONVOLVE_2ROWS(simd_type) \
+CONVOLVE_1ROWS(simd_type) \
+image5 = _m ## simd_type ## _loadu_ps(input5); \
+sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight0, image1)); \
+sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight1, image2)); \
+sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight2, image3)); \
+sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight3, image4)); \
+sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight4, image5));
+
+#define CONVOLVE_4ROWS(simd_type) \
+CONVOLVE_2ROWS(simd_type) \
+image6 = _m ## simd_type ## _loadu_ps(input6); \
+sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight0, image2)); \
+sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight1, image3)); \
+sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight2, image4)); \
+sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight3, image5)); \
+sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight4, image6)); \
+\
+image7 = _m ## simd_type ## _loadu_ps(input7); \
+sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight0, image3)); \
+sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight1, image4)); \
+sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight2, image5)); \
+sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight3, image6)); \
+sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight4, image7));
+
+#define CONVOLVE_5ROWS(simd_type) \
+CONVOLVE_4ROWS(simd_type) \
+image8 = _m ## simd_type ## _loadu_ps(input8); \
+sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight0, image4)); \
+sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight1, image5)); \
+sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight2, image6)); \
+sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight3, image7)); \
+sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight4, image8));
+
+#define CONVOLVE_6ROWS(simd_type) \
+CONVOLVE_5ROWS(simd_type) \
+image9 = _m ## simd_type ## _loadu_ps(input9); \
+sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight0, image5)); \
+sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight1, image6)); \
+sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight2, image7)); \
+sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight3, image8)); \
+sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight4, image9));
+
+#define CONVOLVE_7ROWS(simd_type) \
+CONVOLVE_6ROWS(simd_type) \
+imageA = _m ## simd_type ## _loadu_ps(inputA); \
+sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight0, image6)); \
+sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight1, image7)); \
+sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight2, image8)); \
+sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight3, image9)); \
+sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight4, imageA));
+
+#define CONVOLVE_8ROWS(simd_type) \
+CONVOLVE_7ROWS(simd_type) \
+imageB = _m ## simd_type ## _loadu_ps(inputB); \
+sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight0, image7)); \
+sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight1, image8)); \
+sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight2, image9)); \
+sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight3, imageA)); \
+sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight4, imageB));
+
+/* Convolution MEGA macro */
+/* Indirection helpers: map a row count onto the matching DECLARE_/LOAD_/
+   CONVOLVE_/INC_/STORE_ macro family member defined above. */
+#define DECLARE_SUMX(rows) DECLARE_SUM_ ## rows
+#define LOAD_SUMX(rows) LOAD_SUM_ ## rows
+#define DECLARE_INPUTX(rows) DECLARE_INPUT_ ## rows
+#define DECLARE_IMAGEX(rows) DECLARE_IMAGE_ ## rows
+#define CONVOLVEX(rows) CONVOLVE_ ## rows ## ROWS
+#define INC_INPUTX(rows) INC_INPUT_ ## rows
+#define STORE_SUMX(rows) STORE_SUM_ ## rows
+#define INC_OUTPUTX(rows) INC_OUTPUT_ ## rows
+
+/* Emit one vectorized 5x5 convolution step over `rows` output rows starting
+   at input column offset i.  Parameters ("prefex" is a historical typo for
+   "prefix"):
+     simd_type        - vector type suffix: m256 -> __m256, m128 -> __m128
+     simd_inst_prefex - intrinsic prefix: m256 -> _mm256_*, m -> _mm_*
+     simd_set         - broadcast intrinsic suffix (_set1_ps / _set_ps1)
+   Relies on `weight` (5x5, row stride 5), `image`, `inputStride`,
+   `outputStride` and the output pointers from DECLARE_OUTPUT_* being in
+   scope at the expansion site. */
+#define CONVOLUTION_LOOP(rows, simd_type, simd_inst_prefex, simd_set, i) \
+DECLARE_SUMX(rows)(simd_type) \
+LOAD_SUMX(rows)(simd_inst_prefex) \
+DECLARE_WEIGHTS(simd_type) \
+DECLARE_INPUT_0(i) \
+DECLARE_INPUTX(rows)() \
+DECLARE_IMAGEX(rows)(simd_type) \
+\
+LOAD_WEIGHTS(0, simd_inst_prefex, simd_set) \
+CONVOLVEX(rows)(simd_inst_prefex) \
+INC_INPUTX(rows)() \
+\
+LOAD_WEIGHTS(1, simd_inst_prefex, simd_set) \
+CONVOLVEX(rows)(simd_inst_prefex) \
+INC_INPUTX(rows)() \
+\
+LOAD_WEIGHTS(2, simd_inst_prefex, simd_set) \
+CONVOLVEX(rows)(simd_inst_prefex) \
+INC_INPUTX(rows)() \
+\
+LOAD_WEIGHTS(3, simd_inst_prefex, simd_set) \
+CONVOLVEX(rows)(simd_inst_prefex) \
+INC_INPUTX(rows)() \
+\
+LOAD_WEIGHTS(4, simd_inst_prefex, simd_set) \
+CONVOLVEX(rows)(simd_inst_prefex) \
+\
+STORE_SUMX(rows)(simd_inst_prefex) \
+\
+INC_OUTPUTX(rows)(sizeof(__ ## simd_type) / sizeof(float))
+
+
+/* 8 output columns per step: AVX, 8 floats per __m256. */
+#define CONVOLVE_8COLS_XROWS(rows, i) \
+{ \
+CONVOLUTION_LOOP(rows, m256, m256, _set1_ps, i) \
+}
+
+/* 4 output columns per step: SSE, 4 floats per __m128. */
+#define CONVOLVE_4COLS_XROWS(rows, i) \
+{ \
+CONVOLUTION_LOOP(rows, m128, m, _set_ps1, i) \
+}
+
+#endif
diff --git a/lib/TH/generic/simd/convolve.c b/lib/TH/generic/simd/convolve.c
new file mode 100644
index 0000000..842af17
--- /dev/null
+++ b/lib/TH/generic/simd/convolve.c
@@ -0,0 +1,127 @@
+#if defined(USE_AVX)
+
+#ifdef _MSC_VER
+#include <intrin.h>
+
+/* Emulate GCC's __get_cpuid on MSVC using the __cpuid intrinsic.
+   Unlike the cpuid.h version it performs no max-level check and always
+   reports success. */
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+  unsigned int cpui[4];
+  __cpuid(cpui, __level);
+  *__eax = cpui[0]; *__ebx = cpui[1]; *__ecx = cpui[2]; *__edx = cpui[3];
+  return 1;
+}
+
+/* Read extended control register `op` (MSVC path).  Only XCR0 (op == 0)
+   is supported; any other op yields zeros.  edx is always reported as 0
+   here since _xgetbv's low 32 bits suffice for the AVX check below. */
+static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
+  *eax = 0; *edx = 0;
+  if (op == 0)
+      *eax = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+}
+
+#else
+
+#if __i386__
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+__asm("  pushl  %%ebx\n" \
+"  cpuid\n" \
+"  mov    %%ebx,%1\n" \
+"  popl   %%ebx" \
+: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+: "0"(__level))
+#else
+#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+: "0"(__level))
+#endif
+
+/* Minimal __get_cpuid built on the inline-asm __cpuid macro above.
+   Performs no max-level validation and always reports success. */
+static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx) {
+  __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
+
+/* XGETBV encoded as raw opcode bytes (0f 01 d0) so the compiler needs no
+   -mxsave support; returns the 64-bit XCR register `op` split across
+   eax (low) and edx (high). */
+static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
+  __asm__ __volatile__
+  (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
+}
+
+#endif
+
+/* Bit flags for the SIMD extensions reported by checkCPUFeatures(). */
+enum ECPUFeature
+{
+  kCPUFeature_SSE = 0x01,
+  kCPUFeature_SSE2 = 0x02,
+  kCPUFeature_SSE3 = 0x04,
+  kCPUFeature_SSE3_S = 0x08,   /* supplemental SSE3 (SSSE3) */
+  kCPUFeature_SSE4_1 = 0x10,
+  kCPUFeature_SSE4_2 = 0x20,
+  kCPUFeature_AVX = 0x40
+};
+
+/* Query CPUID leaf 1 and translate the relevant feature bits into
+   ECPUFeature flags.  AVX additionally requires OS support: XSAVE and
+   OSXSAVE must be set, and XCR0 must show both XMM and YMM state enabled
+   (bits 1 and 2, hence the `& 6` test). */
+static unsigned int checkCPUFeatures() {
+  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+  unsigned int features = 0;
+  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+  if( (edx & (1 << 25)) != 0 ) {   /* EDX bit 25: SSE */
+    features |= kCPUFeature_SSE;
+  }
+  if( (edx & (1 << 26)) != 0 ) {   /* EDX bit 26: SSE2 */
+    features |= kCPUFeature_SSE2;
+  }
+  if( (ecx & (1 << 0)) != 0 ) {    /* ECX bit 0: SSE3 */
+    features |= kCPUFeature_SSE3;
+  }
+  if( (ecx & (1 << 9)) != 0 ) {    /* ECX bit 9: SSSE3 */
+    features |= kCPUFeature_SSE3_S;
+  }
+  if( (ecx & (1 << 19)) != 0 ) {   /* ECX bit 19: SSE4.1 */
+    features |= kCPUFeature_SSE4_1;
+  }
+  if( (ecx & (1 << 20)) != 0 ) {   /* ECX bit 20: SSE4.2 */
+    features |= kCPUFeature_SSE4_2;
+  }
+  /* ECX bits 28 (AVX), 27 (OSXSAVE), 26 (XSAVE) */
+  if( (ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0 ) {
+    xgetbv(0, &eax, &edx);
+    if( (eax & 6) == 6 ) {         /* XCR0 bits 1+2: XMM and YMM state saved by OS */
+      features |= kCPUFeature_AVX;
+    }
+  }
+  return features;
+}
+
+#include <stdio.h>
+
+/* Return non-zero when `feature` is available on this CPU.  Detection runs
+   once and is cached in function statics.
+   NOTE(review): the lazy initialization is not thread-safe -- it appears to
+   assume the first call happens on a single thread; confirm with callers.
+   Also prints the chosen code path to stdout on first use. */
+static int haveCPUFeature(unsigned int feature) {
+  static unsigned int sCPUFeatures = 0;
+  static int sDetectedCPUFeatures = 0;
+  if (!sDetectedCPUFeatures) {
+    sDetectedCPUFeatures = 1;
+    sCPUFeatures = checkCPUFeatures();
+    if ((sCPUFeatures & kCPUFeature_AVX) != 0) {
+      printf("torch running avx\n");
+    } else {
+      printf("torch running sse \n");
+    }
+  }
+  return (sCPUFeatures & feature) != 0;
+}
+
+#endif
+
+void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
+void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
+
+/* 5x5 convolution entry point: dispatch to the AVX kernel when it was both
+   compiled in (USE_AVX) and detected at runtime, otherwise to the SSE
+   kernel.  The output is assumed dense: outCols doubles as the output row
+   stride passed to the kernels. */
+void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) {
+#if defined(USE_AVX)
+  int avx = haveCPUFeature(kCPUFeature_AVX);
+  if (avx)
+  {
+    convolve_5x5_avx(output, input, kernel, outRows, outCols, outCols, inCols);
+  }
+  else
+#endif
+  {
+    convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols);
+  }
+}
\ No newline at end of file
diff --git a/lib/TH/generic/simd/convolve.h b/lib/TH/generic/simd/convolve.h
new file mode 100644
index 0000000..7b9b04c
--- /dev/null
+++ b/lib/TH/generic/simd/convolve.h
@@ -0,0 +1 @@
+void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols);
\ No newline at end of file
diff --git a/lib/TH/generic/simd/convolve5x5_avx.c b/lib/TH/generic/simd/convolve5x5_avx.c
new file mode 100644
index 0000000..52b6d0f
--- /dev/null
+++ b/lib/TH/generic/simd/convolve5x5_avx.c
@@ -0,0 +1,212 @@
+#include <immintrin.h>
+#include "common_simd.h"
+
+#define CLEAR_AVX() _mm256_zeroupper()
+
+void convolve_5x5_1_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_1()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(1, i)
+  }
+}
+
+void convolve_5x5_2_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_2()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(2, i)
+  }
+}
+
+void convolve_5x5_4_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_4()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(4, i)
+  }
+}
+
+void convolve_5x5_5_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_5()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(5, i)
+  }
+}
+
+void convolve_5x5_6_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_6()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(6, i)
+  }
+}
+
+void convolve_5x5_7_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_7()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(7, i)
+  }
+}
+
+void convolve_5x5_8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount = count & 0xFFFFFFF8;
+  DECLARE_OUTPUT_8()
+  for (; i < alignedCount; i+=8) {
+    CONVOLVE_8COLS_XROWS(8, i)
+  }
+}
+
+void convolve_5x5_64x64_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 60; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+    CONVOLVE_8COLS_XROWS(6, 0)
+    CONVOLVE_8COLS_XROWS(6, 8)
+    CONVOLVE_8COLS_XROWS(6, 16)
+    CONVOLVE_8COLS_XROWS(6, 24)
+    CONVOLVE_8COLS_XROWS(6, 32)
+    CONVOLVE_8COLS_XROWS(6, 40)
+    CONVOLVE_8COLS_XROWS(6, 48)
+    CONVOLVE_8COLS_XROWS(6, 56)
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_4()
+  CONVOLVE_8COLS_XROWS(4, 0)
+  CONVOLVE_8COLS_XROWS(4, 8)
+  CONVOLVE_8COLS_XROWS(4, 16)
+  CONVOLVE_8COLS_XROWS(4, 24)
+  CONVOLVE_8COLS_XROWS(4, 32)
+  CONVOLVE_8COLS_XROWS(4, 40)
+  CONVOLVE_8COLS_XROWS(4, 48)
+  CONVOLVE_8COLS_XROWS(4, 56)
+}
+
+void convolve_5x5_32x32_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 30; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+    CONVOLVE_8COLS_XROWS(6, 0)
+    CONVOLVE_8COLS_XROWS(6, 8)
+    CONVOLVE_8COLS_XROWS(6, 16)
+    CONVOLVE_8COLS_XROWS(6, 24)
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_2()
+  CONVOLVE_8COLS_XROWS(2, 0)
+  CONVOLVE_8COLS_XROWS(2, 8)
+  CONVOLVE_8COLS_XROWS(2, 16)
+  CONVOLVE_8COLS_XROWS(2, 24)
+}
+
+void convolve_5x5_16x16_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 12; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+    CONVOLVE_8COLS_XROWS(6, 0)
+    CONVOLVE_8COLS_XROWS(6, 8)
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_4()
+  CONVOLVE_8COLS_XROWS(4, 0)
+  CONVOLVE_8COLS_XROWS(4, 8)
+}
+
/* 8x8 fast path: one fully unrolled 8-row by 8-column tile.
 * `count` is unused; the strides feed the macros from common_simd.h. */
void convolve_5x5_8x8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  DECLARE_OUTPUT_8()
  CONVOLVE_8COLS_XROWS(8, 0)
}
+
+void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
+
/* AVX 5x5 convolution over an outRows x outCols tile, accumulating into
 * output. Rows are processed in batches of 6, then 2/4/2/1 to cover the
 * remainder; columns beyond the last multiple of 8 are finished by the
 * SSE kernel. Common square sizes take fully unrolled fast paths.
 * Fix: the 0xFFFFFFFE/0xFFFFFFFC/0xFFFFFFF8 masks have unsigned-int
 * type and would truncate 64-bit long row/column counts to 32 bits;
 * use ~1L/~3L/~7L instead. */
void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) {
  long ic = inCols;
  long yy = 0;
  float* t_ = input;
  float* r_ = output;
  float* k_ = kernel;

  if((outRows == 64) && (outCols == 64)) {
    convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 32) && (outCols == 32)) {
    convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 16) && (outCols == 16)) {
    convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 8) && (outCols == 8)) {
    convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  /* six rows per iteration while possible */
  for(; yy < (outRows / 6 ) * 6; yy += 6) {
    float *pi_ = t_ + yy*ic;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_6_avx(r_, pis_, pw_, outCols, outStride, ic);
    r_ += (outStride * 6);
  }

  /* more than 2 rows left to process and we ended up on a non-multiple of 4 */
  if((yy < (outRows & ~1L)) && ((yy % 4) != 0)) {
    /* process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) */
    float *pi_ = t_ + yy*ic;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic);
    r_ += (outStride * 2);
    yy += 2;
  }

  for(; yy < (outRows & ~3L); yy += 4) {
    float *pi_ = t_ + yy*ic;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_4_avx(r_, pis_, pw_, outCols, outStride, ic);
    r_ += (outStride * 4);
  }

  for(; yy < (outRows & ~1L); yy += 2) {
    float *pi_ = t_ + yy*ic;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic);
    r_ += (outStride * 2);
  }

  for(; yy < outRows; yy += 1) {
    float *pi_ = t_ + yy*ic;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_1_avx(r_, pis_, pw_, outCols, outStride, ic);
    r_ += (outStride * 1);
  }

  long procCols = outCols & ~7L; /* avx version processes 8 cols at a time */
  long remCols = outCols - procCols;

  /* process the rest using sse */
  if( remCols > 0) {
    CLEAR_AVX();
    convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols);
  }
}
\ No newline at end of file
diff --git a/lib/TH/generic/simd/convolve5x5_sse.c b/lib/TH/generic/simd/convolve5x5_sse.c
new file mode 100644
index 0000000..04dc41b
--- /dev/null
+++ b/lib/TH/generic/simd/convolve5x5_sse.c
@@ -0,0 +1,320 @@
+#include <smmintrin.h>
+#include "common_simd.h"
+
+
+/* SSE variants */
+void convolve_5x5_1_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount4 = count & 0xFFFFFFFC;
+  DECLARE_OUTPUT_1()
+  for (; i < alignedCount4; i+=4) {
+    CONVOLVE_4COLS_XROWS(1, i)
+  }
+  for (; i < (count); i++) {
+    float output0 = output[i + outputStride * 0];
+    int row;
+    for (row = 0; row < 5; row++) {
+      int col;
+      for (col = 0; col < 5; col++) {
+        output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
+      }
+    }
+    output[i + outputStride * 0] = output0;
+  }
+}
+
+void convolve_5x5_2_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount4 = count & 0xFFFFFFFC;
+  DECLARE_OUTPUT_2()
+  for (; i < alignedCount4; i+=4) {
+    CONVOLVE_4COLS_XROWS(2, i)
+  }
+  for (; i < (count); i++) {
+    float output0 = output[i + outputStride * 0];
+    float output1 = output[i + outputStride * 1];
+    int row;
+    for (row = 0; row < 5; row++) {
+      int col;
+      for (col = 0; col < 5; col++) {
+        output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
+        output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
+      }
+    }
+    output[i + outputStride * 0] = output0;
+    output[i + outputStride * 1] = output1;
+  }
+}
+
+void convolve_5x5_4_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount4 = count & 0xFFFFFFFC;
+  DECLARE_OUTPUT_4()
+  for (; i < alignedCount4; i+=4) {
+    CONVOLVE_4COLS_XROWS(4, i)
+  }
+  for (; i < (count); i++) {
+    float output0 = output[i + outputStride * 0];
+    float output1 = output[i + outputStride * 1];
+    float output2 = output[i + outputStride * 2];
+    float output3 = output[i + outputStride * 3];
+    int row;
+    for (row = 0; row < 5; row++) {
+      int col;
+      for (col = 0; col < 5; col++) {
+        output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
+        output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
+        output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
+        output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
+      }
+    }
+    output[i + outputStride * 0] = output0;
+    output[i + outputStride * 1] = output1;
+    output[i + outputStride * 2] = output2;
+    output[i + outputStride * 3] = output3;
+  }
+}
+
+void convolve_5x5_6_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount4 = count & 0xFFFFFFFC;
+  DECLARE_OUTPUT_6()
+  for (; i < alignedCount4; i+=4) {
+    CONVOLVE_4COLS_XROWS(6, i)
+  }
+  for (; i<(count); i++) {
+    float output0 = output[i + outputStride * 0];
+    float output1 = output[i + outputStride * 1];
+    float output2 = output[i + outputStride * 2];
+    float output3 = output[i + outputStride * 3];
+    float output4 = output[i + outputStride * 4];
+    float output5 = output[i + outputStride * 5];
+    int row;
+    for (row = 0; row < 5; row++) {
+      int col;
+      for (col = 0; col < 5; col++) {
+        output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
+        output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
+        output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
+        output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
+        output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col];
+        output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col];
+      }
+    }
+    output[i + outputStride * 0] = output0;
+    output[i + outputStride * 1] = output1;
+    output[i + outputStride * 2] = output2;
+    output[i + outputStride * 3] = output3;
+    output[i + outputStride * 4] = output4;
+    output[i + outputStride * 5] = output5;
+  }
+}
+
+void convolve_5x5_8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  long i = 0;
+  long alignedCount4 = count & 0xFFFFFFFC;
+  DECLARE_OUTPUT_8()
+  for (; i < alignedCount4; i+=4) {
+    CONVOLVE_4COLS_XROWS(8, i)
+  }
+  for (; i<(count); i++) {
+    float output0 = output[i + outputStride * 0];
+    float output1 = output[i + outputStride * 1];
+    float output2 = output[i + outputStride * 2];
+    float output3 = output[i + outputStride * 3];
+    float output4 = output[i + outputStride * 4];
+    float output5 = output[i + outputStride * 5];
+    float output6 = output[i + outputStride * 6];
+    float output7 = output[i + outputStride * 7];
+    int row;
+    for (row = 0; row < 5; row++) {
+      int col;
+      for (col = 0; col < 5; col++) {
+        output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col];
+        output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col];
+        output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col];
+        output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col];
+        output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col];
+        output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col];
+        output6 += weight[5 * row + col] * image[i + (row + 6) * inputStride + col];
+        output7 += weight[5 * row + col] * image[i + (row + 7) * inputStride + col];
+      }
+    }
+    output[i + outputStride * 0] = output0;
+    output[i + outputStride * 1] = output1;
+    output[i + outputStride * 2] = output2;
+    output[i + outputStride * 3] = output3;
+    output[i + outputStride * 4] = output4;
+    output[i + outputStride * 5] = output5;
+    output[i + outputStride * 6] = output6;
+    output[i + outputStride * 7] = output7;
+  }
+}
+
+#define UNROLL_SSE_CONVOLUTION 0
+#if (UNROLL_SSE_CONVOLUTION)
+
+void convolve_5x5_64x64_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 60; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+    CONVOLVE_4COLS_XROWS(6, 0)
+    CONVOLVE_4COLS_XROWS(6, 4)
+    CONVOLVE_4COLS_XROWS(6, 8)
+    CONVOLVE_4COLS_XROWS(6, 12)
+    CONVOLVE_4COLS_XROWS(6, 16)
+    CONVOLVE_4COLS_XROWS(6, 20)
+    CONVOLVE_4COLS_XROWS(6, 24)
+    CONVOLVE_4COLS_XROWS(6, 28)
+    CONVOLVE_4COLS_XROWS(6, 32)
+    CONVOLVE_4COLS_XROWS(6, 36)
+    CONVOLVE_4COLS_XROWS(6, 40)
+    CONVOLVE_4COLS_XROWS(6, 44)
+    CONVOLVE_4COLS_XROWS(6, 48)
+    CONVOLVE_4COLS_XROWS(6, 52)
+    CONVOLVE_4COLS_XROWS(6, 56)
+    CONVOLVE_4COLS_XROWS(6, 60)
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_4()
+  CONVOLVE_4COLS_XROWS(4, 0)
+  CONVOLVE_4COLS_XROWS(4, 4)
+  CONVOLVE_4COLS_XROWS(4, 8)
+  CONVOLVE_4COLS_XROWS(4, 12)
+  CONVOLVE_4COLS_XROWS(4, 16)
+  CONVOLVE_4COLS_XROWS(4, 20)
+  CONVOLVE_4COLS_XROWS(4, 24)
+  CONVOLVE_4COLS_XROWS(4, 28)
+  CONVOLVE_4COLS_XROWS(4, 32)
+  CONVOLVE_4COLS_XROWS(4, 36)
+  CONVOLVE_4COLS_XROWS(4, 40)
+  CONVOLVE_4COLS_XROWS(4, 44)
+  CONVOLVE_4COLS_XROWS(4, 48)
+  CONVOLVE_4COLS_XROWS(4, 52)
+  CONVOLVE_4COLS_XROWS(4, 56)
+  CONVOLVE_4COLS_XROWS(4, 60)
+}
+
+void convolve_5x5_32x32_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 30; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+
+      CONVOLVE_4COLS_XROWS(6, 0)
+      CONVOLVE_4COLS_XROWS(6, 4)
+      CONVOLVE_4COLS_XROWS(6, 8)
+      CONVOLVE_4COLS_XROWS(6, 12)
+      CONVOLVE_4COLS_XROWS(6, 16)
+      CONVOLVE_4COLS_XROWS(6, 20)
+      CONVOLVE_4COLS_XROWS(6, 24)
+      CONVOLVE_4COLS_XROWS(6, 28)
+
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_2()
+  CONVOLVE_4COLS_XROWS(2, 0)
+  CONVOLVE_4COLS_XROWS(2, 4)
+  CONVOLVE_4COLS_XROWS(2, 8)
+  CONVOLVE_4COLS_XROWS(2, 12)
+  CONVOLVE_4COLS_XROWS(2, 16)
+  CONVOLVE_4COLS_XROWS(2, 20)
+  CONVOLVE_4COLS_XROWS(2, 24)
+  CONVOLVE_4COLS_XROWS(2, 28)
+}
+
+void convolve_5x5_16x16_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
+  for(int i = 0; i < 12; i+=6)
+  {
+    DECLARE_OUTPUT_6()
+    CONVOLVE_4COLS_XROWS(6, 0)
+    CONVOLVE_4COLS_XROWS(6, 4)
+    CONVOLVE_4COLS_XROWS(6, 8)
+    CONVOLVE_4COLS_XROWS(6, 12)
+    output += outputStride * 6;
+    image += inputStride * 6;
+  }
+  DECLARE_OUTPUT_4()
+  CONVOLVE_4COLS_XROWS(4, 0)
+  CONVOLVE_4COLS_XROWS(4, 4)
+  CONVOLVE_4COLS_XROWS(4, 8)
+  CONVOLVE_4COLS_XROWS(4, 12)
+}
+
/* 8x8 fast path (UNROLL_SSE_CONVOLUTION only): one fully unrolled
 * 8-row tile, two 4-column macro passes. `count` is unused. */
void convolve_5x5_8x8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) {
  DECLARE_OUTPUT_8()
  CONVOLVE_4COLS_XROWS(8, 0)
  CONVOLVE_4COLS_XROWS(8, 4)
}
+
+#endif
+
/* SSE 5x5 convolution over an outRows x outCols tile, accumulating into
 * output. Rows are processed in batches of 6, then 2/4/2/1 to cover the
 * remainder. Fast paths for common square sizes exist only when
 * UNROLL_SSE_CONVOLUTION is enabled.
 * Fix: the 0xFFFFFFFE/0xFFFFFFFC masks have unsigned-int type and would
 * truncate a 64-bit long outRows to 32 bits; use ~1L/~3L instead. */
void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) {
  long yy = 0;
  float* t_ = input;
  float* r_ = output;
  float* k_ = kernel;
#if (UNROLL_SSE_CONVOLUTION)
  if((outRows == 64) && (outCols == 64)) {
    convolve_5x5_64x64_sse(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 32) && (outCols == 32)) {
    convolve_5x5_32x32_sse(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 16) && (outCols == 16)) {
    convolve_5x5_16x16_sse(output, input, kernel, outRows, outStride, inCols);
    return;
  }

  if((outRows == 8) && (outCols == 8)) {
    convolve_5x5_8x8_sse(output, input, kernel, outRows, outStride, inCols);
    return;
  }
#endif
  /* six rows per iteration while possible */
  for(; yy < (outRows / 6 ) * 6; yy += 6) {
    float *pi_ = t_ + yy*inCols;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_6_sse(r_, pis_, pw_, outCols, outStride, inCols);
    r_ += (outStride * 6);
  }
  /* more than 2 rows left to process and we ended up on a non-multiple of 4 */
  if((yy < (outRows & ~1L)) && ((yy % 4) != 0)) {
    /* process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) */
    float *pi_ = t_ + yy*inCols;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols);
    r_ += (outStride * 2);
    yy += 2;
  }

  for(; yy < (outRows & ~3L); yy += 4) {
    float *pi_ = t_ + yy*inCols;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_4_sse(r_, pis_, pw_, outCols, outStride, inCols);
    r_ += (outStride * 4);
  }

  for(; yy < (outRows & ~1L); yy += 2) {
    float *pi_ = t_ + yy*inCols;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols);
    r_ += (outStride * 2);
  }

  for(; yy < outRows; yy += 1) {
    float *pi_ = t_ + yy*inCols;
    float *pw_ = k_;
    float *pis_ = pi_;
    convolve_5x5_1_sse(r_, pis_, pw_, outCols, outStride, inCols);
    r_ += (outStride * 1);
  }
}
diff --git a/lib/luaT/CMakeLists.txt b/lib/luaT/CMakeLists.txt
new file mode 100644
index 0000000..b221a17
--- /dev/null
+++ b/lib/luaT/CMakeLists.txt
@@ -0,0 +1,41 @@
# Build script for the luaT helper library (Lua <-> Torch C API glue).
# avoid some cmake warnings
IF(POLICY CMP0026)
 CMAKE_POLICY(SET CMP0026 OLD)
ENDIF()

INCLUDE_DIRECTORIES(${LUA_INCDIR})
IF(LUALIB)
  LINK_DIRECTORIES(${LUA_LIBDIR}) # note: must be done before defining target
ENDIF()

# Shared library always; optional static variant when BUILD_STATIC is set.
ADD_LIBRARY(luaT SHARED luaT.h luaT.c)
if(BUILD_STATIC)
  ADD_LIBRARY(luaT_static STATIC luaT.h luaT.c)
endif()

# On macOS, defer resolution of Lua symbols to the host interpreter at load time.
IF(APPLE)
  SET_TARGET_PROPERTIES(luaT PROPERTIES
    LINK_FLAGS "-undefined dynamic_lookup")
ENDIF()

IF(LUALIB)
  TARGET_LINK_LIBRARIES(luaT ${LUALIB}) # must be done after ;)
ENDIF()

# Install the library, its public header, and the exported cmake config.
INSTALL(TARGETS luaT
  EXPORT torch-exports
  RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}"
  LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}"
  ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}")

INSTALL(FILES luaT.h
          DESTINATION "${Torch_INSTALL_INCLUDE_SUBDIR}")

# Create luaT.cmake
GET_TARGET_PROPERTY(LUAT_OUTPUT_NAME luaT LOCATION)
GET_FILENAME_COMPONENT(LUAT_OUTPUT_NAME ${LUAT_OUTPUT_NAME} NAME)
SET(LUAT_LIBRARIES "${Torch_INSTALL_LIB}/${LUAT_OUTPUT_NAME}")
SET(LUAT_INCLUDE_DIR "${Torch_INSTALL_INCLUDE}")
CONFIGURE_FILE(luaTConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake")
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/luaTConfig.cmake" 
  DESTINATION "${Torch_INSTALL_CMAKE_SUBDIR}")
diff --git a/lib/luaT/README.md b/lib/luaT/README.md
new file mode 100644
index 0000000..431e37f
--- /dev/null
+++ b/lib/luaT/README.md
@@ -0,0 +1,266 @@
+<a name="luat.dok"/>
+# Lua Torch C API #
+
+luaT provides an API to interface Lua and C in Torch packages. It defines a
+concept of _classes_ to Lua for Torch, and provides a mechanism to easily
+handle these Lua classes from C.
+
+It additionally provides few functions that `luaL` should have defined, and
+defines several functions similar to `luaL` ones for better type error printing when using
+`luaT` classes.
+
+<a name="luat.memory.dok"/>
+## Memory functions ##
+
+Classical memory allocation functions which generate a Lua error in case of
+problem.
+
+<a name="luaT_alloc"/>
+### void* luaT_alloc(lua_State *L, long size) ###
+
+Allocates `size` bytes, and return a pointer on the allocated
+memory. A Lua error will be generated if running out of memory.
+
+<a name="luaT_realloc"/>
+### void* luaT_realloc(lua_State *L, void *ptr, long size) ###
+
+Realloc `ptr` to `size` bytes. `ptr` must have been previously
+allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
+functions. A Lua error will be generated if running out of memory.
+
+<a name="luaT_free"/>
+### void luaT_free(lua_State *L, void *ptr) ###
+
+Free memory allocated at address `ptr`. The memory must have been
+previously allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
+functions.
+
+<a name="luat.classcreate"/>
+## Class creation and basic handling ##
+
+A `luaT` class is basically either a Lua _table_ or _userdata_ with
+an appropriate _metatable_. This appropriate metatable is created with
+[luaT_newmetatable](#luaT_newmetatable). Contrary to luaL userdata
+functions, luaT mechanism handles inheritance. If the class inherit from
+another class, then the metatable will itself have a metatable
+corresponding to the _parent metatable_: the metatables are cascaded
+according to the class inheritance. Multiple inheritance is not supported.
+
+<a name="luat.operatoroverloading"/>
+### Operator overloading ###
+
+The metatable of a `luaT` object contains `Lua` operators like
+`__index`, `__newindex`, `__tostring`, `__add`
+(etc...). These operators will respectively look for `__index__`,
+`__newindex__`, `__tostring__`, `__add__` (etc...) in the
+metatable. If found, the corresponding function or value will be returned,
+else a Lua error will be raised.
+
+If one wants to provide `__index__` or `__newindex__` in the
+metaclass, these operators must follow a particular scheme:
+
+  * `__index__` must either return a value _and_ `true` or return `false` only. In the first case, it means `__index__` was able to handle the given argument (for e.g., the type was correct). The second case means it was not able to do anything, so `__index` in the root metatable can then try to see if the metaclass contains the required value.
+
+  * `__newindex__` must either return `true` or `false`. As for `__index__`, `true` means it could handle the argument and `false` not. If not, the root metatable `__newindex` will then raise an error if the object was a userdata, or apply a rawset if the object was a Lua table.
+
+Other metaclass operators like `__tostring__`, `__add__`, etc... do not have any particular constraint.
+
+<a name="luat_newlocalmetatable"/>
+### const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx) ###
+
+This function creates a new metatable, which is the Lua way to define a new
+object class. As for `luaL_newmetatable`, the metatable is registered in
+the Lua registry table, with the key `tname`. In addition, `tname` is
+also registered in the Lua registry, with the metatable as key (the
+typename of a given object can be thus easily retrieved).
+
+The class name `tname` must be of the form `modulename.classname`. If not
+NULL, `parenttname` must be a valid typename corresponding to the parent
+class of the new class.
+
+If `constructor` is not NULL, a function `new` will be added to the
+metatable, pointing to this given function.
+
+A "constructor table" will be created by `luaT_newlocalmetatable`: it will
+contain all the class methods, and be callable, calling the `constructor`, if
+a `constructor` has been passed. The constructor table is either stored into
+`modulename.classname` (that is in the global namespace) if `moduleidx <=
+0` or in the table at index `moduleidx` in the stack (if `moduleidx > 0`).
+
+If not NULL, `destructor` will be called when garbage collecting the object.
+
+If not NULL, `factory` must be a Lua C function creating an empty object
+instance of the class. These functions are used in Torch for serialization.
+
+Note that classes can be partly defined in C and partly defined in Lua:
+once the metatable is created in C, it can be filled up with additional
+methods in Lua.
+
+The return value is the value returned by [luaT_typenameid](#luat_typenameid).
+
+<a name="luat_newmetatable"/>
+### const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) ###
+
+Same as [luaT_newlocalmetatable](#luat_newlocalmetatable), but where the
+constructor table is assigned in the global namespace (`moduleidx = 0`).
+
+<a name="luat_pushmetatable"/>
+### int luaT_pushmetatable(lua_State *L, const char *tname) ###
+
+Push the metatable with type name `tname` on the stack, if `tname` is a
+valid Torch class name (previously registered with luaT_newmetatable).
+
+On success, returns 1. If `tname` is invalid, nothing is pushed and it
+returns 0.
+
+<a name="luat_typenameid"/>
+### const char* luaT_typenameid(lua_State *L, const char *tname) ###
+
+If `tname` is a valid Torch class name, then returns a unique string (the
+contents will be the same as `tname`) pointing to the string registered
+in the Lua registry. This string is thus valid as long as Lua is
+running. The returned string shall not be freed.
+
+If `tname` is an invalid class name, returns NULL.
+
+<a name="luat_typename"/>
+### const char* luaT_typename(lua_State *L, int ud) ###
+
+Returns the typename of the object at index `ud` on the stack. If it is
+not a valid Torch object, returns NULL.
+
+<a name="luat_pushudata"/>
+### void luaT_pushudata(lua_State *L, void *udata, const char *tname) ###
+
+Given a C structure `udata`, push a userdata object on the stack with
+metatable corresponding to `tname`. Obviously, `tname` must be a valid
+Torch name registered with [luaT_newmetatable](#luat_newmetatable).
+
+<a name="luat_toudata"/>
+### void *luaT_toudata(lua_State *L, int ud, const char *tname) ###
+
+Returns a pointer to the original C structure previously pushed on the
+stack with [luaT_pushudata](#luat_pushudata), if the object at index
+`ud` is a valid Torch class name. Returns NULL otherwise.
+
+<a name="luat_isudata"/>
+### int luaT_isudata(lua_State *L, int ud, const char *tname) ###
+
+Returns 1 if the object at index `ud` on the stack is a valid Torch class name `tname`.
+Returns 0 otherwise.
+
+<a name="luat_getfield"/>
+### Checking fields of a table ###
+
+These functions check that the table at the given index `ud` on the Lua
+stack has a field named `field`, and that it is of the specified type.
+They raise a Lua error on failure.
+
+<a name="luat_getfieldcheckudata"/>
+## void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) ##
+
+Checks that the field named `field` of the table at index `ud` is a
+Torch class name `tname`.  Returns the pointer of the C structure
+previously pushed on the stack with [luaT_pushudata](#luat_pushudata) on
+success. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecklightudata"/>
+## void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) ##
+
+Checks that the field named `field` of the table at index `ud` is a
+lightuserdata.  Returns the lightuserdata pointer on success. The function
+raises a Lua error on failure.
+
+<a name="luat_getfieldcheckint"/>
+## int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) ##
+
+Checks that the field named `field` of the table at index `ud` is an
+int. Returns the int value pointer on success. The function raises a Lua
+error on failure.
+
+<a name="luat_getfieldcheckstring"/>
+## const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) ##
+
+Checks that the field named `field` of the table at index `ud` is a
+string. Returns a pointer to the string on success. The function raises a
+Lua error on failure.
+
+<a name="luat_getfieldcheckboolean"/>
+## int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) ##
+
+Checks that the field named `field` of the table at index `ud` is a
+boolean. On success, returns 1 if the boolean is `true`, 0 if it is
+`false`. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecktable"/>
+## void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) ##
+
+Checks that the field named `field` of the table at index `ud` is a
+table. On success, push the table on the stack. The function raises a Lua
+error on failure.
+
+<a name="luat_typerror"/>
+### int luaT_typerror(lua_State *L, int ud, const char *tname) ###
+
+Raises a `luaL_argerror` (and returns its value), claiming that the
+object at index `ud` on the stack is not of type `tname`. Note that
+this function does not check the type, it only raises an error.
+
+<a name="luat_checkboolean"/>
+### int luaT_checkboolean(lua_State *L, int ud) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. The function raises a Lua
+error on failure.
+
+<a name="luat_optboolean"/>
+### int luaT_optboolean(lua_State *L, int ud, int def) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. If there is no value at
+index `ud`, returns `def`. In any other cases, raises an error.
+
+<a name="luat_registeratname"/>
+### void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) ###
+
+This function assumes a table is on the stack. It creates a table field
+`name` in the table (if this field does not exist yet), and fills up
+`methods` in this table field.
+
+<a name="luat_classrootname"/>
+### const char *luaT_classrootname(const char *tname) ###
+
+Assuming `tname` is of the form `A.b.c`, returns 'c'. The returned value
+shall not be freed. It is a pointer inside `tname` string.
+
+<a name="luat_classmodulename"/>
+### int luaT_classmodulename(const char *tname, char *parent_name) ###
+Alias to `luaT_fullparentname` for ensuring backwards compatibility;
+use of `luaT_fullparentname` is preferred.
+
+<a name="luat_fullparentname"/>
+### int luaT_fullparentname(const char *tname, char *parent_name) ###
+
+Returns a 0-1 valued integer indicating whether `tname` has a parent module.
+Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A.b`.
+
+<a name="luat_outerparentname"/>
+### int luaT_outerparentname(const char *tname, char *parent_name) ###
+
+Returns a 0-1 valued integer indicating whether `tname` has a parent module.
+Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A`.
+
+<a name="luat_innerparentname"/>
+### int luaT_innerparentname(const char *tname, char *parent_name) ###
+
+Returns a 0-1 valued integer indicating whether `tname` has a parent module.
+Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `b`.
+
+<a name="luat_stackdump"/>
+### void luaT_stackdump(lua_State *L) ###
+
+This function prints out the state of the Lua stack. It is useful for
+debugging purposes.
+
diff --git a/lib/luaT/luaT.c b/lib/luaT/luaT.c
new file mode 100644
index 0000000..657cca2
--- /dev/null
+++ b/lib/luaT/luaT.c
@@ -0,0 +1,1338 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "luaT.h"
+
+/* Allocate `size` bytes on the C heap.  A zero-size request returns
+ * NULL; a negative size or an allocation failure raises a Lua error
+ * (which does not return).  The caller owns the memory and must
+ * release it with luaT_free. */
+void* luaT_alloc(lua_State *L, long size)
+{
+  void *ptr;
+
+  if(size == 0)
+    return NULL;
+
+  if(size < 0)
+    luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+  ptr = malloc(size);
+  if(!ptr) /* cast: lua_pushfstring's %d reads an int, not a long */
+    luaL_error(L, "$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", (int)(size/1073741824));
+
+  return ptr;
+}
+
+/* Resize `ptr` to `size` bytes.  A NULL ptr degenerates to
+ * luaT_alloc; size 0 frees the block and returns NULL; a negative
+ * size or a failed realloc raises a Lua error (no return). */
+void* luaT_realloc(lua_State *L, void *ptr, long size)
+{
+  if(!ptr)
+    return(luaT_alloc(L, size));
+
+  if(size == 0)
+  {
+    luaT_free(L, ptr);
+    return NULL;
+  }
+
+  if(size < 0)
+    luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+  ptr = realloc(ptr, size);
+  if(!ptr) /* cast: lua_pushfstring's %d reads an int, not a long */
+    luaL_error(L, "$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", (int)(size/1073741824));
+  return ptr;
+}
+
+/* Release memory obtained from luaT_alloc/luaT_realloc.  The
+ * lua_State argument is unused; kept for API symmetry. */
+void luaT_free(lua_State *L, void *ptr)
+{
+  free(ptr);
+}
+
+/* Register the functions in `l` into the table sitting below the
+ * `nup` upvalues on the stack, sharing those upvalues among all the
+ * closures.  This is luaL_setfuncs, hand-rolled for Lua 5.1 where it
+ * does not exist. */
+void luaT_setfuncs(lua_State *L, const luaL_Reg *l, int nup)
+{
+#if LUA_VERSION_NUM == 501
+  luaL_checkstack(L, nup+1, "too many upvalues");
+  for (; l->name != NULL; l++) {  /* fill the table with given functions */
+    int i;
+    lua_pushstring(L, l->name);
+    for (i = 0; i < nup; i++)  /* copy upvalues to the top */
+      lua_pushvalue(L, -(nup+1));
+    lua_pushcclosure(L, l->func, nup);  /* closure with those upvalues */
+    lua_settable(L, -(nup + 3));  /* table[name] = closure */
+  }
+  lua_pop(L, nup);  /* remove upvalues */
+#else
+  luaL_setfuncs(L, l, nup);
+#endif
+}
+
+/* Print the entire Lua stack to stdout, one value per line, for
+ * debugging purposes. */
+void luaT_stackdump(lua_State *L)
+{
+  int i;
+  const char *tname = NULL;
+  int top = lua_gettop(L);
+  for(i = 1; i <= top; i++)
+  {
+    int t = lua_type(L, i);
+    printf("%3d. ", i);
+    switch(t)
+    {
+      case LUA_TSTRING:
+        printf("'%s'", lua_tostring(L,i));
+        break;
+      case LUA_TBOOLEAN:
+        printf(lua_toboolean(L, i) ? "true" : "false");
+        break;
+      case LUA_TNUMBER:
+        printf("%g", lua_tonumber(L,i));
+        break;
+      case LUA_TUSERDATA:
+        tname = luaT_typename(L, i);
+        printf("userdata %p [%s]", lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+        break;
+      case 10: /* LuaJIT's cdata type tag (no public LUA_TCDATA constant) */
+        tname = luaT_typename(L, i);
+        printf("cdata %p [%s]", lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+        break;
+      case LUA_TTABLE:
+        /* registry[metatable] holds the type name for Torch metatables */
+        lua_pushvalue(L, i);
+        lua_rawget(L, LUA_REGISTRYINDEX);
+        if(lua_isstring(L, -1))
+          tname = lua_tostring(L, -1); /*luaT_typenameid(L, lua_tostring(L, -1)); */
+        else
+          tname = NULL;
+        lua_pop(L, 1);
+        if(tname)
+          printf("metatable [%s]", tname);
+        else
+        {
+          tname = luaT_typename(L, i);
+          printf("table %p [%s]", lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+        }
+        break;
+      default:
+        printf("Lua object type: %s", lua_typename(L,t));
+        break;
+    }
+    printf("\n");
+  }
+  printf("---------------------------------------------\n");
+}
+
+/* metatable operator methods */
+static int luaT_mt__index(lua_State *L);
+static int luaT_mt__newindex(lua_State *L);
+static int luaT_mt__tostring(lua_State *L);
+static int luaT_mt__add(lua_State *L);
+static int luaT_mt__sub(lua_State *L);
+static int luaT_mt__mul(lua_State *L);
+static int luaT_mt__div(lua_State *L);
+static int luaT_mt__mod(lua_State *L);
+static int luaT_mt__pow(lua_State *L);
+static int luaT_mt__unm(lua_State *L);
+static int luaT_mt__concat(lua_State *L);
+static int luaT_mt__len(lua_State *L);
+static int luaT_mt__eq(lua_State *L);
+static int luaT_mt__lt(lua_State *L);
+static int luaT_mt__le(lua_State *L);
+static int luaT_mt__call(lua_State *L);
+
+/* Constructor-metatable methods */
+static int luaT_cmt__call(lua_State *L);
+static int luaT_cmt__newindex(lua_State *L);
+
+/* Register a metatable into the global namespace.  Simply forwards to
+ * the module-aware variant with no module index. */
+const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parent_tname,
+                              lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory)
+{
+  const int no_module = 0;
+  return luaT_newlocalmetatable(L, tname, parent_tname, constructor,
+                                destructor, factory, no_module);
+}
+
+/* Create (or complete) the metatable named `tname`, optionally with a
+ * parent metatable, constructor/destructor/factory functions and a
+ * module table located at stack index `moduleidx` (0 means global).
+ * Returns the registry-interned type name. */
+const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parent_tname,
+                                   lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx)
+{
+  lua_pushcfunction(L, luaT_lua_newmetatable);
+  lua_pushstring(L, tname);
+  if(parent_tname)
+    lua_pushstring(L, parent_tname);
+  else
+    lua_pushnil(L);
+  if(constructor)
+    lua_pushcfunction(L, constructor);
+  else
+    lua_pushnil(L);
+  if(destructor)
+    lua_pushcfunction(L, destructor);
+  else
+    lua_pushnil(L);
+  if(factory)
+    lua_pushcfunction(L, factory);
+  else
+    lua_pushnil(L);
+  if(moduleidx > 0)
+    lua_pushvalue(L, moduleidx);
+  else
+    lua_pushnil(L);
+  lua_call(L, 6, 1); /* leaves the metatable on the stack */
+  return luaT_typenameid(L, tname);
+}
+
+/* Push the metatable registered under `tname` and return 1; if no
+ * such type exists, leave the stack unchanged and return 0. */
+int luaT_pushmetatable(lua_State *L, const char *tname)
+{
+  lua_getfield(L, LUA_REGISTRYINDEX, tname);
+  if(!lua_isnil(L, -1))
+    return 1;
+  lua_pop(L, 1);
+  return 0;
+}
+
+/* Return the registry-interned name string for type `tname`, or NULL
+ * when the type is unknown.  The string is owned by the registry and
+ * must not be freed. */
+const char *luaT_typenameid(lua_State *L, const char *tname)
+{
+  const char *interned = NULL;
+
+  if(!luaT_pushmetatable(L, tname))
+    return NULL;
+
+  lua_rawget(L, LUA_REGISTRYINDEX); /* registry[metatable] -> name */
+  if(lua_isstring(L, -1))
+    interned = lua_tostring(L, -1);
+  lua_pop(L, 1); /* the string/nil */
+  return interned;
+}
+
+/* Lua chunk compiled on demand by luaT_cdataname: returns a function
+ * that maps an FFI ctype id to a Torch type name (setter when called
+ * with a second argument, getter otherwise), or a no-op function when
+ * the ffi library is unavailable. */
+static const char cdataname[] = ""
+  "local ok, ffi = pcall(require, 'ffi')\n"
+  "if ok then\n"
+  "  local id2name = {}\n"
+  "  return function(cdata, name)\n"
+  "    local id\n"
+  "    if jit then\n"
+  "      id = tonumber(ffi.typeof(cdata))\n"
+  "    else\n"
+  "      id = tostring(ffi.typeof(cdata))\n"
+  "    end\n"
+  "    if id then\n"
+  "      if name then\n"
+  "        id2name[id] = name\n"
+  "        return name\n"
+  "      else\n"
+  "        return rawget(id2name, id)\n"
+  "      end\n"
+  "    end\n"
+  "    return nil\n"
+  "  end\n"
+  "else\n"
+  "  return function() end\n"
+  "end\n";
+
+/* Get (tname == NULL) or set the Torch type name associated with the
+ * FFI ctype of the cdata at index `ud`.  The mapping closure built
+ * from the `cdataname` chunk above is cached in
+ * registry["__cdataname"].  Returns NULL when no name is recorded or
+ * ffi is unavailable. */
+static const char* luaT_cdataname(lua_State *L, int ud, const char *tname)
+{
+  lua_pushstring(L, "__cdataname");
+  lua_rawget(L, LUA_REGISTRYINDEX);
+  if(lua_isnil(L,-1))
+  {
+    /* first use: compile the chunk and cache the resulting closure */
+    lua_pop(L, 1);
+
+    if(luaL_dostring(L, cdataname)) /* did something go wrong? */
+      luaL_error(L, "internal error (could not load cdataname): %s", lua_tostring(L, -1));
+
+    lua_pushstring(L, "__cdataname");
+    lua_pushvalue(L, -2);
+    lua_rawset(L, LUA_REGISTRYINDEX);
+  }
+  if(!lua_isfunction(L, -1)) /* should not happen */
+    luaL_error(L, "internal error (cdataname is not a function)");
+
+  lua_pushvalue(L, ud);
+  if(tname)
+    lua_pushstring(L, tname);
+  if(lua_pcall(L, (tname ? 2 : 1), 1, 0))
+    luaL_error(L, "internal error (cdataname): %s", lua_tostring(L, -1));
+
+  tname = lua_tostring(L, -1);
+  lua_pop(L, 1);
+
+  return tname;
+}
+
+/* Registry key (unique by address) caching the luaffi cdata metatable. */
+static void* CDATA_MT_KEY = &CDATA_MT_KEY;
+/* Lua chunk: yields luaffi's shared cdata metatable under plain Lua,
+ * or a fresh empty table (matching nothing) under LuaJIT / no ffi. */
+static const char cdatamt[] = ""
+  "local ok, ffi = pcall(require, 'ffi')\n"
+  "if ok and not jit then\n"
+  "  return ffi.debug().cdata_mt\n"
+  "else\n"
+  "  return {}\n"
+  "end\n";
+
+/* Nonzero iff the value at `ud` is FFI cdata: under LuaJIT cdata has
+ * type tag 10; under luaffi it is a userdata whose metatable is the
+ * shared cdata metatable cached under CDATA_MT_KEY. */
+static int luaT_iscdata(lua_State *L, int ud)
+{
+  int type = lua_type(L, ud);
+  if(type == 10) /* LuaJIT cdata type tag */
+    return 1;
+  if(type != LUA_TUSERDATA)
+    return 0;
+  if(!lua_getmetatable(L, ud))
+    return 0;
+
+  lua_pushlightuserdata(L, CDATA_MT_KEY);
+  lua_rawget(L, LUA_REGISTRYINDEX);
+  if (lua_isnil(L, -1))
+  {
+    // initialize cdata metatable
+    lua_pop(L, 1);
+    if(luaL_dostring(L, cdatamt))
+      luaL_error(L, "internal error (could not load cdata mt): %s", lua_tostring(L, -1));
+
+    lua_pushlightuserdata(L, CDATA_MT_KEY);
+    lua_pushvalue(L, -2);
+    lua_rawset(L, LUA_REGISTRYINDEX);
+  }
+
+  int iscdata = lua_rawequal(L, -1, -2);
+  lua_pop(L, 2); /* cached mt + value's mt */
+  return iscdata;
+}
+
+/* Return the Torch type name of the value at `ud`, or NULL if it is
+ * not a registered Torch object.  Handles both userdata (via
+ * registry[metatable]) and FFI cdata (via luaT_cdataname). */
+const char* luaT_typename(lua_State *L, int ud)
+{
+  if(luaT_iscdata(L, ud))
+    return luaT_cdataname(L, ud, NULL);
+  else if(lua_getmetatable(L, ud))
+  {
+    const char *tname = NULL;
+    lua_rawget(L, LUA_REGISTRYINDEX); /* registry[metatable] -> name */
+    if(lua_isstring(L, -1))
+      tname = lua_tostring(L, -1);
+    lua_pop(L, 1); /* the string/nil */
+    return tname;
+  }
+  return NULL;
+}
+
+/* Box the C pointer `udata` as a full userdata of type `tname` and
+ * push it onto the stack; a NULL pointer is pushed as nil.  Raises a
+ * Lua error if `tname` has no registered metatable. */
+void luaT_pushudata(lua_State *L, void *udata, const char *tname)
+{
+  void **boxed;
+
+  if(!udata)
+  {
+    lua_pushnil(L);
+    return;
+  }
+
+  boxed = lua_newuserdata(L, sizeof(void*));
+  *boxed = udata;
+  if(!luaT_pushmetatable(L, tname))
+    luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+  lua_setmetatable(L, -2);
+}
+
+/* Return the boxed C pointer of the userdata at `ud` if its type is
+ * `tname` or a type derived from it (the metatable chain is walked
+ * upward); NULL otherwise. */
+void *luaT_toudata(lua_State *L, int ud, const char *tname)
+{
+  void **p = lua_touserdata(L, ud);
+  if(p != NULL) /* value is a userdata? */
+  {
+    if(!luaT_pushmetatable(L, tname))
+      luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+
+    /* initialize the table we want to get the metatable on */
+    /* note that we have to be careful with indices, as we just inserted stuff */
+    lua_pushvalue(L, (ud < 0 ? ud - 1 : ud));
+    while(lua_getmetatable(L, -1)) /* get the next metatable */
+    {
+      lua_remove(L, -2); /* remove the previous metatable [or object, if first time] */
+      if(lua_rawequal(L, -1, -2))
+      {
+        lua_pop(L, 2);  /* remove the two metatables */
+        return *p;
+      }
+    }
+    lua_pop(L, 2); /* remove the two metatables */
+  }
+  return NULL;
+}
+
+/* Nonzero iff the value at `ud` is a userdata of type `tname` (or of
+ * a type that inherits from it). */
+int luaT_isudata(lua_State *L, int ud, const char *tname)
+{
+  return luaT_toudata(L, ud, tname) != NULL;
+}
+
+void *luaT_checkudata(lua_State *L, int ud, const char *tname)
+{
+  void *p = luaT_toudata(L, ud, tname);
+  if(!p)
+    luaT_typerror(L, ud, tname);
+  return p;
+}
+
+/* Push a C long onto the stack.  On Lua >= 5.3 it is pushed as an
+ * integer when lua_Integer can hold any long (or when lua_Number
+ * would be no more precise anyway); otherwise as a lua_Number. */
+void luaT_pushlong(lua_State *L, long n)
+{
+#if LUA_VERSION_NUM >= 503
+  /* Only push the value as an integer if it fits in lua_Integer,
+   or if the lua_Number representation will be even worse */
+  if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) == sizeof(lua_Integer)) {
+    lua_pushinteger(L, n);
+  } else {
+    lua_pushnumber(L, (lua_Number)n);
+  }
+#else
+  lua_pushnumber(L, (lua_Number)n);
+#endif
+}
+
+/* Check-and-convert the value at `idx` to a C long, mirroring
+ * luaT_pushlong's integer/number representation choice.  Raises a Lua
+ * error on non-numeric values. */
+long luaT_checklong(lua_State *L, int idx)
+{
+#if LUA_VERSION_NUM >= 503
+  if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) == sizeof(lua_Integer)) {
+    return (long)luaL_checkinteger(L, idx);
+  } else {
+    return (long)luaL_checknumber(L, idx);
+  }
+#else
+  return (long)luaL_checknumber(L, idx);
+#endif
+}
+
+/* Convert the value at `idx` to a C long without raising (returns 0
+ * for non-numbers, like lua_tonumber).  Mirrors luaT_pushlong's
+ * integer/number choice.  Fixed to test LUA_VERSION_NUM >= 503 like
+ * its siblings: the previous `== 503` silently took the lossy
+ * lua_Number path on Lua 5.4+. */
+long luaT_tolong(lua_State *L, int idx)
+{
+#if LUA_VERSION_NUM >= 503
+  if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) == sizeof(lua_Integer)) {
+    return (long)lua_tointeger(L, idx);
+  } else {
+    return (long)lua_tonumber(L, idx);
+  }
+#else
+  return (long)lua_tonumber(L, idx);
+#endif
+}
+
+/* Fetch field `field` of the object at `ud` and return its boxed
+ * pointer, checking that it is a userdata of type `tname`; raises if
+ * the field is missing or mistyped.  The field value stays on the
+ * stack. */
+void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname)
+{
+  void *p;
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  p = luaT_toudata(L, -1, tname);
+  if(!p)
+    luaL_error(L, "bad argument #%d (field %s is not a %s)", ud, field, tname);
+  return p;
+}
+
+/* Fetch field `field` of the object at `ud` as a light userdata
+ * pointer; raises if missing or not a light userdata.  The field
+ * value stays on the stack. */
+void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field)
+{
+  void *p;
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+
+  if(!lua_islightuserdata(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a light userdata)", ud, field);
+
+  p = lua_touserdata(L, -1);
+
+  return p;
+}
+
+/* Fetch field `field` of the object at `ud` as a double; raises if
+ * missing or not a number.  The field value stays on the stack. */
+double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field)
+{
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  if(!lua_isnumber(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+  return lua_tonumber(L, -1);
+}
+
+/* Fetch field `field` of the object at `ud` as an int (truncating the
+ * Lua number); raises if missing or not a number.  The field value
+ * stays on the stack. */
+int luaT_getfieldcheckint(lua_State *L, int ud, const char *field)
+{
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  if(!lua_isnumber(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+  return (int)lua_tonumber(L, -1);
+}
+
+/* Fetch field `field` of the object at `ud` as a string; raises if
+ * missing or not a string.  The field value stays on the stack (which
+ * also keeps the returned string alive). */
+const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field)
+{
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  if(!lua_isstring(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a string)", ud, field);
+  return lua_tostring(L, -1);
+}
+
+/* Fetch field `field` of the object at `ud` as a 0/1 boolean; raises
+ * if missing or not a boolean.  The field value stays on the stack. */
+int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field)
+{
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  if(!lua_isboolean(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a boolean)", ud, field);
+  return lua_toboolean(L, -1);
+}
+
+/* Fetch field `field` of the object at `ud`, check it is a table, and
+ * leave it on the stack; raises if missing or not a table. */
+void luaT_getfieldchecktable(lua_State *L, int ud, const char *field)
+{
+  lua_getfield(L, ud, field);
+  if(lua_isnil(L, -1))
+    luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+  if(!lua_istable(L, -1))
+    luaL_error(L, "bad argument #%d (field %s is not a table)", ud, field);
+}
+
+/**** type checks as in luaL ****/
+/* Raise a luaL_argerror reporting expected type `tname` versus the
+ * actual type of the value at stack index `ud`.  Never returns. */
+int luaT_typerror(lua_State *L, int ud, const char *tname)
+{
+  const char *msg;
+  const char *tnameud = luaT_typename(L, ud);
+
+  if(!tnameud)
+    /* not a Torch object: report the plain Lua type name.  Note
+     * lua_typename expects a type tag, not a stack index, so the
+     * index must go through lua_type first (the original passed `ud`
+     * directly, yielding a bogus name). */
+    tnameud = lua_typename(L, lua_type(L, ud));
+
+  msg = lua_pushfstring(L, "%s expected, got %s",
+                        tname,
+                        (tnameud ? tnameud : "unknown object"));
+
+  return luaL_argerror(L, ud, msg);
+}
+
+/* Check that the value at `ud` is a boolean and return it as 0/1;
+ * raises a type error otherwise. */
+int luaT_checkboolean(lua_State *L, int ud)
+{
+  if(lua_isboolean(L, ud))
+    return lua_toboolean(L, ud);
+  luaT_typerror(L, ud, lua_typename(L, LUA_TBOOLEAN)); /* no return */
+  return 0; /* unreachable; silences compiler warnings */
+}
+
+/* Like luaT_checkboolean, but returns `def` when the value at `ud`
+ * is absent or nil. */
+int luaT_optboolean(lua_State *L, int ud, int def)
+{
+  return lua_isnoneornil(L, ud) ? def : luaT_checkboolean(L, ud);
+}
+
+/* Assumes a table (typically a metatable) is on top of the stack.
+ * Ensures it has a subtable `name` (creating it if needed) and
+ * registers `methods` into that subtable.  The stack is restored on
+ * exit. */
+void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name)
+{
+  int idx = lua_gettop(L);
+
+  luaL_checktype(L, idx, LUA_TTABLE);
+  lua_pushstring(L, name);
+  lua_rawget(L, idx);
+
+  if(lua_isnil(L, -1))
+  {
+    /* subtable absent: create it, then fetch it back onto the stack */
+    lua_pop(L, 1);
+    lua_pushstring(L, name);
+    lua_newtable(L);
+    lua_rawset(L, idx);
+
+    lua_pushstring(L, name);
+    lua_rawget(L, idx);
+  }
+
+  luaT_setfuncs(L, methods, 0);
+  lua_pop(L, 1); /* the subtable */
+}
+
+
+/* Return the class name without its module prefix: for `A.b.c` this
+ * is `c`.  The result points into `tname` and must not be freed. */
+const char* luaT_classrootname(const char *tname)
+{
+  const char *last_dot = strrchr(tname, '.');
+  return last_dot ? last_dot + 1 : tname;
+}
+
+/* parent_name must be a buffer at least as big as tname.
+ * If the class has a parent, returns 1 and sets parent_name to the
+ * full parent hierarchy (e.g. for `A.b.c`, parent_name becomes
+ * `A.b`); otherwise returns 0 and sets parent_name to "".
+ * Rewritten with strrchr: the original scanned from tname[sz-1],
+ * which read out of bounds and wrote parent_name[-1] when tname was
+ * empty; its `tname[idx] == '\0'` loop test was dead code. */
+int luaT_fullparentname(const char *tname, char *parent_name)
+{
+  const char *last_dot = strrchr(tname, '.');
+  size_t len = last_dot ? (size_t)(last_dot - tname) : 0;
+
+  memcpy(parent_name, tname, len);
+  parent_name[len] = '\0';
+  return last_dot != NULL;
+}
+
+/* Alias for ensuring backwards compatibility;
+ * use of luaT_fullparentname is preferred.
+ */
+int luaT_classmodulename(const char *tname, char *parent_name)
+{
+  return luaT_fullparentname(tname, parent_name);
+}
+
+/* parent_name must be a buffer at least as big as tname.
+ * If the class has a parent, returns 1 and sets parent_name to the
+ * outermost module (e.g. for `A.b.c`, parent_name becomes `A`);
+ * with no parent, returns 0 and copies the whole name. */
+int luaT_outerparentname(const char *tname, char *parent_name)
+{
+  size_t prefix_len = strcspn(tname, ".");
+  memcpy(parent_name, tname, prefix_len);
+  parent_name[prefix_len] = '\0';
+  return tname[prefix_len] == '.';
+}
+
+/* parent_name must be a buffer at least as big as tname.
+ * If the class has a parent, returns 1 and sets parent_name to the
+ * innermost parent (e.g. for `A.b.c`, parent_name becomes `b`).
+ * Returns 0 (leaving parent_name untouched) when there is no parent.
+ * Fix: when tname contains no '.', the original scan left tail at -1,
+ * slipped past the `tail == 0` guard and returned 1 with an empty
+ * parent name (and formed the out-of-bounds pointer tname-1). */
+int luaT_innerparentname(const char *tname, char *parent_name)
+{
+  int sz = (int)strlen(tname);
+  int tail, head;
+
+  for(tail = sz-1; tail >= 0 ; tail--) // tail: index of the '.'
+    if(tname[tail] == '.') break;      // just past the inner parent
+
+  /* tail < 0: no '.' at all; tail == 0: leading '.' -- no parent */
+  if (tail <= 0) return 0;
+
+  for(head = tail-1; head >= 0; head--) // head: index of the '.'
+    if(tname[head] == '.') break;       // just before the inner parent
+
+  head += 1; // update head to start of inner parent name
+  tail -= head; // update tail to its length
+  memcpy(parent_name, tname+head, tail);
+  parent_name[tail] = '\0';
+  return 1;
+}
+
+/* Push a class's immediate parent module onto the stack: for
+ * `A.b.c`, walks global `A`, then `A.b`, and leaves that table on
+ * top.  Raises a Lua error when an intermediate name is not a table.
+ * NOTE(review): `term` is a fixed 256-byte buffer; assumes component
+ * names are shorter than that -- confirm against callers.
+ * Fix: both error paths wrote the NUL terminator one slot past the
+ * copied prefix (term[tname - tname_full]), leaving an uninitialized
+ * byte inside the reported name; it belongs at index
+ * tname - tname_full - 1. */
+void luaT_getinnerparent(lua_State *L, const char *tname)
+{
+  /* Local variables */
+  char term[256];
+  char chars[] = {'.', '\0'};
+  const char *tname_full = tname; // used for error case
+
+  /* Get outermost table from Lua */
+  int n = strcspn(tname, chars);
+  strncpy(term, tname, n);
+  term[n] = '\0';
+  lua_getglobal(L, term);
+  tname  += n + 1;
+
+  /* Traverse hierarchy down to last table*/
+  n = strcspn(tname, chars);
+  while(n < strlen(tname))
+  {
+    /* Check that current parent is a table (i.e. a module) */
+    if(!lua_istable(L, -1)){
+      strncpy(term, tname_full, tname - tname_full - 1);
+      term[tname - tname_full - 1] = '\0';
+      luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname_full, term);
+    }
+    strncpy(term, tname, n);
+    term[n] = '\0';
+    lua_getfield(L, -1, term);
+    lua_remove(L, -2); /* drop the parent; keep only the child table */
+    tname += n + 1;
+    n = strcspn(tname, chars); // prepare for next
+  }
+
+  /* Check that resulting parent is a table (i.e. a module) */
+  if(!lua_istable(L, -1)){
+    strncpy(term, tname_full, tname - tname_full - 1);
+    term[tname - tname_full - 1] = '\0';
+    luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname_full, term);
+  }
+}
+
+
+/* Lua binding behind luaT_newmetatable:
+ *   newmetatable(tname, parent_tname, constructor, destructor,
+ *                factory, module)
+ * Creates (or completes) the metatable for `tname`: registers it both
+ * ways in the registry, installs the default metamethods, an optional
+ * parent metatable, destructor/factory entries, and a constructor
+ * table; finally stores that constructor table as
+ * module[classrootname(tname)].  Returns the metatable. */
+int luaT_lua_newmetatable(lua_State *L)
+{
+  /* Local Variables */
+  const char* tname = luaL_checkstring(L, 1);
+  char parent_name[256];
+  int is_in_module = 0;
+
+  /* Argument Checking */
+  lua_settop(L, 6);
+  luaL_argcheck(L, lua_isnoneornil(L, 2) || lua_isstring(L, 2), 2, "parent class name or nil expected");
+  luaL_argcheck(L, lua_isnoneornil(L, 3) || lua_isfunction(L, 3), 3, "constructor function or nil expected");
+  luaL_argcheck(L, lua_isnoneornil(L, 4) || lua_isfunction(L, 4), 4, "destructor function or nil expected");
+  luaL_argcheck(L, lua_isnoneornil(L, 5) || lua_isfunction(L, 5), 5, "factory function or nil expected");
+  luaL_argcheck(L, lua_isnoneornil(L, 6) || lua_istable(L, 6), 6, "module table or nil expected");
+
+  /* Push immediate parent module to stack; after this, stack index 6
+     always holds the module table the class will be stored into */
+  if(lua_isnoneornil(L, 6)) {
+    lua_pop(L, 1); /* remove the nil */
+    is_in_module = luaT_fullparentname(tname, parent_name);
+    if (is_in_module)
+      luaT_getinnerparent(L, tname);
+    else
+      lua_pushglobaltable(L);
+  }
+
+  if(!lua_istable(L, -1))
+    luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname, parent_name);
+
+  /* we first create the new metaclass if we have to */
+  if(!luaT_pushmetatable(L, tname))
+  {
+    /* create the metatable */
+    lua_newtable(L);
+
+    /* registry[name] = metatable */
+    lua_pushvalue(L, -1);
+    lua_setfield(L, LUA_REGISTRYINDEX, tname);
+
+    /* registry[metatable] = tname */
+    lua_pushvalue(L, -1);
+    lua_pushstring(L, tname);
+    lua_rawset(L, LUA_REGISTRYINDEX);
+
+    /* __index handling */
+    lua_pushcfunction(L, luaT_mt__index);
+    lua_setfield(L, -2, "__index");
+
+    /* __newindex handling */
+    lua_pushcfunction(L, luaT_mt__newindex);
+    lua_setfield(L, -2, "__newindex");
+
+    /* __typename contains the typename */
+    lua_pushstring(L, tname);
+    lua_setfield(L, -2, "__typename");
+
+    /* __metatable is self */
+    lua_pushvalue(L, -1);
+    lua_setfield(L, -2, "__metatable");
+
+    /* by default, __version equals 1 */
+    lua_pushnumber(L, 1);
+    lua_setfield(L, -2, "__version");
+
+    /* assign default operator functions */
+    lua_pushcfunction(L, luaT_mt__tostring);
+    lua_setfield(L, -2, "__tostring");
+
+    lua_pushcfunction(L, luaT_mt__add);
+    lua_setfield(L, -2, "__add");
+
+    lua_pushcfunction(L, luaT_mt__sub);
+    lua_setfield(L, -2, "__sub");
+
+    lua_pushcfunction(L, luaT_mt__mul);
+    lua_setfield(L, -2, "__mul");
+
+    lua_pushcfunction(L, luaT_mt__div);
+    lua_setfield(L, -2, "__div");
+
+    lua_pushcfunction(L, luaT_mt__mod);
+    lua_setfield(L, -2, "__mod");
+
+    lua_pushcfunction(L, luaT_mt__pow);
+    lua_setfield(L, -2, "__pow");
+
+    lua_pushcfunction(L, luaT_mt__unm);
+    lua_setfield(L, -2, "__unm");
+
+    lua_pushcfunction(L, luaT_mt__concat);
+    lua_setfield(L, -2, "__concat");
+
+    lua_pushcfunction(L, luaT_mt__len);
+    lua_setfield(L, -2, "__len");
+
+    lua_pushcfunction(L, luaT_mt__eq);
+    lua_setfield(L, -2, "__eq");
+
+    lua_pushcfunction(L, luaT_mt__lt);
+    lua_setfield(L, -2, "__lt");
+
+    lua_pushcfunction(L, luaT_mt__le);
+    lua_setfield(L, -2, "__le");
+
+    lua_pushcfunction(L, luaT_mt__call);
+    lua_setfield(L, -2, "__call");
+  }
+
+  /* we assign the parent class if necessary */
+  if(!lua_isnoneornil(L, 2))
+  {
+    if(lua_getmetatable(L, -1))
+      luaL_error(L, "class %s has been already assigned a parent class\n", tname);
+    else
+    {
+      const char* parent_tname = luaL_checkstring(L, 2);
+      if(!luaT_pushmetatable(L, parent_tname))
+        luaL_error(L, "bad argument #2 (invalid parent class name %s)", parent_tname);
+      lua_setmetatable(L, -2);
+    }
+  }
+
+  /* register the destructor function  */
+  if(!lua_isnoneornil(L, 4))
+  {
+    /* does it exists already? */
+    lua_pushstring(L, "__gc");
+    lua_rawget(L, -2);
+
+    if(lua_isnil(L, -1))
+    {
+      lua_pop(L, 1); /* pop nil */
+      lua_pushstring(L, "__gc");
+      lua_pushvalue(L, 4);
+      lua_rawset(L, -3);
+    }
+    else
+      luaL_error(L, "%s has been already assigned a destructor", tname);
+  }
+
+  /* register the factory function  */
+  if(!lua_isnoneornil(L, 5))
+  {
+    /* does it exists already? */
+    lua_pushstring(L, "__factory");
+    lua_rawget(L, -2);
+
+    if(lua_isnil(L, -1))
+    {
+      lua_pop(L, 1); /* pop nil */
+      lua_pushstring(L, "__factory");
+      lua_pushvalue(L, 5);
+      lua_rawset(L, -3);
+    }
+    else
+      luaL_error(L, "%s has been already assigned a factory", tname);
+  }
+
+  /******** Constructor table and metatable ********/
+  lua_pushstring(L, "__constructor");
+  lua_rawget(L, -2);
+  if(lua_isnil(L, -1))
+  {
+    lua_pop(L, 1);                        /* pop nil */
+    lua_newtable(L);                      /* fancy table */
+    lua_newtable(L);                      /* fancy metatable */
+
+    lua_pushvalue(L, -3);                 /* metatable */
+    lua_setfield(L, -2, "__index");       /* so we can get the methods */
+
+    lua_pushcfunction(L, luaT_cmt__newindex);
+    lua_setfield(L, -2, "__newindex");    /* so we add new methods */
+
+    lua_pushcfunction(L, luaT_cmt__call);
+    lua_setfield(L, -2, "__call");        /* so we can create, we are here for only that */
+
+    lua_pushvalue(L, -3);
+    lua_setfield(L, -2, "__metatable");   /* redirect to metatable with methods */
+
+    lua_setmetatable(L, -2);              /* constructor metatable is ... this fancy metatable */
+
+    /* set metatable[__constructor] = constructor-metatable */
+    lua_pushstring(L, "__constructor");
+    lua_pushvalue(L, -2);
+    lua_rawset(L, -4);
+  }
+
+  /* register the constructor function  */
+  if(!lua_isnoneornil(L, 3))
+  {
+    /* get constructor metatable */
+    lua_getmetatable(L, -1);
+
+    /* does it exists already? */
+    lua_pushstring(L, "__new");
+    lua_rawget(L, -2);
+
+    if(lua_isnil(L, -1))
+    {
+      lua_pop(L, 1); /* pop nil */
+      lua_pushstring(L, "__new");
+      lua_pushvalue(L, 3);
+      lua_rawset(L, -3);
+
+      /* set "new" in the metatable too */
+      lua_pushstring(L, "new");
+      lua_pushvalue(L, 3);
+      lua_rawset(L, -5);
+    }
+    else
+      luaL_error(L, "%s has been already assigned a constructor", tname);
+
+    /* pop constructor metatable */
+    lua_pop(L, 1);
+  }
+
+  /* module.name = constructor metatable */
+  lua_setfield(L, 6, luaT_classrootname(tname));
+
+  return 1; /* returns the metatable */
+}
+
+/* Lua only utility functions */
+
+/* Lua binding: torch.metatype(name, table [, ctype]).  Registers a
+ * custom type: records `table` as the metatable for `name` in both
+ * registry directions, and optionally associates an FFI ctype with
+ * the name. */
+int luaT_lua_metatype(lua_State *L)
+{
+  if( (lua_gettop(L) != 2) && (lua_gettop(L) != 3) )
+    luaL_error(L, "expecting: string table [ctype]");
+
+  luaL_checkstring(L, 1);
+  luaL_checktype(L, 2, LUA_TTABLE);
+
+  if(lua_gettop(L) == 3)
+  {
+    /* map the ctype id of arg 3 to the given name */
+    if(!luaT_cdataname(L, 3, lua_tostring(L, 1)))
+      luaL_error(L, "could not register cdata type -- missing ffi library?");
+  }
+
+  /* registry[name] = metatable */
+  lua_pushvalue(L, 1);
+  lua_pushvalue(L, 2);
+  lua_rawset(L, LUA_REGISTRYINDEX);
+
+  /* registry[metatable] = tname */
+  lua_pushvalue(L, 2);
+  lua_pushvalue(L, 1);
+  lua_rawset(L, LUA_REGISTRYINDEX);
+
+  return 0;
+}
+
+/* Lua binding: torch.pushudata(ptr, tname).  Wraps a raw pointer --
+ * given as a number, LuaJIT cdata or luaffi cdata -- as a Torch
+ * userdata of type `tname`.  Inherently unsafe: the caller must know
+ * the pointer really has that type. */
+int luaT_lua_pushudata(lua_State *L)
+{
+  void *udata = NULL;
+  const char *tname = luaL_checkstring(L, 2);
+
+  if(lua_type(L, 1) == 10) /* LuaJIT cdata type tag */
+    udata = *((void**)lua_topointer(L, 1));
+  else if(luaT_iscdata(L, 1))
+    /* NOTE(review): relies on luaffi's internal cdata layout (pointer
+     * at slot 4) -- verify against the luaffi version in use */
+    udata = ((void**)lua_topointer(L, 1))[4];
+  else if(lua_isnumber(L, 1))
+    udata = (void*)(uintptr_t)lua_tonumber(L, 1);
+  else
+    luaL_argerror(L, 1, "expecting number or cdata");
+
+  luaT_pushudata(L, udata, tname);
+
+  return 1;
+}
+
+/* Lua binding: torch.factory(tname) -> the type's __factory function,
+ * or nil for unknown types.  (The extra lua_isnil test is redundant:
+ * luaT_pushmetatable already returns 0 for a missing metatable.) */
+int luaT_lua_factory(lua_State *L)
+{
+  const char* tname = luaL_checkstring(L, 1);
+  if(luaT_pushmetatable(L, tname) && !lua_isnil(L, -1))
+  {
+    lua_pushstring(L, "__factory");
+    lua_rawget(L, -2);
+  }
+  else
+  {
+    lua_pushnil(L);
+  }
+  return 1;
+}
+
+/* Lua binding: torch.getconstructortable(tname) -> the type's
+ * constructor table, or nothing when the type is unknown. */
+int luaT_lua_getconstructortable(lua_State *L)
+{
+  const char* tname = luaL_checkstring(L, 1);
+  if(luaT_pushmetatable(L, tname))
+  {
+    lua_pushstring(L, "__constructor");
+    lua_rawget(L, -2);
+    return 1;
+  }
+  return 0;
+}
+
+
+/* Lua binding: torch.typename(obj) -> the Torch type name string, or
+ * nothing when the value is not a Torch object. */
+int luaT_lua_typename(lua_State *L)
+{
+  const char *tname;
+
+  luaL_checkany(L, 1);
+  tname = luaT_typename(L, 1);
+  if(tname == NULL)
+    return 0;
+
+  lua_pushstring(L, tname);
+  return 1;
+}
+
+/* Lua binding: torch.isequal(a, b) -> boolean.  Two Torch userdata
+ * are equal when they box the same C pointer; two tables are equal
+ * when they are the same table; any other combination is false. */
+int luaT_lua_isequal(lua_State *L)
+{
+  if(lua_isuserdata(L, 1) && lua_isuserdata(L, 2))
+  {
+    void **p1, **p2;
+    luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+    luaL_argcheck(L, luaT_typename(L, 2), 2, "Torch object expected");
+
+    p1 = lua_touserdata(L, 1);
+    p2 = lua_touserdata(L, 2);
+    lua_pushboolean(L, *p1 == *p2);
+  }
+  else if(lua_istable(L, 1) && lua_istable(L, 2))
+    lua_pushboolean(L, lua_rawequal(L, 1, 2));
+  else
+    lua_pushboolean(L, 0);
+  return 1;
+}
+
+/* Push a pointer value as a Lua number.  Raises above 2^53, beyond
+ * which a double (the assumed lua_Number) can no longer represent
+ * every integer exactly. */
+static void luaT_pushpointer(lua_State *L, const void *ptr)
+{
+  // 2^53 - this assumes that lua_Number is a double
+  if ((uintptr_t)ptr > 9007199254740992LLU)
+    luaL_error(L, "Pointer value can't be represented as a Lua number (an overflow would occur)");
+  lua_pushnumber(L, (uintptr_t)(ptr));
+}
+
+/* Lua binding: torch.pointer(obj) -> the underlying C pointer as a
+ * number.  Accepts LuaJIT cdata, luaffi cdata, Torch userdata,
+ * tables, threads, functions and strings; errors on anything else. */
+int luaT_lua_pointer(lua_State *L)
+{
+  if(lua_type(L, 1) == 10) /* luajit cdata */
+  {
+    /* we want the pointer holded by cdata */
+    /* not the pointer on the cdata object */
+    const void* ptr = *((void**)lua_topointer(L, 1));
+    luaT_pushpointer(L, ptr);
+    return 1;
+  }
+  else if (luaT_iscdata(L, 1)) /* luaffi cdata */
+  {
+    /* NOTE(review): relies on luaffi's internal cdata layout (pointer
+     * at slot 4) -- verify against the luaffi version in use */
+    void** ptr = (void**)lua_touserdata(L, 1);
+    luaT_pushpointer(L, ptr[4]);
+    return 1;
+  }
+  else if(lua_isuserdata(L, 1))
+  {
+    void **ptr;
+    luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+    ptr = lua_touserdata(L, 1);
+    luaT_pushpointer(L, *ptr);
+    return 1;
+  }
+  else if(lua_istable(L, 1) || lua_isthread(L, 1) || lua_isfunction(L, 1))
+  {
+    const void* ptr = lua_topointer(L, 1);
+    luaT_pushpointer(L, ptr);
+    return 1;
+  }
+  else if(lua_isstring(L, 1))
+  {
+    const char* ptr = lua_tostring(L, 1);
+    luaT_pushpointer(L, ptr);
+    return 1;
+  }
+  else
+    luaL_error(L, "Torch object, table, thread, cdata or function expected");
+
+  return 0;
+}
+
+/* Lua binding: torch.setenv(obj, table).  Installs `table` as the
+ * uservalue/environment of the function or userdata at argument 1.
+ * (luaL_typerror and lua_setuservalue on 5.1 presumably come from
+ * compatibility shims in luaT.h -- not visible here.) */
+int luaT_lua_setenv(lua_State *L)
+{
+  if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+    luaL_typerror(L, 1, "function or userdata");
+  luaL_checktype(L, 2, LUA_TTABLE);
+  lua_setuservalue(L, 1);
+  return 0;
+}
+
+/* Lua binding: torch.getenv(obj) -> the uservalue/environment of the
+ * function or userdata at argument 1, or a fresh empty table when it
+ * has none. */
+int luaT_lua_getenv(lua_State *L)
+{
+  if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+    luaL_typerror(L, 1, "function or userdata");
+  lua_getuservalue(L, 1);
+  if (lua_isnil(L, -1))
+    lua_newtable(L); /* returned instead of nil */
+  return 1;
+}
+
+/* Lua binding: torch.getmetatable(tname) -> the registered metatable,
+ * or nothing for an unknown type name. */
+int luaT_lua_getmetatable(lua_State *L)
+{
+  const char *tname = luaL_checkstring(L, 1);
+  return luaT_pushmetatable(L, tname) ? 1 : 0;
+}
+
+/* Lua binding: torch.version(obj) -> the __version field of the
+ * object's metatable.  Works for registered cdata types and for any
+ * value carrying a metatable; returns nothing otherwise. */
+int luaT_lua_version(lua_State *L)
+{
+  luaL_checkany(L, 1);
+
+  if(luaT_iscdata(L, 1))
+  {
+    const char *tname = luaT_cdataname(L, 1, NULL);
+    if(tname)
+    {
+      luaT_pushmetatable(L, tname);
+      lua_pushstring(L, "__version");
+      lua_rawget(L, -2);
+      return 1;
+    }
+    return 0;
+  }
+  else if(lua_getmetatable(L, 1))
+  {
+    lua_pushstring(L, "__version");
+    lua_rawget(L, -2);
+    return 1;
+  }
+  return 0;
+}
+
+/* Lua binding: torch.setmetatable(table, tname).  Installs the
+ * registered metatable `tname` on `table`; raises for unknown type
+ * names. */
+int luaT_lua_setmetatable(lua_State *L)
+{
+  const char *tname = luaL_checkstring(L, 2);
+  luaL_checktype(L, 1, LUA_TTABLE);
+
+  if(!luaT_pushmetatable(L, tname))
+    luaL_error(L, "unknown typename %s\n", tname);
+  lua_setmetatable(L, 1);
+
+  return 1;
+}
+
+/* metatable operator methods */
+/* Default __index metamethod for Torch classes: first gives a
+ * user-supplied __index__(obj, key) hook a chance (its last result is
+ * a truthy flag when it handled the key), then falls back to a plain
+ * lookup in the metatable. */
+static int luaT_mt__index(lua_State *L)
+{
+  if(!lua_getmetatable(L, 1))
+    luaL_error(L, "critical internal indexing error: no metatable found");
+
+  if(!lua_istable(L, -1))
+    luaL_error(L, "critical internal indexing error: not a metatable");
+
+  /* test for __index__ method first */
+  lua_getfield(L, -1, "__index__");
+  if(!lua_isnil(L, -1))
+  {
+    int result;
+
+    if(!lua_isfunction(L, -1))
+      luaL_error(L, "critical internal indexing error: __index__ is not a function");
+
+    lua_pushvalue(L, 1);
+    lua_pushvalue(L, 2);
+
+    lua_call(L, 2, LUA_MULTRET); /* DEBUG: risky: must really return 1 or 2 values... */
+
+    result = lua_toboolean(L, -1);
+    lua_pop(L, 1);
+
+    if(result)
+      return 1;
+
+    /* on the stack: 1. the object 2. the value 3. the metatable */
+    /* apparently, __index wants only one element returned */
+    /* return lua_gettop(L)-3; */
+
+  }
+  else
+    lua_pop(L, 1); /* remove nil __index__ on the stack */
+
+  /* fallback: plain lookup of the key in the metatable */
+  lua_pushvalue(L, 2);
+  lua_gettable(L, -2);
+
+  return 1;
+}
+
+/* Default __newindex metamethod for Torch classes: first gives a
+ * user-supplied __newindex__(obj, key, value) hook a chance (it
+ * returns true when it stored the value), then falls back to a raw
+ * set when the object is a table; userdata cannot be indexed. */
+static int luaT_mt__newindex(lua_State *L)
+{
+  if(!lua_getmetatable(L, 1))
+    luaL_error(L, "critical internal indexing error: no metatable found");
+
+  if(!lua_istable(L, -1))
+    luaL_error(L, "critical internal indexing error: not a metatable");
+
+  /* test for __newindex__ method first */
+  lua_getfield(L, -1, "__newindex__");
+  if(!lua_isnil(L, -1))
+  {
+    int result;
+
+    if(!lua_isfunction(L, -1))
+      luaL_error(L, "critical internal indexing error: __newindex__ is not a function");
+
+    lua_pushvalue(L, 1);
+    lua_pushvalue(L, 2);
+    lua_pushvalue(L, 3);
+
+    lua_call(L, 3, 1); /* DEBUG: risky: must really return something */
+
+    result = lua_toboolean(L, -1);
+    lua_pop(L, 1);
+
+    if(result)
+      return 0;
+  }
+  else
+    lua_pop(L, 1); /* remove nil __newindex__ on the stack */
+
+  lua_pop(L, 1);    /* pop the metatable */
+  if(lua_istable(L, 1))
+    lua_rawset(L, 1);
+  else
+    luaL_error(L, "the class %s cannot be indexed", luaT_typename(L, 1));
+
+  return 0;
+}
+
+
+/* Push the metatable of operand 1 for a unary operator; raise an error
+   if the value has none. */
+#define MT_UNI_OPERATOR_GET_HANDLER(NAME)                               \
+    if(!lua_getmetatable(L, 1))                                         \
+      luaL_error(L, "internal error in __" #NAME ": no metatable");
+
+/* Push a metatable for a binary operator: operand 1 first, and -- via
+   the short-circuit -- operand 2 only when operand 1 has none. */
+#define MT_BIN_OPERATOR_GET_HANDLER(NAME)                               \
+    if(!lua_getmetatable(L, 1) && !lua_getmetatable(L,2) )              \
+      luaL_error(L, "internal error in __" #NAME                        \
+              ": no metatable in both operands");
+
+/* Shared operator body: look up the user handler __NAME__ in the pushed
+   metatable; run NIL_BEHAVIOR when it is absent, otherwise call it with
+   the original arguments and forward all of its results.
+   NOTE(review): when __NAME__ exists but is not a function, the body
+   falls through to `return 0` -- the inline comment about returning
+   "the thing the user left" does not match the code. */
+#define MT_DECLARE_OPERATOR_BODY(NAME, NIL_BEHAVIOR)                    \
+                                                                        \
+    lua_getfield(L, -1, "__" #NAME "__");                               \
+    if(lua_isnil(L, -1))                                                \
+    {                                                                   \
+      NIL_BEHAVIOR;                                                     \
+    }                                                                   \
+    else                                                                \
+    {                                                                   \
+      if(lua_isfunction(L, -1))                                         \
+      {                                                                 \
+        lua_insert(L, 1); /* insert function */                         \
+        lua_pop(L, 1); /* remove metatable */                           \
+        lua_call(L, lua_gettop(L)-1, LUA_MULTRET);                      \
+          /* we return the result of the call */                        \
+        return lua_gettop(L);                                           \
+      }                                                                 \
+      /* we return the thing the user left in __tostring__ */           \
+    }                                                                   \
+    return 0;                                                           \
+
+/* note: the lookup for this happens in the metatable, hence necessary */
+#define MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR)                         \
+  int luaT_mt__##NAME(lua_State *L)                                     \
+  {                                                                     \
+    MT_UNI_OPERATOR_GET_HANDLER(NAME)                                   \
+    MT_DECLARE_OPERATOR_BODY(NAME,NIL_BEHAVIOR)                         \
+  }
+
+#define MT_DECLARE_BIN_OPERATOR(NAME, NIL_BEHAVIOR)                     \
+  int luaT_mt__##NAME(lua_State *L)                                     \
+  {                                                                     \
+    MT_BIN_OPERATOR_GET_HANDLER(NAME)                                   \
+    MT_DECLARE_OPERATOR_BODY(NAME,NIL_BEHAVIOR)                         \
+  }
+
+
+/* NIL_BEHAVIOR for binary operators: neither operand implements the
+   operator, so raise a descriptive error naming both types. */
+#define BIN_OPERATOR_ERROR(NAME)                                        \
+    luaL_error(L, "both %s and %s have no " #NAME " operator",          \
+            luaT_typename(L, 1), luaT_typename(L,2))
+
+/* Instantiate the metamethods.  Binary operators raise a descriptive
+   error when neither operand provides a __<op>__ handler; __eq falls
+   back to luaT_lua_isequal, and __tostring falls back to pushing the
+   type name. */
+MT_DECLARE_BIN_OPERATOR(add,    BIN_OPERATOR_ERROR(addition) )
+MT_DECLARE_BIN_OPERATOR(sub,    BIN_OPERATOR_ERROR(subtraction) ) /* fixed: was misspelled "substraction" */
+MT_DECLARE_BIN_OPERATOR(mul,    BIN_OPERATOR_ERROR(multiplication) )
+MT_DECLARE_BIN_OPERATOR(div,    BIN_OPERATOR_ERROR(division) )
+MT_DECLARE_BIN_OPERATOR(mod,    BIN_OPERATOR_ERROR(modulo) )
+MT_DECLARE_BIN_OPERATOR(pow,    BIN_OPERATOR_ERROR(power) )
+MT_DECLARE_BIN_OPERATOR(concat, BIN_OPERATOR_ERROR(concat) )
+MT_DECLARE_BIN_OPERATOR(eq,
+                    lua_settop(L, 2);
+                    lua_pushcfunction(L, luaT_lua_isequal);
+                    lua_insert(L, 1);
+                    lua_call(L, 2, 1);
+                    return 1;)
+MT_DECLARE_BIN_OPERATOR(lt, BIN_OPERATOR_ERROR(less-than) )
+MT_DECLARE_BIN_OPERATOR(le, BIN_OPERATOR_ERROR(less-equal) )
+
+MT_DECLARE_OPERATOR(tostring,
+                    lua_pushstring(L, luaT_typename(L, 1));
+                    return 1;)
+MT_DECLARE_OPERATOR(call, luaL_error(L, "%s has no call operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(unm, luaL_error(L, "%s has no negation operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(len, luaL_error(L, "%s has no length operator", luaT_typename(L, 1)))
+
+
+/* constructor metatable methods */
+/* __call metamethod of a constructor table: fetch the __new function
+   stored in the constructor's metatable and invoke it, forwarding all
+   call arguments (minus the constructor table itself) and returning all
+   of its results. */
+int luaT_cmt__call(lua_State *L)
+{
+  if(!lua_istable(L, 1))
+    luaL_error(L, "internal error in __call: not a constructor table");
+
+  if(!lua_getmetatable(L, 1))
+    luaL_error(L, "internal error in __call: no metatable available");
+
+  lua_pushstring(L, "__new");
+  lua_rawget(L, -2);
+
+  if(lua_isnil(L, -1))
+    luaL_error(L, "no constructor available");
+
+  lua_remove(L, 1); /* remove constructor table */
+  lua_insert(L, 1); /* insert constructor */
+  lua_pop(L, 1);    /* remove fancy metatable */
+
+  lua_call(L, lua_gettop(L)-1, LUA_MULTRET);
+  return lua_gettop(L);
+}
+
+/* __newindex metamethod of a constructor table: assignments to the
+   constructor are redirected (via rawset) into the metaclass table
+   stored under the "__metatable" key of the constructor's metatable. */
+int luaT_cmt__newindex(lua_State *L)
+{
+  if(!lua_istable(L, 1))
+    luaL_error(L, "internal error in __newindex: not a constructor table");
+
+  if(!lua_getmetatable(L, 1))
+    luaL_error(L, "internal error in __newindex: no metatable available");
+
+  lua_pushstring(L, "__metatable");
+  lua_rawget(L, -2);
+
+  if(!lua_istable(L, -1))
+    luaL_error(L, "internal error in __newindex: no metaclass available");
+
+  lua_insert(L, 2);
+  lua_pop(L, 1); /* remove the metatable over the constructor table */
+
+  lua_rawset(L, -3);
+
+  return 0;
+}
+
+/******************** deprecated functions ********************/
+/* luaT was refactored from opaque "id" handles to type-name strings;
+   the shims below keep the old entry points alive by forwarding to the
+   new string-based API. */
+int luaT_pushmetaclass(lua_State *L, const char *tname)
+{
+  return luaT_pushmetatable(L, tname);
+}
+
+const char* luaT_id(lua_State *L, int ud)
+{
+  return luaT_typename(L, ud);
+}
+
+/* ids and typenames are now the same strings, so this is the identity. */
+const char* luaT_id2typename(lua_State *L, const char *id)
+{
+  return id;
+}
+
+const char* luaT_typename2id(lua_State *L, const char *tname)
+{
+  return luaT_typenameid(L, tname);
+}
+
+int luaT_getmetaclass(lua_State *L, int index)
+{
+  return lua_getmetatable(L, index);
+}
+
+/* like luaT_typename2id, but raises a Lua error for unknown classes */
+const char* luaT_checktypename2id(lua_State *L, const char *tname)
+{
+  const char* id = luaT_typenameid(L, tname);
+  if(!id)
+    luaL_error(L, "unknown class <%s>", tname);
+  return id;
+}
+
+void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id)
+{
+  luaT_registeratname(L, methods, id);
+}
+
+/**************************************************************/
diff --git a/lib/luaT/luaT.h b/lib/luaT/luaT.h
new file mode 100644
index 0000000..b1b6cd9
--- /dev/null
+++ b/lib/luaT/luaT.h
@@ -0,0 +1,132 @@
+#ifndef LUAT_UTILS_INC
+#define LUAT_UTILS_INC
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <lua.h>
+#include <lauxlib.h>
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef LUA_EXTERNC
+# ifdef __cplusplus
+#  define LUA_EXTERNC extern "C"
+# else
+#  define LUA_EXTERNC extern
+# endif
+#endif
+
+#if (defined(_MSC_VER) || defined(__MINGW32__))
+# define DLL_EXPORT __declspec(dllexport)
+# define DLL_IMPORT __declspec(dllimport)
+# ifdef luaT_EXPORTS
+#  define LUAT_API LUA_EXTERNC DLL_EXPORT
+# else
+#  define LUAT_API LUA_EXTERNC DLL_IMPORT
+# endif
+#else
+# define DLL_EXPORT
+# define DLL_IMPORT
+# define LUAT_API LUA_EXTERNC
+#endif
+
+/* Lua version compatibility shims. */
+#if LUA_VERSION_NUM == 501
+/* Lua 5.1: emulate 5.2 APIs via the globals index and environments. */
+# define lua_pushglobaltable(L) lua_pushvalue(L, LUA_GLOBALSINDEX)
+# define lua_setuservalue lua_setfenv
+# define lua_getuservalue lua_getfenv
+#else
+/* Lua 5.2+: map APIs removed since 5.1 onto their replacements. */
+# define lua_objlen lua_rawlen
+/* luaL_typerror was removed in Lua 5.2; provide a local replacement.
+   NOTE(review): a non-inline `static` function defined in a header can
+   trigger unused-function warnings in every translation unit that
+   includes it without calling it -- `static inline` would avoid that,
+   but confirm support on the oldest targeted MSVC C compiler first. */
+static int luaL_typerror(lua_State *L, int narg, const char *tname)
+{
+  return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, narg));
+}
+#endif
+
+
+/* C functions */
+
+LUAT_API void* luaT_alloc(lua_State *L, long size);
+LUAT_API void* luaT_realloc(lua_State *L, void *ptr, long size);
+LUAT_API void luaT_free(lua_State *L, void *ptr);
+
+LUAT_API void luaT_setfuncs(lua_State *L, const luaL_Reg *l, int nup);
+
+LUAT_API const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parent_tname,
+                                            lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx);
+
+LUAT_API const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname,
+                                       lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory);
+
+LUAT_API int luaT_pushmetatable(lua_State *L, const char *tname);
+
+LUAT_API const char* luaT_typenameid(lua_State *L, const char *tname);
+LUAT_API const char* luaT_typename(lua_State *L, int ud);
+
+LUAT_API void luaT_pushudata(lua_State *L, void *udata, const char *tname);
+LUAT_API void *luaT_toudata(lua_State *L, int ud, const char *tname);
+LUAT_API int luaT_isudata(lua_State *L, int ud, const char *tname);
+LUAT_API void *luaT_checkudata(lua_State *L, int ud, const char *tname);
+
+LUAT_API void luaT_pushlong(lua_State *L, long n);
+LUAT_API long luaT_checklong(lua_State *L, int idx);
+LUAT_API long luaT_tolong(lua_State *L, int idx);
+
+LUAT_API void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname);
+LUAT_API void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field);
+LUAT_API double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckint(lua_State *L, int ud, const char *field);
+LUAT_API const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field);
+LUAT_API void luaT_getfieldchecktable(lua_State *L, int ud, const char *field);
+
+LUAT_API int luaT_typerror(lua_State *L, int ud, const char *tname);
+
+LUAT_API int luaT_checkboolean(lua_State *L, int ud);
+LUAT_API int luaT_optboolean(lua_State *L, int ud, int def);
+
+LUAT_API void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name);
+
+/* utility functions */
+LUAT_API const char *luaT_classrootname(const char *tname);
+LUAT_API int luaT_classmodulename(const char *tname, char *module_name);
+
+/* debug */
+LUAT_API void luaT_stackdump(lua_State *L);
+
+/* Lua functions */
+LUAT_API int luaT_lua_newmetatable(lua_State *L);
+LUAT_API int luaT_lua_factory(lua_State *L);
+LUAT_API int luaT_lua_getconstructortable(lua_State *L);
+LUAT_API int luaT_lua_typename(lua_State *L);
+LUAT_API int luaT_lua_isequal(lua_State *L);
+LUAT_API int luaT_lua_pointer(lua_State *L);
+LUAT_API int luaT_lua_setenv(lua_State *L);
+LUAT_API int luaT_lua_getenv(lua_State *L);
+LUAT_API int luaT_lua_getmetatable(lua_State *L);
+LUAT_API int luaT_lua_version(lua_State *L);
+LUAT_API int luaT_lua_setmetatable(lua_State *L);
+LUAT_API int luaT_lua_metatype(lua_State *L);
+LUAT_API int luaT_lua_pushudata(lua_State *L);
+
+/* deprecated functions */
+/* ids have been replaced by string names to identify classes */
+/* comments show what function (that you should use) they call now */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define LUAT_DEPRECATED  __attribute__((__deprecated__))
+#elif (defined(_MSC_VER) || defined(__MINGW32__))
+#define LUAT_DEPRECATED __declspec(deprecated)
+#else
+#define LUAT_DEPRECATED
+#endif
+
+LUAT_API LUAT_DEPRECATED int luaT_pushmetaclass(lua_State *L, const char *tname); /* same as luaT_pushmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_id(lua_State *L, int ud); /* same as luaT_typename */
+LUAT_API LUAT_DEPRECATED const char* luaT_id2typename(lua_State *L, const char *id); /*  same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED const char* luaT_typename2id(lua_State *L, const char*); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED int luaT_getmetaclass(lua_State *L, int index); /* same as luaT_getmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_checktypename2id(lua_State *L, const char *tname);  /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id); /* same as luaT_registeratname */
+
+#endif
diff --git a/lib/luaT/luaTConfig.cmake.in b/lib/luaT/luaTConfig.cmake.in
new file mode 100644
index 0000000..bfb20b8
--- /dev/null
+++ b/lib/luaT/luaTConfig.cmake.in
@@ -0,0 +1,9 @@
+# Find the luaT includes and library
+#
+# LUAT_INCLUDE_DIR -- where to find the includes
+# LUAT_LIBRARIES -- list of libraries to link against
+# LUAT_FOUND -- set to 1 if found
+
+SET(LUAT_FOUND 1)
+SET(LUAT_INCLUDE_DIR "@LUAT_INCLUDE_DIR@")
+SET(LUAT_LIBRARIES "@LUAT_LIBRARIES@")
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..39a34d7
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,21 @@
+site_name: torch7
+theme : simplex
+repo_url : https://github.com/torch/torch7
+use_directory_urls : false
+markdown_extensions: [extra]
+docs_dir : doc
+pages:
+- [index.md, Home]
+- [tensor.md, Tensor Library, Tensor]
+- [maths.md, Tensor Library, Tensor Math]
+- [storage.md, Tensor Library, Storage]
+- [file.md, File I/O Library, File Interface]
+- [diskfile.md, File I/O Library, Disk File]
+- [memoryfile.md, File I/O Library, Memory File]
+- [pipefile.md, File I/O Library, Pipe File]
+- [serialization.md, File I/O Library, Serialization]
+- [utility.md, Useful Utilities, Class]
+- [timer.md, Useful Utilities, Timer]
+- [tester.md, Useful Utilities, Tester]
+- [cmdline.md, Useful Utilities, CmdLine]
+- [random.md, Useful Utilities, Random]
diff --git a/random.lua b/random.lua
new file mode 100644
index 0000000..a6f0c3d
--- /dev/null
+++ b/random.lua
@@ -0,0 +1,53 @@
+-- Code generator for torch's random-number bindings: uses cwrap to emit
+-- C wrappers around the THRandom/THByteTensor RNG functions and writes
+-- the generated C source to the file named by arg[1].
+local wrap = require 'cwrap'
+
+require 'torchcwrap'
+
+local interface = wrap.CInterface.new()
+
+-- Preamble of the generated C file.
+interface:print(
+   [[
+#include "luaT.h"
+#include "TH.h"
+
+extern void torch_Generator_init(lua_State *L);
+extern void torch_Generator_new(lua_State *L);
+   ]])
+
+-- seed()/initialSeed(): optional Generator argument, return a long.
+for _,name in ipairs({"seed", "initialSeed"}) do
+   interface:wrap(name,
+                  string.format("THRandom_%s",name),
+                  {{name='Generator', default=true},
+                   {name="long", creturned=true}})
+end
+
+-- manualSeed(seed): optional Generator, mandatory long seed.
+interface:wrap('manualSeed',
+               'THRandom_manualSeed',
+               {{name='Generator', default=true},
+                {name="long"}})
+
+-- get/setRNGState: serialize RNG state into/out of a ByteTensor.
+interface:wrap('getRNGState',
+                'THByteTensor_getRNGState',
+                {{name='Generator', default=true},
+                 {name='ByteTensor',default=true,returned=true,method={default='nil'}}
+                 })
+
+interface:wrap('setRNGState',
+                'THByteTensor_setRNGState',
+                {{name='Generator', default=true},
+                 {name='ByteTensor',default=true,returned=true,method={default='nil'}}
+                 })
+
+interface:register("random__")
+                
+-- Module init for the generated code: installs the default generator as
+-- torch._gen and registers the wrapped functions.
+interface:print(
+   [[
+void torch_random_init(lua_State *L)
+{
+  torch_Generator_init(L);
+  torch_Generator_new(L);
+  lua_setfield(L, -2, "_gen");
+  luaT_setfuncs(L, random__, 0);
+}
+]])
+
+interface:tofile(arg[1])
diff --git a/rocks/torch-scm-1.rockspec b/rocks/torch-scm-1.rockspec
new file mode 100644
index 0000000..2228726
--- /dev/null
+++ b/rocks/torch-scm-1.rockspec
@@ -0,0 +1,36 @@
+package = "torch"
+version = "scm-1"
+
+source = {
+   url = "git://github.com/torch/torch7.git",
+}
+
+description = {
+   summary = "Torch7",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/torch/torch7",
+   license = "BSD"
+}
+
+dependencies = {
+   "lua >= 5.1",
+   "paths >= 1.0",
+   "cwrap >= 1.0"
+}
+
+build = {
+   type = "command",
+   build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DLUA=$(LUA) -DLUALIB=$(LUALIB) -DLUA_BINDIR="$(LUA_BINDIR)" -DLUA_INCDIR="$(LUA_INCDIR)" -DLUA_LIBDIR="$(LUA_LIBDIR)" -DLUADIR="$(LUADIR)" -DLIBDIR="$(LIBDIR)" -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN)
+]],
+	 platforms = {
+      windows = {
+           build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DLUA=$(LUA) -DLUALIB=$(LUALIB) -DLUA_BINDIR="$(LUA_BINDIR)" -DLUA_INCDIR="$(LUA_INCDIR)" -DLUA_LIBDIR="$(LUA_LIBDIR)" -DLUADIR="$(LUADIR)" -DLIBDIR="$(LIBDIR)" -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
+]]
+      }
+   },
+   install_command = "cd build && $(MAKE) install"
+}
+
diff --git a/test/longSize.lua b/test/longSize.lua
new file mode 100644
index 0000000..82eef04
--- /dev/null
+++ b/test/longSize.lua
@@ -0,0 +1,42 @@
+-- Serialization round-trip checks for File:longSize(): a tensor written
+-- with a given on-disk long width must read back identically.
+-- NOTE(review): results are printed, not asserted -- this script is a
+-- manual check rather than an automated test.
+
+-- DiskFile, 8-byte longs
+tensor = torch.rand(2,3)
+f = torch.DiskFile('tensor8.bin','w')
+f:binary()
+f:longSize(8)
+f:writeObject(tensor)
+f:close()
+f = torch.DiskFile('tensor8.bin','r')
+f:binary()
+f:longSize(8)
+tensor2 = f:readObject()
+f:close()
+print('Tensors are same: ',tensor:norm()==tensor2:norm())
+
+-- DiskFile, 4-byte longs
+f = torch.DiskFile('tensor4.bin','w')
+f:binary()
+f:longSize(4)
+f:writeObject(tensor)
+f:close()
+f = torch.DiskFile('tensor4.bin','r')
+f:binary()
+f:longSize(4)
+tensor2 = f:readObject()
+f:close()
+print('Tensors are same: ',tensor:norm()==tensor2:norm())
+
+-- MemoryFile, 8-byte longs
+f = torch.MemoryFile()
+f:binary()
+f:longSize(8)
+f:writeObject(tensor)
+f:seek(1)
+tensor2 = f:readObject()
+f:close()
+print('Tensors are same: ',tensor:norm()==tensor2:norm())
+
+-- MemoryFile, 4-byte longs
+f = torch.MemoryFile()
+f:binary()
+f:longSize(4)
+f:writeObject(tensor)
+f:seek(1)
+tensor2 = f:readObject()
+f:close()
+print('Tensors are same: ',tensor:norm()==tensor2:norm())
diff --git a/test/test.lua b/test/test.lua
new file mode 100644
index 0000000..20ca035
--- /dev/null
+++ b/test/test.lua
@@ -0,0 +1,3425 @@
+--require 'torch'
+
+local mytester                       -- torch.Tester instance; assigned outside this chunk
+local torchtest = torch.TestSuite()  -- suite collecting the test functions below
+local msize = 100                    -- default matrix size used by several tests
+local precision                      -- numeric tolerance; assigned outside this chunk
+
+-- Lua 5.2 compatibility
+local loadstring = loadstring or load
+local unpack = unpack or table.unpack
+
+-- Maximum absolute elementwise difference between two tensors.
+-- Integer tensor types are copied into a default Tensor first, since
+-- abs() is only meaningful on floating-point storage.
+local function maxdiff(x,y)
+   local diff = x-y
+   local t = x:type()
+   if t == 'torch.DoubleTensor' or t == 'torch.FloatTensor' then
+      return diff:abs():max()
+   end
+   local dcopy = torch.Tensor():resize(diff:size()):copy(diff)
+   return dcopy:abs():max()
+end
+
+-- Compare torch.dot against a scalar-loop reference, per float type.
+function torchtest.dot()
+   local tolerances = {
+      ['torch.DoubleTensor'] = 1e-8, -- for ddot
+      ['torch.FloatTensor']  = 1e-4, -- for sdot
+   }
+   for tname, prec in pairs(tolerances) do
+      local a = torch.randn(100):type(tname)
+      local b = torch.randn(100):type(tname)
+
+      local fast = torch.dot(a,b)
+
+      -- reference: explicit elementwise accumulate
+      local slow = 0
+      for k = 1,a:size(1) do
+         slow = slow + a[k] * b[k]
+      end
+
+      mytester:assertlt(math.abs(fast-slow), prec,
+                        'error in torch.dot (' .. tname .. ')')
+   end
+end
+
+-- Template chunk for elementwise unary ops: every occurrence of
+-- 'functionname' is substituted (torch.X and math.X) and the resulting
+-- string is compiled with loadstring.  The chunk returns the maximum
+-- absolute error over a contiguous row and a non-contiguous column.
+-- (The chunk body is a string literal, so it must stay byte-identical.)
+local genericSingleOpTest = [[
+   -- [res] torch.functionname([res,] x)
+   -- contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.functionname(m1[{ 4,{} }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.functionname(m1[4][i])
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerrc = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerrc then
+         maxerrc = err[i]
+      end
+   end
+
+   -- non-contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.functionname(m1[{ {}, 4 }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.functionname(m1[i][4])
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerrnc = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerrnc then
+         maxerrnc = err[i]
+      end
+   end
+   return maxerrc, maxerrnc
+]]
+
+-- The tests below instantiate genericSingleOpTest for individual unary
+-- math functions.  Assertion messages name the actual function under
+-- test (the originals all said "torch.functionname", which made failure
+-- reports ambiguous).
+function torchtest.sin()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'sin'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.sin - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.sin - non-contiguous')
+end
+
+function torchtest.sinh()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'sinh'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.sinh - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.sinh - non-contiguous')
+end
+
+function torchtest.asin()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'asin'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.asin - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.asin - non-contiguous')
+end
+
+function torchtest.cos()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'cos'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.cos - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.cos - non-contiguous')
+end
+
+function torchtest.cosh()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'cosh'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.cosh - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.cosh - non-contiguous')
+end
+
+function torchtest.acos()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'acos'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.acos - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.acos - non-contiguous')
+end
+
+function torchtest.tan()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'tan'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.tan - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.tan - non-contiguous')
+end
+
+function torchtest.tanh()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'tanh'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.tanh - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.tanh - non-contiguous')
+end
+
+function torchtest.atan()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'atan'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.atan - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.atan - non-contiguous')
+end
+
+function torchtest.log()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'log'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.log - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.log - non-contiguous')
+end
+
+function torchtest.sqrt()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'sqrt'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.sqrt - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.sqrt - non-contiguous')
+end
+
+function torchtest.rsqrt()
+   -- Reference implementation: reciprocal square root.
+   local function TH_rsqrt(x)
+      return 1 / math.sqrt(x)
+   end
+
+   -- The template's math.rsqrt does not exist, so redirect it to the
+   -- local TH_rsqrt via a custom environment.
+   local f
+   local t = genericSingleOpTest:gsub('functionname', 'rsqrt'):gsub('math.rsqrt', 'TH_rsqrt')
+   local env = { TH_rsqrt=TH_rsqrt, torch=torch, math=math }
+   if not setfenv then -- Lua 5.2+: load with an explicit environment
+      f = load(t, 'test', 't', env)
+   else
+      f = loadstring(t)
+      setfenv(f, env)
+   end
+
+   local maxerrc, maxerrnc = f()
+   -- messages name the actual op (originals said "torch.functionname")
+   mytester:assertlt(maxerrc, precision, 'error in torch.rsqrt - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.rsqrt - non-contiguous')
+end
+
+function torchtest.sigmoid()
+   -- can't use genericSingleOpTest, since `math.sigmoid` doesn't exist; have to use
+   -- `torch.sigmoid` instead
+   local inputValues = {-1000,-1,0,0.5,1,2,1000}
+   local expectedOutput = {0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000}
+
+   -- expected values above are rounded to 4 decimal places
+   local precision_4dps = 0.0002
+
+   -- float
+   local inputFT = torch.FloatTensor(inputValues)
+   local expectedFT = torch.FloatTensor(expectedOutput)
+   mytester:assertlt((torch.sigmoid(inputFT) - expectedFT):abs():max(), precision_4dps, 'error in torch.sigmoid - single')
+   -- the out-of-place call must leave its input unmodified
+   mytester:assertlt((inputFT - torch.FloatTensor(inputValues)):abs():max(), precision_4dps, 'error in torch.sigmoid - single')
+   local sigmoidFT = torch.FloatTensor(inputValues):sigmoid()
+   mytester:assertlt((sigmoidFT - expectedFT):abs():max(), precision_4dps, 'error in torch.sigmoid - single')
+
+   -- double
+   local inputDT = torch.DoubleTensor(inputValues)
+   local expectedDT = torch.DoubleTensor(expectedOutput)
+   mytester:assertlt((torch.sigmoid(inputDT) - expectedDT):abs():max(), precision_4dps, 'error in torch.sigmoid - double')
+   -- the out-of-place call must leave its input unmodified
+   mytester:assertlt((inputDT - torch.DoubleTensor(inputValues)):abs():max(), precision_4dps, 'error in torch.sigmoid - double')
+   local sigmoidDT = torch.DoubleTensor(inputValues):sigmoid()
+   mytester:assertlt((sigmoidDT - expectedDT):abs():max(), precision_4dps, 'error in torch.sigmoid - double')
+end
+
+-- exp/floor/ceil via the generic template; assertion messages name the
+-- actual function (originals said "torch.functionname").
+function torchtest.exp()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'exp'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.exp - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.exp - non-contiguous')
+end
+
+function torchtest.floor()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'floor'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.floor - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.floor - non-contiguous')
+end
+
+function torchtest.ceil()
+   local f = loadstring(string.gsub(genericSingleOpTest, 'functionname', 'ceil'))
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.ceil - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.ceil - non-contiguous')
+end
+
+function torchtest.frac()
+   -- Reference implementation: fractional part via fmod.
+   local function TH_frac(x)
+      return math.fmod(x, 1)
+   end
+
+   -- math.frac does not exist; redirect the template's call to TH_frac
+   -- via a custom environment.
+   local f
+   local t = genericSingleOpTest:gsub('functionname', 'frac'):gsub('math.frac', 'TH_frac')
+   local env = { TH_frac=TH_frac, torch=torch, math=math }
+   if not setfenv then -- Lua 5.2+: load with an explicit environment
+      f = load(t, 'test', 't', env)
+   else
+      f = loadstring(t)
+      setfenv(f, env)
+   end
+
+   local maxerrc, maxerrnc = f()
+   -- messages name the actual op (originals said "torch.functionname")
+   mytester:assertlt(maxerrc, precision, 'error in torch.frac - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.frac - non-contiguous')
+end
+
+function torchtest.trunc()
+   -- Reference implementation: truncate toward zero.
+   local function TH_trunc(x)
+      return x - math.fmod(x, 1)
+   end
+
+   local f
+   local t = genericSingleOpTest:gsub('functionname', 'trunc'):gsub('math.trunc', 'TH_trunc')
+   local env = { TH_trunc=TH_trunc, torch=torch, math=math }
+   if not setfenv then -- Lua 5.2+: load with an explicit environment
+      f = load(t, 'test', 't', env)
+   else
+      f = loadstring(t)
+      setfenv(f, env)
+   end
+
+   local maxerrc, maxerrnc = f()
+   mytester:assertlt(maxerrc, precision, 'error in torch.trunc - contiguous')
+   mytester:assertlt(maxerrnc, precision, 'error in torch.trunc - non-contiguous')
+end
+
+function torchtest.round()
+   -- [res] torch.round([res,] x)
+   -- reference implementation: round-half-up via floor(x + 0.5)
+   -- contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.round(m1[{ 4,{} }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.floor(m1[4][i]+0.5)
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerrc = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerrc then
+         maxerrc = err[i]
+      end
+   end
+   mytester:assertlt(maxerrc, precision, 'error in torch.round - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.round(m1[{ {}, 4 }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.floor(m1[i][4]+0.5)
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerrnc = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerrnc then
+         maxerrnc = err[i]
+      end
+   end
+   mytester:assertlt(maxerrnc, precision, 'error in torch.round - non-contiguous')
+end
+
+function torchtest.max()  -- torch.max([resval, resind,] x [,dim])
+   -- torch.max( x ): whole-tensor max, contiguous input
+   local m1 = torch.randn(100,100)
+   local res1 = torch.max(m1)
+   local res2 = m1[1][1]
+   for i = 1,m1:size(1) do
+      for j = 1,m1:size(2) do
+         if m1[i][j] > res2 then
+            res2 = m1[i][j]
+         end
+      end
+   end
+   local err = res1 - res2
+   mytester:assertlt(err, precision, 'error in torch.max - contiguous')
+   -- non-contiguous
+   local m1 = torch.randn(10,10,10)
+   local m2 = m1[{{}, 4, {}}]
+   local res1 = torch.max(m2)
+   local res2 = m2[1][1]
+   for i = 1,m2:size(1) do
+      for j = 1,m2:size(2) do
+         if m2[i][j] > res2 then
+            res2 = m2[i][j]
+         end
+      end
+   end
+   local err = res1 - res2
+   mytester:assertlt(err, precision, 'error in torch.max - non-contiguous')
+   -- torch.max(x, dim): values and indices along dimension 2
+   local m1 = torch.randn(100,100)
+   local res1val, res1ind = torch.max(m1, 2)
+   local res2val = res1val:clone():zero()
+   local res2ind = res1ind:clone():zero()
+   for i=1, m1:size(1) do
+      res2val[i] = m1[i][1]
+      res2ind[i] = 1
+      for j=1, m1:size(2) do
+         if m1[i][j] > res2val[i][1] then
+            res2val[i] = m1[i][j]
+            res2ind[i] = j
+         end
+      end
+   end
+   local errval = res1val:clone():zero()
+   for i = 1, res1val:size(1) do
+      errval[i] = math.abs(res1val[i][1] - res2val[i][1])
+      mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.max - non-contiguous')
+   end
+   local maxerr = 0
+   for i = 1, errval:size(1) do
+      if errval[i][1] > maxerr then
+         maxerr = errval[i][1] -- fixed: was errval[i], a 1-element tensor
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.max - non-contiguous')
+   -- NaNs must win and report their index; iterate the VALUES {1,5,100}
+   -- (the original used `pairs`, which iterated the keys 1,2,3)
+   for _, index in ipairs{1, 5, 100} do
+      local m1 = torch.randn(100)
+      m1[index] = 0/0
+      local res1val, res1ind = torch.max(m1, 1)
+      mytester:assert(res1val[1] ~= res1val[1], 'error in torch.max (value) - NaNs')
+      mytester:assert(res1ind[1] == index, 'error in torch.max (index) - NaNs')
+      local res1val = torch.max(m1)
+      mytester:assert(res1val ~= res1val, 'error in torch.max - NaNs')
+   end
+end
+
+function torchtest.min()  -- torch.min([resval, resind,] x [,dim])
+   -- torch.min( x ): whole-tensor min, contiguous input
+   local m1 = torch.randn(100,100)
+   local res1 = torch.min(m1)
+   local res2 = m1[1][1]
+   for i = 1,m1:size(1) do
+      for j = 1,m1:size(2) do
+         if m1[i][j] < res2 then
+            res2 = m1[i][j]
+         end
+      end
+   end
+   local err = res1 - res2
+   mytester:assertlt(err, precision, 'error in torch.min - contiguous')
+   -- non-contiguous
+   local m1 = torch.randn(10,10,10)
+   local m2 = m1[{{}, 4, {}}]
+   local res1 = torch.min(m2)
+   local res2 = m2[1][1]
+   for i = 1,m2:size(1) do
+      for j = 1,m2:size(2) do
+         if m2[i][j] < res2 then
+            res2 = m2[i][j]
+         end
+      end
+   end
+   local err = res1 - res2
+   mytester:assertlt(err, precision, 'error in torch.min - non-contiguous')
+   -- torch.min(x, dim): values and indices along dimension 2
+   local m1 = torch.randn(100,100)
+   local res1val, res1ind = torch.min(m1, 2)
+   local res2val = res1val:clone():zero()
+   local res2ind = res1ind:clone():zero()
+   for i=1, m1:size(1) do
+      res2val[i] = m1[i][1]
+      res2ind[i] = 1
+      for j=1, m1:size(2) do
+         if m1[i][j] < res2val[i][1] then
+            res2val[i] = m1[i][j]
+            res2ind[i] = j
+         end
+      end
+   end
+   local errval = res1val:clone():zero()
+   for i = 1, res1val:size(1) do
+      errval[i] = math.abs(res1val[i][1] - res2val[i][1])
+      mytester:asserteq(res1ind[i][1], res2ind[i][1], 'error in torch.min - non-contiguous')
+   end
+   -- fixed: the original accumulated with `<` against 0, so the bound
+   -- never updated and the assertion below was vacuous; we want the
+   -- MAXIMUM absolute error, as in torchtest.max
+   local maxerr = 0
+   for i = 1, errval:size(1) do
+      if errval[i][1] > maxerr then
+         maxerr = errval[i][1]
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.min - non-contiguous')
+   -- NaNs must win and report their index; iterate the VALUES {1,5,100}
+   -- (the original used `pairs`, which iterated the keys 1,2,3)
+   for _, index in ipairs{1, 5, 100} do
+      local m1 = torch.randn(100)
+      m1[index] = 0/0
+      local res1val, res1ind = torch.min(m1, 1)
+      mytester:assert(res1val[1] ~= res1val[1], 'error in torch.min (value) - NaNs')
+      mytester:assert(res1ind[1] == index, 'error in torch.min (index) - NaNs')
+      local res1val = torch.min(m1)
+      mytester:assert(res1val ~= res1val, 'error in torch.min - NaNs')
+   end
+end
+
+function torchtest.cmax()
+  -- Element-wise max of two tensors, checked against math.max per element.
+  local t1 = torch.rand(msize, msize)
+  local t2 = torch.rand(msize, msize)
+  local actual = torch.cmax(t1, t2)
+  local reference = torch.zeros(msize, msize)
+  reference:map2(t1, t2, function(_, x, y) return math.max(x, y) end)
+  mytester:assertTensorEq(reference, actual, 0,
+                          'error in torch.cmax(tensor, tensor)')
+
+  -- Element-wise max of a tensor against a scalar.
+  local s = torch.uniform()
+  actual = torch.cmax(t1, s)
+  reference:map(t1, function(_, x) return math.max(x, s) end)
+  mytester:assertTensorEq(reference, actual, 0,
+                          'error in torch.cmax(tensor, scalar).')
+end
+
+function torchtest.cmin()
+  -- Element-wise min of two tensors, checked against math.min per element.
+  local t1 = torch.rand(msize, msize)
+  local t2 = torch.rand(msize, msize)
+  local actual = torch.cmin(t1, t2)
+  local reference = torch.zeros(msize, msize)
+  reference:map2(t1, t2, function(_, x, y) return math.min(x, y) end)
+  mytester:assertTensorEq(reference, actual, 0,
+                          'error in torch.cmin(tensor, tensor)')
+
+  -- Element-wise min of a tensor against a scalar.
+  local s = torch.uniform()
+  actual = torch.cmin(t1, s)
+  reference:map(t1, function(_, x) return math.min(x, s) end)
+  mytester:assertTensorEq(reference, actual, 0,
+                          'error in torch.cmin(tensor, scalar).')
+end
+
+function torchtest.lerp()
+   -- Reference linear interpolation, mirroring the TH implementation.
+   local function expected_lerp(x, y, t)
+      return x + t * (y - x)
+   end
+
+   -- tensor-tensor-weight form
+   local t1 = torch.rand(msize, msize)
+   local t2 = torch.rand(msize, msize)
+   local weight = math.random()
+   local actual = torch.lerp(t1, t2, weight)
+   local wanted = t1:new()
+   wanted:map2(t1, t2, function(_, x, y) return expected_lerp(x, y, weight) end)
+   mytester:assertTensorEq(wanted, actual, precision, 'error in torch.lerp(tensor, tensor, weight)')
+
+   -- scalar-scalar-weight form, over a wide magnitude range
+   local s1 = (math.random()*2-1) * 100000
+   local s2 = (math.random()*2-1) * 100000
+   local w = math.random()
+   mytester:assertalmosteq(expected_lerp(s1, s2, w), torch.lerp(s1, s2, w), precision, 'error in torch.lerp(scalar, scalar, weight)')
+end
+
+for caseIdx, shape in ipairs{{10}, {5, 5}} do
+   -- Register one all()/any() test per tensor shape (1D and 2D).
+   torchtest['allAndAny' .. caseIdx] = function ()
+      -- all ones: both predicates hold
+      local x = torch.ones(unpack(shape)):byte()
+      mytester:assert(x:all(), 'error in all()')
+      mytester:assert(x:any(), 'error in any()')
+
+      -- a single zero breaks all() but not any()
+      x[3] = 0
+      mytester:assert(not x:all(), 'error in all()')
+      mytester:assert(x:any(), 'error in any()')
+
+      -- all zeros: neither predicate holds
+      x:zero()
+      mytester:assert(not x:all(), 'error in all()')
+      mytester:assert(not x:any(), 'error in any()')
+
+      -- any non-zero value counts as true, not just 1
+      x:fill(2)
+      mytester:assert(x:all(), 'error in all()')
+      mytester:assert(x:any(), 'error in any()')
+   end
+end
+
+function torchtest.mv()
+   -- Compare torch.mv against an explicit row-by-row dot product.
+   local mat = torch.randn(100,100)
+   local vec = torch.randn(100)
+
+   local actual = torch.mv(mat,vec)
+
+   local expected = actual:clone():zero()
+   for row = 1,mat:size(1) do
+      for col = 1,mat:size(2) do
+         expected[row] = expected[row] + mat[row][col] * vec[col]
+      end
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.mv')
+end
+
+function torchtest.add()
+   -- [res] torch.add([res,] tensor1, tensor2)
+   -- contiguous: add a row slice (stride 1) to a vector
+   local m1 = torch.randn(100,100)
+   local v1 = torch.randn(100)
+
+   local res1 = torch.add(m1[{ 4,{} }],v1)
+
+   -- reference: element-by-element sum over the same row
+   local res2 = res1:clone():zero()
+   for i = 1,m1:size(2) do
+      res2[i] = m1[4][i] + v1[i]
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.add - contiguous')
+
+   -- non contiguous: a column slice has stride > 1
+   local m1 = torch.randn(100,100)
+   local v1 = torch.randn(100)
+
+   local res1 = torch.add(m1[{ {},4 }],v1)
+
+   local res2 = res1:clone():zero()
+   for i = 1,m1:size(1) do
+      res2[i] = m1[i][4] + v1[i]
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.add - non contiguous')
+
+   -- [res] torch.add([res,] tensor, value)
+   -- in-place scalar add on a contiguous row slice
+   local m1 = torch.randn(10,10)
+   local res1 = m1:clone()
+   res1[{ 3,{} }]:add(2)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ 3,i }] = res2[{ 3,i }] + 2
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.add - scalar, contiguous')
+
+   -- in-place scalar add on a non-contiguous column slice
+   local m1 = torch.randn(10,10)
+   local res1 = m1:clone()
+   res1[{ {},3 }]:add(2)
+
+   local res2 = m1:clone()
+   for i = 1,m1:size(1) do
+      res2[{ i,3 }] = res2[{ i,3 }] + 2
+   end
+
+   local err = (res1-res2):abs():max()
+
+   mytester:assertlt(err, precision, 'error in torch.add - scalar, non contiguous')
+
+   -- [res] torch.add([res,] tensor1, value, tensor2)
+   -- NOTE(review): this overload is not exercised here; consider a direct case.
+end
+
+function torchtest.csub()
+   -- Save the RNG state so this test does not perturb later tests.
+   local rngState = torch.getRNGState()
+   torch.manualSeed(123)
+
+   local x = torch.randn(100,90)
+   local y = x:clone():normal()
+
+   -- x:csub(y) must equal x + (-1) * y
+   local viaAdd = torch.add(x, -1, y)
+   local viaCsub = x:clone()
+   viaCsub:csub(y)
+
+   mytester:assertlt((viaAdd - viaCsub):abs():max(), 0.00001)
+
+   local _ = torch.setRNGState(rngState)
+end
+
+function torchtest.csub_scalar()
+   -- Save the RNG state so this test does not perturb later tests.
+   local rngState = torch.getRNGState()
+   torch.manualSeed(123)
+
+   local x = torch.randn(100,100)
+
+   -- x:csub(s) must equal x + (-s)
+   local scalar = 123.5
+   local viaAdd = torch.add(x, -scalar)
+   local viaCsub = x:clone()
+   viaCsub:csub(scalar)
+
+   mytester:assertlt((viaAdd - viaCsub):abs():max(), 0.00001)
+
+   local _ = torch.setRNGState(rngState)
+end
+
+function torchtest.neg()
+   -- Save the RNG state so this test does not perturb later tests.
+   local rngState = torch.getRNGState()
+   torch.manualSeed(123)
+
+   local x = torch.randn(100,90)
+   local zeros = torch.Tensor():resizeAs(x):zero()
+
+   -- x:neg() must equal 0 + (-1) * x
+   local viaAdd = torch.add(zeros, -1, x)
+   local viaNeg = x:clone()
+   viaNeg:neg()
+
+   mytester:assertlt((viaAdd - viaNeg):abs():max(), 0.00001)
+
+   local _ = torch.setRNGState(rngState)
+end
+
+function torchtest.cinv()
+   -- x:cinv() must equal torch.pow(x, -1) element-wise.
+   local rngState = torch.getRNGState()
+   torch.manualSeed(123)
+
+   local a = torch.randn(100,89)
+   -- (removed a dead 'zeros' allocation copied over from torchtest.neg;
+   -- it was never used by this test)
+
+   local res_pow = torch.pow(a, -1)
+   local res_inv = a:clone()
+   res_inv:cinv()
+
+   mytester:assertlt((res_pow - res_inv):abs():max(), 0.00001)
+
+   local _ = torch.setRNGState(rngState)
+end
+
+function torchtest.mul()
+   -- In-place scalar multiply on a non-contiguous column slice.
+   local src = torch.randn(10,10)
+   local actual = src:clone()
+   actual[{ {},3 }]:mul(2)
+
+   local expected = src:clone()
+   for row = 1,src:size(1) do
+      expected[{ row,3 }] = expected[{ row,3 }] * 2
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.mul - scalar, non contiguous')
+end
+
+function torchtest.div()
+   -- In-place scalar division on a non-contiguous column slice.
+   local src = torch.randn(10,10)
+   local actual = src:clone()
+   actual[{ {},3 }]:div(2)
+
+   local expected = src:clone()
+   for row = 1,src:size(1) do
+      expected[{ row,3 }] = expected[{ row,3 }] / 2
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.div - scalar, non contiguous')
+end
+
+function torchtest.fmod()
+   -- In-place fmod by a scalar on a non-contiguous column slice, checked
+   -- against math.fmod (C fmod: result keeps the sign of the dividend).
+   local src = torch.Tensor(10,10):uniform(-10, 10)
+   local q = 2.1
+   local actual = src:clone()
+   actual[{ {},3 }]:fmod(q)
+
+   local expected = src:clone()
+   for row = 1,src:size(1) do
+      expected[{ row,3 }] = math.fmod(expected[{ row,3 }], q)
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.fmod - scalar, non contiguous')
+end
+
+function torchtest.remainder()
+   -- In-place remainder by a scalar on a non-contiguous column slice,
+   -- checked against Lua's % (result keeps the sign of the divisor).
+   local src = torch.Tensor(10, 10):uniform(-10, 10)
+   local q = 2.1
+   local actual = src:clone()
+   actual[{ {},3 }]:remainder(q)
+
+   local expected = src:clone()
+   for row = 1,src:size(1) do
+      expected[{ row,3 }] = expected[{ row,3 }] % q
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.remainder - scalar, non contiguous')
+end
+
+function torchtest.mm()
+   -- Validates torch.mm against a naive triple-loop multiply for contiguous,
+   -- transposed (non-contiguous) and expanded (zero-stride) operands.
+   -- helper function
+   local function matrixmultiply(mat1,mat2)
+      local n = mat1:size(1)
+      local m = mat1:size(2)
+      local p = mat2:size(2)
+      local res = torch.zeros(n,p)
+      for i = 1, n do
+         for j = 1, p do
+            local sum = 0
+            for k = 1, m do
+               sum = sum + mat1[i][k]*mat2[k][j]
+            end
+            res[i][j] = sum
+         end
+      end
+      return res
+   end
+
+   -- contiguous case
+   local n, m, p = 10, 10, 5
+   local mat1 = torch.randn(n,m)
+   local mat2 = torch.randn(m,p)
+   local res = torch.mm(mat1,mat2)
+
+   local res2 = matrixmultiply(mat1,mat2)
+   mytester:assertTensorEq(res,res2,precision,'error in torch.mm')
+
+   -- non contiguous case 1: right operand transposed
+   local n, m, p = 10, 10, 5
+   local mat1 = torch.randn(n,m)
+   local mat2 = torch.randn(p,m):t()
+   local res = torch.mm(mat1,mat2)
+
+   local res2 = matrixmultiply(mat1,mat2)
+   mytester:assertTensorEq(res,res2,precision,'error in torch.mm, non contiguous')
+
+   -- non contiguous case 2: left operand transposed
+   local n, m, p = 10, 10, 5
+   local mat1 = torch.randn(m,n):t()
+   local mat2 = torch.randn(m,p)
+   local res = torch.mm(mat1,mat2)
+
+   local res2 = matrixmultiply(mat1,mat2)
+   mytester:assertTensorEq(res,res2,precision,'error in torch.mm, non contiguous')
+
+   -- non contiguous case 3: both operands transposed
+   local n, m, p = 10, 10, 5
+   local mat1 = torch.randn(m,n):t()
+   local mat2 = torch.randn(p,m):t()
+   local res = torch.mm(mat1,mat2)
+
+   local res2 = matrixmultiply(mat1,mat2)
+   mytester:assertTensorEq(res,res2,precision,'error in torch.mm, non contiguous')
+
+   -- test with zero stride: a single column expanded to p columns
+   local n, m, p = 10, 10, 5
+   local mat1 = torch.randn(n,m)
+   local mat2 = torch.randn(m,1):expand(m,p)
+   local res = torch.mm(mat1,mat2)
+
+   local res2 = matrixmultiply(mat1,mat2)
+   mytester:assertTensorEq(res,res2,precision,'error in torch.mm, non contiguous, zero stride')
+
+end
+
+function torchtest.bmm()
+   -- Batched matrix product must match per-slice torch.mm results.
+   local nbatch = 10
+   local M, N, O = 23, 8, 12
+   local lhs = torch.randn(nbatch, M, N)
+   local rhs = torch.randn(nbatch, N, O)
+   local batched = torch.bmm(lhs, rhs)
+
+   for k = 1, nbatch do
+     mytester:assertTensorEq(torch.mm(lhs[k], rhs[k]), batched[k], precision, 'result matrix ' .. k .. ' wrong')
+   end
+end
+
+function torchtest.addbmm()
+   -- res = beta*res + alpha*sum_i(b1_i * b2_i): batched mm accumulated
+   -- into a single MxO matrix.
+   local num_batches = 10
+   local M, N, O = 12, 8, 5
+   local b1 = torch.randn(num_batches, M, N)
+   local b2 = torch.randn(num_batches, N, O)
+   local res = torch.bmm(b1, b2)
+   local res2 = torch.Tensor():resizeAs(res[1]):zero()
+
+   -- default beta=1, alpha=1
+   res2:addbmm(b1,b2)
+   mytester:assertTensorEq(res2, res:sum(1)[1], precision, 'addbmm result wrong')
+
+   -- explicit beta=1 accumulates on top of the previous call
+   res2:addbmm(1,b1,b2)
+   mytester:assertTensorEq(res2, res:sum(1)[1]*2, precision, 'addbmm result wrong')
+
+   -- beta=1, alpha=.5
+   res2:addbmm(1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res2, res:sum(1)[1]*2.5, precision, 'addbmm result wrong')
+
+   -- alpha=0 leaves the source matrix untouched
+   local res3 = torch.addbmm(1,res2,0,b1,b2)
+   mytester:assertTensorEq(res3, res2, precision, 'addbmm result wrong')
+
+   local res4 = torch.addbmm(1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res4, res:sum(1)[1]*3, precision, 'addbmm result wrong')
+
+   -- beta=0 discards the source matrix
+   local res5 = torch.addbmm(0,res2,1,b1,b2)
+   mytester:assertTensorEq(res5, res:sum(1)[1], precision, 'addbmm result wrong')
+
+   -- general beta/alpha combination.
+   -- (was 'res:sum(1)*.5', a 1xMxO tensor that only compared equal via
+   -- nElement-based broadcasting; index the batch dim away for a proper
+   -- MxO reference, consistent with the assertions above)
+   local res6 = torch.addbmm(.1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res6, res2*.1 + res:sum(1)[1]*.5, precision, 'addbmm result wrong')
+end
+
+function torchtest.baddbmm()
+   -- res_i = beta*res_i + alpha*(b1_i * b2_i): per-batch accumulated mm.
+   local num_batches = 10
+   local M, N, O = 12, 8, 5
+   local b1 = torch.randn(num_batches, M, N)
+   local b2 = torch.randn(num_batches, N, O)
+   local res = torch.bmm(b1, b2)
+   local res2 = torch.Tensor():resizeAs(res):zero()
+
+   -- default beta=1, alpha=1
+   res2:baddbmm(b1,b2)
+   mytester:assertTensorEq(res2, res, precision, 'baddbmm result wrong')
+
+   -- explicit beta=1 accumulates on top of the previous call
+   res2:baddbmm(1,b1,b2)
+   mytester:assertTensorEq(res2, res*2, precision, 'baddbmm result wrong')
+
+   -- beta=1, alpha=.5
+   res2:baddbmm(1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res2, res*2.5, precision, 'baddbmm result wrong')
+
+   -- alpha=0 leaves the source untouched
+   local res3 = torch.baddbmm(1,res2,0,b1,b2)
+   mytester:assertTensorEq(res3, res2, precision, 'baddbmm result wrong')
+
+   local res4 = torch.baddbmm(1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res4, res*3, precision, 'baddbmm result wrong')
+
+   -- beta=0 discards the source
+   local res5 = torch.baddbmm(0,res2,1,b1,b2)
+   mytester:assertTensorEq(res5, res, precision, 'baddbmm result wrong')
+
+   -- general beta/alpha combination
+   local res6 = torch.baddbmm(.1,res2,.5,b1,b2)
+   mytester:assertTensorEq(res6, res2*.1 + res*.5, precision, 'baddbmm result wrong')
+end
+
+function torchtest.clamp()
+   local vals = torch.rand(100):mul(5):add(-2.5)  -- uniform in [-2.5, 2.5]
+   -- force both bounds to appear, just in case we're extremely lucky:
+   local min_val = -1
+   local max_val = 1
+   vals[1] = min_val
+   vals[2] = max_val
+
+   local actual = vals:clone()
+   actual:clamp(min_val, max_val)
+
+   -- reference: clip each element by hand
+   local expected = vals:clone()
+   for k = 1,vals:size(1) do
+      if expected[k] > max_val then
+         expected[k] = max_val
+      elseif expected[k] < min_val then
+         expected[k] = min_val
+      end
+   end
+
+   mytester:assertlt((actual-expected):abs():max(), precision, 'error in torch.clamp - scalar, non contiguous')
+end
+
+function torchtest.pow() -- [res] torch.pow([res,] x)
+   -- Checks both call forms (tensor^scalar and scalar^tensor) on a
+   -- contiguous row slice and a non-contiguous column slice, comparing
+   -- against math.pow element by element.
+   -- base - tensor, exponent - number
+   -- contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.pow(m1[{ 4,{} }], 3)
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.pow(m1[4][i], 3)
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerr then
+         maxerr = err[i]
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.pow - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.pow(m1[{ {}, 4 }], 3)
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.pow(m1[i][4], 3)
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerr then
+         maxerr = err[i]
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.pow - non-contiguous')
+
+   -- base - number, exponent - tensor
+   -- contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.pow(3, m1[{ 4,{} }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.pow(3, m1[4][i])
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerr then
+         maxerr = err[i]
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.pow - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.randn(100,100)
+   local res1 = torch.pow(3, m1[{ {}, 4 }])
+   local res2 = res1:clone():zero()
+   for i = 1,res1:size(1) do
+      res2[i] = math.pow(3, m1[i][4])
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      err[i] = math.abs(res1[i] - res2[i])
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      if err[i] > maxerr then
+         maxerr = err[i]
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.pow - non-contiguous')
+end
+
+function torchtest.cdiv()  -- [res] torch.cdiv([res,] tensor1, tensor2)
+   -- Element-wise division of a 2D slice by a flat tensor with the same
+   -- number of elements, for contiguous and non-contiguous slices.
+   -- contiguous
+   local m1 = torch.randn(10, 10, 10)
+   local m2 = torch.randn(10, 10 * 10)
+   local sm1 = m1[{4, {}, {}}]
+   local sm2 = m2[{4, {}}]
+   local res1 = torch.cdiv(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         -- row-major linear index into the flat sm2: stride is the COLUMN
+         -- count (was sm1:size(1), correct only because the slice is square)
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] / sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cdiv - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.randn(10, 10, 10)
+   local m2 = torch.randn(10 * 10, 10 * 10)
+   local sm1 = m1[{{}, 4, {}}]
+   local sm2 = m2[{{}, 4}]
+   local res1 = torch.cdiv(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] / sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cdiv - non-contiguous')
+end
+
+function torchtest.cfmod()
+   -- Element-wise fmod of a 2D slice against a flat tensor with the same
+   -- number of elements, checked against math.fmod.
+   -- contiguous
+   local m1 = torch.Tensor(10, 10, 10):uniform(-10, 10)
+   local m2 = torch.Tensor(10, 10 * 10):uniform(-3, 3)
+   local sm1 = m1[{4, {}, {}}]
+   local sm2 = m2[{4, {}}]
+   local res1 = torch.cfmod(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         -- row-major linear index into the flat sm2: stride is the COLUMN
+         -- count (was sm1:size(1), correct only because the slice is square)
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = math.fmod(sm1[i][j], sm2[idx1d])
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cfmod - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.Tensor(10, 10, 10):uniform(-10, 10)
+   local m2 = torch.Tensor(10 * 10, 10 * 10):uniform(-3, 3)
+   local sm1 = m1[{{}, 4, {}}]
+   local sm2 = m2[{{}, 4}]
+   local res1 = torch.cfmod(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = math.fmod(sm1[i][j], sm2[idx1d])
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cfmod - non-contiguous')
+end
+
+function torchtest.cremainder()
+   -- Element-wise remainder of a 2D slice against a flat tensor with the
+   -- same number of elements, checked against Lua's % operator.
+   -- contiguous
+   local m1 = torch.Tensor(10, 10, 10):uniform(-10, 10)
+   local m2 = torch.Tensor(10, 10 * 10):uniform(-3, 3)
+   local sm1 = m1[{4, {}, {}}]
+   local sm2 = m2[{4, {}}]
+   local res1 = torch.cremainder(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         -- row-major linear index into the flat sm2: stride is the COLUMN
+         -- count (was sm1:size(1), correct only because the slice is square)
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] % sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cremainder - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.Tensor(10, 10, 10):uniform(-10, 10)
+   local m2 = torch.Tensor(10 * 10, 10 * 10):uniform(-3, 3)
+   local sm1 = m1[{{}, 4, {}}]
+   local sm2 = m2[{{}, 4}]
+   local res1 = torch.cremainder(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] % sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cremainder - non-contiguous')
+end
+
+function torchtest.cmul()  -- [res] torch.cmul([res,] tensor1, tensor2)
+   -- Element-wise multiplication of a 2D slice by a flat tensor with the
+   -- same number of elements, for contiguous and non-contiguous slices.
+   -- contiguous
+   local m1 = torch.randn(10, 10, 10)
+   local m2 = torch.randn(10, 10 * 10)
+   local sm1 = m1[{4, {}, {}}]
+   local sm2 = m2[{4, {}}]
+   local res1 = torch.cmul(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         -- row-major linear index into the flat sm2: stride is the COLUMN
+         -- count (was sm1:size(1), correct only because the slice is square)
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] * sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cmul - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.randn(10, 10, 10)
+   local m2 = torch.randn(10 * 10, 10 * 10)
+   local sm1 = m1[{{}, 4, {}}]
+   local sm2 = m2[{{}, 4}]
+   local res1 = torch.cmul(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = sm1[i][j] * sm2[idx1d]
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cmul - non-contiguous')
+end
+
+function torchtest.cpow()  -- [res] torch.cpow([res,] tensor1, tensor2)
+   -- Element-wise power of a 2D slice with exponents taken from a flat
+   -- tensor with the same number of elements. Uses torch.rand so bases
+   -- are positive and math.pow is well-defined.
+   -- contiguous
+   local m1 = torch.rand(10, 10, 10)
+   local m2 = torch.rand(10, 10 * 10)
+   local sm1 = m1[{4, {}, {}}]
+   local sm2 = m2[{4, {}}]
+   local res1 = torch.cpow(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         -- row-major linear index into the flat sm2: stride is the COLUMN
+         -- count (was sm1:size(1), correct only because the slice is square)
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = math.pow(sm1[i][j], sm2[idx1d])
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cpow - contiguous')
+
+   -- non-contiguous
+   local m1 = torch.rand(10, 10, 10)
+   local m2 = torch.rand(10 * 10, 10 * 10)
+   local sm1 = m1[{{}, 4, {}}]
+   local sm2 = m2[{{}, 4}]
+   local res1 = torch.cpow(sm1, sm2)
+   local res2 = res1:clone():zero()
+   for i = 1,sm1:size(1) do
+      for j = 1, sm1:size(2) do
+         local idx1d = (i-1)*sm1:size(2) + j
+         res2[i][j] = math.pow(sm1[i][j], sm2[idx1d])
+      end
+   end
+   local err = res1:clone():zero()
+   -- find absolute error
+   for i = 1, res1:size(1) do
+      for j = 1, res1:size(2) do
+         err[i][j] = math.abs(res1[i][j] - res2[i][j])
+      end
+   end
+   -- find maximum element of error
+   local maxerr = 0
+   for i = 1, err:size(1) do
+      for j = 1, err:size(2) do
+         if err[i][j] > maxerr then
+            maxerr = err[i][j]
+         end
+      end
+   end
+   mytester:assertlt(maxerr, precision, 'error in torch.cpow - non-contiguous')
+end
+
+function torchtest.sum()
+   -- preallocated-result form must match the allocating form
+   local x = torch.rand(msize,msize)
+   local expected = torch.sum(x,2)
+   local preallocated = torch.Tensor()
+   torch.sum(preallocated,x,2)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.sum value')
+end
+function torchtest.prod()
+   -- preallocated-result form must match the allocating form
+   local x = torch.rand(msize,msize)
+   local expected = torch.prod(x,2)
+   local preallocated = torch.Tensor()
+   torch.prod(preallocated,x,2)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.prod value')
+end
+function torchtest.cumsum()
+   -- preallocated-result form must match the allocating form
+   local x = torch.rand(msize,msize)
+   local expected = torch.cumsum(x,2)
+   local preallocated = torch.Tensor()
+   torch.cumsum(preallocated,x,2)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.cumsum value')
+end
+function torchtest.cumprod()
+   -- preallocated-result form must match the allocating form
+   local x = torch.rand(msize,msize)
+   local expected = torch.cumprod(x,2)
+   local preallocated = torch.Tensor()
+   torch.cumprod(preallocated,x,2)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.cumprod value')
+end
+function torchtest.cross()
+   -- cross product along the size-3 dimension; preallocated vs allocating
+   local x = torch.rand(msize,3,msize)
+   local y = torch.rand(msize,3,msize)
+   local expected = torch.cross(x,y)
+   local preallocated = torch.Tensor()
+   torch.cross(preallocated,x,y)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.cross value')
+end
+function torchtest.zeros()
+   -- preallocated-result form must match the allocating form
+   local expected = torch.zeros(msize,msize)
+   local preallocated = torch.Tensor()
+   torch.zeros(preallocated,msize,msize)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.zeros value')
+end
+function torchtest.histc()
+   -- 5 bins over [1, 5]: three 2s (bin 2), two 4s (bin 4), one 5 (bin 5)
+   local input = torch.Tensor{ 2, 4, 2, 2, 5, 4 }
+   local counts = torch.histc(input, 5, 1, 5) -- nbins, min, max
+   local expected = torch.Tensor{ 0, 3, 0, 2, 1 }
+   mytester:assertTensorEq(counts,expected,precision,'error in torch.histc')
+end
+function torchtest.ones()
+   -- preallocated-result form must match the allocating form
+   local expected = torch.ones(msize,msize)
+   local preallocated = torch.Tensor()
+   torch.ones(preallocated,msize,msize)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.ones value')
+end
+function torchtest.diag()
+   -- preallocated-result form must match the allocating form
+   local x = torch.rand(msize,msize)
+   local expected = torch.diag(x)
+   local preallocated = torch.Tensor()
+   torch.diag(preallocated,x)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.diag value')
+end
+function torchtest.eye()
+   -- preallocated-result form must match the allocating form
+   local expected = torch.eye(msize,msize)
+   local preallocated = torch.Tensor()
+   torch.eye(preallocated,msize,msize)
+   mytester:asserteq(maxdiff(expected,preallocated),0,'torch.eye value')
+end
+function torchtest.renorm()
+   -- renorm(p, dim, maxnorm): rescale each sub-tensor along dim so its
+   -- p-norm does not exceed maxnorm; compared against a Lua reference.
+   local m1 = torch.randn(10,5)
+   local res1 = torch.Tensor()  -- NOTE(review): unused; candidate for removal
+   local m2
+
+   -- reference implementation: collapse to 2D with the renormed dim first
+   local function renorm(matrix, value, dim, max_norm)
+      local m1 = matrix:transpose(dim, 1):contiguous()
+      -- collapse non-dim dimensions:
+      m2 = m1:reshape(m1:size(1), m1:nElement()/m1:size(1))
+      local norms = m2:norm(value,2)
+      -- clip
+      local new_norms = norms:clone()
+      new_norms[torch.gt(norms, max_norm)] = max_norm
+      new_norms:cdiv(norms:add(1e-7))  -- epsilon guards against divide-by-zero
+      -- renormalize
+      m1:cmul(new_norms:expandAs(m1))
+      return m1:transpose(dim, 1)
+   end
+
+   -- note that the axis fed to torch.renorm is different (2~=1)
+   local maxnorm = m1:norm(2,1):mean()
+   m2 = renorm(m1,2,2,maxnorm)
+
+   m1:renorm(2,2,maxnorm)
+   mytester:assertTensorEq(m1, m2, 0.00001, 'error in renorm')
+   mytester:assertTensorEq(m1:norm(2,1), m2:norm(2,1), 0.00001, 'error in renorm')
+
+   -- 3D case: compare on a collapsed 2D view of the transposed tensor
+   m1 = torch.randn(3,4,5)
+   m2 = m1:transpose(2,3):contiguous():reshape(15,4)
+
+   maxnorm = m2:norm(2,1):mean()
+   m2 = renorm(m2,2,2,maxnorm)
+
+   m1:renorm(2,2,maxnorm)
+   local m3 = m1:transpose(2,3):contiguous():reshape(15,4)
+   mytester:assertTensorEq(m3, m2, 0.00001, 'error in renorm')
+   mytester:assertTensorEq(m3:norm(2,1), m2:norm(2,1), 0.00001, 'error in renorm')
+end
+function torchtest.multinomialwithreplacement()
+   -- Sampling with replacement must never pick a zero-probability category.
+   local n_row = 3
+   for n_col=4,5 do
+      -- reseed from the clock; the seed value makes failures reproducible
+      local t=os.time()
+      torch.manualSeed(t)
+      local prob_dist = torch.rand(n_row,n_col)
+      prob_dist:select(2,n_col):fill(0) --index n_col shouldn't be sampled
+      local n_sample = n_col
+      local sample_indices = torch.multinomial(prob_dist, n_sample, true)
+      mytester:assert(prob_dist:dim() == 2, "wrong number of prob_dist dimensions")
+      mytester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+      -- every drawn index must come from a non-zero-probability column
+      for i=1,n_row do
+         for j=1,n_sample do
+            mytester:assert(sample_indices[{i,j}] ~= n_col, "sampled an index with zero probability")
+         end
+      end
+   end
+end
+function torchtest.multinomialwithoutreplacement()
+   -- Sampling without replacement: no zero-probability picks, no duplicates
+   -- within a row.
+   local n_row = 3
+   for n_col=4,5 do
+      -- reseed from the clock; the seed value makes failures reproducible
+      local t=os.time()
+      torch.manualSeed(t)
+      local prob_dist = torch.rand(n_row,n_col)
+      prob_dist:select(2,n_col):fill(0) --index n_col shouldn't be sampled
+      local n_sample = 3
+      local sample_indices = torch.multinomial(prob_dist, n_sample, false)
+      mytester:assert(prob_dist:dim() == 2, "wrong number of prob_dist dimensions")
+      mytester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+      for i=1,n_row do
+         -- track indices seen in this row to detect repeats
+         local row_samples = {}
+         for j=1,n_sample do
+            local sample_idx = sample_indices[{i,j}]
+            mytester:assert(
+                sample_idx ~= n_col, "sampled an index with zero probability"
+            )
+            mytester:assert(
+                not row_samples[sample_idx], "sampled an index twice"
+            )
+            row_samples[sample_idx] = true
+         end
+      end
+   end
+end
+-- torch.multinomial on a 1D probability vector must return a 1D result of
+-- n_sample indices (not a 1xN matrix).
+function torchtest.multinomialvector()
+   local n_col = 4
+   local t=os.time()
+   torch.manualSeed(t)
+   local prob_dist = torch.rand(n_col)
+   local n_sample = n_col
+   local sample_indices = torch.multinomial(prob_dist, n_sample, true)
+   local s_dim = sample_indices:dim()
+   mytester:assert(s_dim == 1, "wrong number of returned dimensions: "..s_dim)
+   mytester:assert(prob_dist:dim() == 1, "wrong number of prob_dist dimensions")
+   mytester:assert(sample_indices:size(1) == n_sample, "wrong number of samples")
+end
+-- torch.range: out-of-place vs. result-tensor forms must agree, and filling
+-- a non-contiguous destination must respect its strides.
+function torchtest.range()
+   local mx = torch.range(0,1)
+   local mxx = torch.Tensor()
+   torch.range(mxx,0,1)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.range value')
+
+   -- Check range for non-contiguous tensors.
+   local x = torch.zeros(2, 3)
+   local y = x:narrow(2, 2, 2)
+   y:range(0, 3)
+   mytester:assertTensorEq(x, torch.Tensor{{0, 0, 1}, {0, 2, 3}}, 1e-16,
+                           'non-contiguous range failed')
+end
+-- torch.range with a negative step counts downward (1 to 0 gives {1,0}).
+function torchtest.rangenegative()
+   local mx = torch.Tensor({1,0})
+   local mxx = torch.Tensor()
+   torch.range(mxx,1,0,-1)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.range value for negative step')
+end
+-- torch.range with equal bounds yields a single element regardless of the
+-- step's sign.
+function torchtest.rangeequalbounds()
+   local mx = torch.Tensor({1})
+   local mxx = torch.Tensor()
+   torch.range(mxx,1,1,-1)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.range value for equal bounds step')
+   torch.range(mxx,1,1,1)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.range value for equal bounds step')
+end
+-- FloatTensor:range with fractional steps: element counts must not be lost
+-- to single-precision rounding at the upper bound.
+function torchtest.rangefloat()
+   local mx = torch.FloatTensor():range(0.6, 0.9, 0.1)
+   mytester:asserteq(mx:size(1), 4, 'wrong size for FloatTensor range')
+   mx = torch.FloatTensor():range(1, 10, 0.3)
+   mytester:asserteq(mx:size(1), 31, 'wrong size for FloatTensor range')
+end
+-- DoubleTensor:range with fractional steps: same element-count expectations
+-- as the float variant above.
+function torchtest.rangedouble()
+   local mx = torch.DoubleTensor():range(0.6, 0.9, 0.1)
+   mytester:asserteq(mx:size(1), 4, 'wrong size for DoubleTensor range')
+   mx = torch.DoubleTensor():range(1, 10, 0.3)
+   mytester:asserteq(mx:size(1), 31, 'wrong size for DoubleTensor range')
+end
+-- torch.randperm: with identical seeds, the out-of-place and result-tensor
+-- forms must produce the same permutation.
+function torchtest.randperm()
+   local t=os.time()
+   torch.manualSeed(t)
+   local mx = torch.randperm(msize)
+   local mxx = torch.Tensor()
+   torch.manualSeed(t)
+   torch.randperm(mxx,msize)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.randperm value')
+end
+-- torch.reshape: out-of-place and result-tensor forms must agree.
+function torchtest.reshape()
+   local x = torch.rand(10,13,23)
+   local mx = torch.reshape(x,130,23)
+   local mxx = torch.Tensor()
+   torch.reshape(mxx,x,130,23)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.reshape value')
+end
+
+-- Shared checker for the sort tests: verifies that each row of mxx is
+-- ordered per `order` ('ascending'/'descending'), and that ixx is a valid
+-- permutation mapping x's rows onto mxx's rows.
+-- x: msize x msize input; mxx/ixx: sorted values/indices; task: label for
+-- the failure message. Relies on the file-level `msize` and `mytester`.
+local function assertIsOrdered(order, x, mxx, ixx, task)
+  local areOrdered
+  if order == 'descending' then
+    areOrdered = function(a, b) return a >= b end
+  elseif order == 'ascending' then
+    areOrdered = function(a, b) return a <= b end
+  else
+    error('unknown order "' .. order .. '", must be "ascending" or "descending"')
+  end
+
+  -- NOTE: despite the name, `decreasing` tracks ordering in either direction.
+  local decreasing = true
+  for j = 1,msize do
+    for k = 2,msize do
+      decreasing = decreasing and areOrdered(mxx[j][k-1], mxx[j][k])
+    end
+  end
+  mytester:assert(decreasing, 'torch.sort (' .. order .. ') values unordered for ' .. task)
+  -- Index check: each row's indices must select the matching values and use
+  -- every position exactly once (sum of the seen-mask equals msize).
+  local seen = torch.ByteTensor(msize)
+  local indicesCorrect = true
+  for k = 1,msize do
+    seen:zero()
+    for j = 1,msize do
+      indicesCorrect = indicesCorrect and (x[k][ixx[k][j]] == mxx[k][j])
+      seen[ixx[k][j]] = 1
+    end
+    indicesCorrect = indicesCorrect and (torch.sum(seen) == msize)
+  end
+  mytester:assert(indicesCorrect, 'torch.sort (' .. order .. ') indices wrong for ' .. task)
+end
+
+-- torch.sort ascending: result-tensor form matches out-of-place form, output
+-- is ordered with valid indices, and duplicate keys still sort correctly.
+function torchtest.sortAscending()
+   local x = torch.rand(msize,msize)
+   local mx,ix = torch.sort(x)
+
+   -- Test use of result tensor
+   local mxx = torch.Tensor()
+   local ixx = torch.LongTensor()
+   torch.sort(mxx,ixx,x)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.sort (ascending) value')
+   mytester:asserteq(maxdiff(ix,ixx),0,'torch.sort (ascending) index')
+
+   -- Test sorting of random numbers
+   assertIsOrdered('ascending', x, mxx, ixx, 'random')
+
+   mytester:assertTensorEq(
+           torch.sort(torch.Tensor{ 50, 40, 30, 20, 10 }),
+           torch.Tensor{ 10, 20, 30, 40, 50 },
+           1e-16,
+           "torch.sort (ascending) simple sort"
+       )
+   -- Test that we still have proper sorting with duplicate keys
+   local x = torch.floor(torch.rand(msize,msize)*10)
+   torch.sort(mxx,ixx,x)
+   assertIsOrdered('ascending', x, mxx, ixx, 'random with duplicate keys')
+end
+
+-- torch.sort descending: result-tensor form matches out-of-place form, output
+-- is ordered with valid indices, and duplicate keys still sort correctly.
+function torchtest.sortDescending()
+   local x = torch.rand(msize,msize)
+   local mx,ix = torch.sort(x,true)
+
+   -- Test use of result tensor
+   local mxx = torch.Tensor()
+   local ixx = torch.LongTensor()
+   torch.sort(mxx,ixx,x,true)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.sort (descending) value')
+   mytester:asserteq(maxdiff(ix,ixx),0,'torch.sort (descending) index')
+
+   -- Test sorting of random numbers
+   assertIsOrdered('descending', x, mxx, ixx, 'random')
+
+   -- Test simple sort task
+   mytester:assertTensorEq(
+           torch.sort(torch.Tensor{ 10, 20, 30, 40, 50 },true),
+           torch.Tensor{ 50, 40, 30, 20, 10 },
+           1e-16,
+           "torch.sort (descending) simple sort"
+       )
+
+   -- Test that we still have proper sorting with duplicate keys.
+   -- Bug fix: previously this re-asserted on the same unique-key data above;
+   -- generate an input with duplicates and re-sort, mirroring sortAscending.
+   local x = torch.floor(torch.rand(msize,msize)*10)
+   torch.sort(mxx,ixx,x,true)
+   assertIsOrdered('descending', x, mxx, ixx, 'random with duplicate keys')
+end
+
+-- torch.topk: cross-check against a sort-then-narrow reference over random
+-- 3D tensors, random k/dim, both directions, and transposed (non-contiguous)
+-- inputs. Values must match exactly; indices may differ among ties as long
+-- as gathering with them reproduces the values.
+function torchtest.topK()
+   -- Reference implementation: full sort, then take the first k slices.
+   local function topKViaSort(t, k, dim, dir)
+      local sorted, indices = t:sort(dim, dir)
+      return sorted:narrow(dim, 1, k), indices:narrow(dim, 1, k)
+   end
+
+   local function compareTensors(t, res1, ind1, res2, ind2, dim, msg)
+      -- Values should be exactly equivalent
+      mytester:assertTensorEq(res1, res2, 0, msg)
+
+      -- Indices might differ based on the implementation, since there is
+      -- no guarantee of the relative order of selection
+      if ind1:eq(ind2):min() == 0 then
+         -- To verify that the indices represent equivalent elements,
+         -- gather from the input using the topk indices and compare against
+         -- the sort indices
+         local vals = t:gather(dim, ind2)
+         mytester:assertTensorEq(res1, vals, 0, msg)
+      end
+   end
+
+   local function compare(t, k, dim, dir, msg)
+      local topKVal, topKInd = t:topk(k, dim, dir, true)
+      local sortKVal, sortKInd = topKViaSort(t, k, dim, dir)
+
+      compareTensors(t, sortKVal, sortKInd, topKVal, topKInd, dim, msg)
+   end
+
+   local t = torch.rand(math.random(1, msize),
+                        math.random(1, msize),
+                        math.random(1, msize))
+
+   for kTries = 1, 3 do
+      for dimTries = 1, 3 do
+         for _, transpose in ipairs({true, false}) do
+            for _, dir in ipairs({true, false}) do
+               local testTensor = t
+
+               local transposeMsg = nil
+               if transpose then
+                  -- Pick two distinct dimensions to swap (loop rejects
+                  -- dim2 == dim1).
+                  local dim1 = math.random(1, t:nDimension())
+                  local dim2 = dim1
+
+                  while dim1 == dim2 do
+                     dim2 = math.random(1, t:nDimension())
+                  end
+
+                  testTensor = t:transpose(dim1, dim2)
+                  transposeMsg = 'transpose(' .. dim1 .. ', ' .. dim2 .. ')'
+               end
+
+               local dim = math.random(1, testTensor:nDimension())
+               local k = math.random(1, testTensor:size(dim))
+               local msg = 'topk(' .. k .. ', ' .. dim .. ', ' .. tostring(dir) .. ', true)'
+               if transposeMsg then
+                  msg = msg .. ' ' .. transposeMsg
+               end
+
+               compare(testTensor, k, dim, dir, msg)
+            end
+         end
+      end
+   end
+end
+
+-- torch.kthvalue: cross-check against torch.sort on the default and explicit
+-- dims, result-tensor form, non-contiguous input, input immutability, and a
+-- small literal case with repeated values.
+function torchtest.kthvalue()
+   local x = torch.rand(msize, msize, msize)
+   local x0 = x:clone()
+   do
+      local k = math.random(1, msize)
+      local mx, ix = torch.kthvalue(x, k)
+      local mxx, ixx = torch.sort(x)
+
+      mytester:assertTensorEq(mxx:select(3, k), mx:select(3, 1), 0,
+                              'torch.kthvalue value')
+      mytester:assertTensorEq(ixx:select(3, k), ix:select(3, 1), 0,
+                              'torch.kthvalue index')
+   end
+   do -- test use of result tensors
+      local k = math.random(1, msize)
+      local mx = torch.Tensor()
+      local ix = torch.LongTensor()
+      torch.kthvalue(mx, ix, x, k)
+      local mxx, ixx = torch.sort(x)
+      mytester:assertTensorEq(mxx:select(3, k), mx:select(3, 1), 0,
+                              'torch.kthvalue value')
+      mytester:assertTensorEq(ixx:select(3, k), ix:select(3, 1), 0,
+                              'torch.kthvalue index')
+   end
+   do -- test non-default dim
+      local k = math.random(1, msize)
+      local mx, ix = torch.kthvalue(x, k, 1)
+      local mxx, ixx = torch.sort(x, 1)
+      mytester:assertTensorEq(mxx:select(1, k), mx[1], 0,
+                              'torch.kthvalue value')
+      mytester:assertTensorEq(ixx:select(1, k), ix[1], 0,
+                              'torch.kthvalue index')
+   end
+   do -- non-contiguous
+      local y = x:narrow(2, 1, 1)
+      local y0 = y:clone()
+      local k = math.random(1, msize)
+      local my, ix = torch.kthvalue(y, k)
+      local my0, ix0 = torch.kthvalue(y0, k)
+      mytester:assertTensorEq(my, my0, 0, 'torch.kthvalue value')
+      mytester:assertTensorEq(ix, ix0, 0, 'torch.kthvalue index')
+   end
+   mytester:assertTensorEq(x, x0, 0, 'torch.kthvalue modified input')
+
+   -- simple test case (with repetitions)
+   -- sorted y is {1,1,3,4,5,5}: 3rd value is 3, 2nd is 1.
+   local y = torch.Tensor{3,5,4,1,1,5}
+   mytester:assertTensorEq(torch.kthvalue(y, 3), torch.Tensor{3}, 1e-16,
+      'torch.kthvalue simple')
+   mytester:assertTensorEq(torch.kthvalue(y, 2), torch.Tensor{1}, 1e-16,
+      'torch.kthvalue simple')
+end
+
+-- torch.median: cross-check against sort on both an odd (155) and even (156)
+-- size; `msize` here locally shadows the file-level constant. Covers default
+-- dim, result tensors, dim=1, and input immutability.
+function torchtest.median()
+   for _, msize in ipairs{155,156} do
+      local x = torch.rand(msize, msize)
+      local x0 = x:clone()
+
+      local mx, ix = torch.median(x)
+      local mxx, ixx = torch.sort(x)
+      -- Position of the median in the sorted order (lower median for even
+      -- sizes).
+      local ind = math.floor((msize+1)/2)
+
+      mytester:assertTensorEq(mxx:select(2, ind), mx:select(2, 1), 0,
+                              'torch.median value')
+      mytester:assertTensorEq(ixx:select(2, ind), ix:select(2, 1), 0,
+                              'torch.median index')
+
+      -- Test use of result tensor
+      local mr = torch.Tensor()
+      local ir = torch.LongTensor()
+      torch.median(mr, ir, x)
+      mytester:assertTensorEq(mr, mx, 0, 'torch.median result tensor value')
+      mytester:assertTensorEq(ir, ix, 0, 'torch.median result tensor index')
+
+      -- Test non-default dim
+      mx, ix = torch.median(x, 1)
+      mxx, ixx = torch.sort(x, 1)
+      mytester:assertTensorEq(mxx:select(1, ind), mx[1], 0,
+                              'torch.median value')
+      mytester:assertTensorEq(ixx:select(1, ind), ix[1], 0,
+                              'torch.median index')
+
+      -- input unchanged
+      mytester:assertTensorEq(x, x0, 0, 'torch.median modified input')
+   end
+end
+
+-- torch.mode: build a matrix whose first two rows/columns are all 1s so the
+-- mode of every slice is 1, with hand-computed index expectations. Covers
+-- default dim, result tensors, dim=1, and input immutability.
+function torchtest.mode()
+   local x = torch.range(1, msize * msize):reshape(msize, msize)
+   x:select(1, 1):fill(1)
+   x:select(1, 2):fill(1)
+   x:select(2, 1):fill(1)
+   x:select(2, 2):fill(1)
+   local x0 = x:clone()
+
+   -- Pre-calculated results.
+   local res = torch.Tensor(msize):fill(1)
+   -- The indices are the position of the last appearance of the mode element.
+   local resix = torch.LongTensor(msize):fill(2)
+   resix[1] = msize
+   resix[2] = msize
+
+   local mx, ix = torch.mode(x)
+
+   mytester:assertTensorEq(res:view(msize, 1), mx, 0, 'torch.mode value')
+   mytester:assertTensorEq(resix:view(msize, 1), ix, 0, 'torch.mode index')
+
+   -- Test use of result tensor
+   local mr = torch.Tensor()
+   local ir = torch.LongTensor()
+   torch.mode(mr, ir, x)
+   mytester:assertTensorEq(mr, mx, 0, 'torch.mode result tensor value')
+   mytester:assertTensorEq(ir, ix, 0, 'torch.mode result tensor index')
+
+   -- Test non-default dim
+   mx, ix = torch.mode(x, 1)
+   mytester:assertTensorEq(res:view(1, msize), mx, 0, 'torch.mode value')
+   mytester:assertTensorEq(resix:view(1, msize), ix, 0, 'torch.mode index')
+
+   -- input unchanged
+   mytester:assertTensorEq(x, x0, 0, 'torch.mode modified input')
+end
+
+
+-- torch.tril: out-of-place and result-tensor forms must agree.
+function torchtest.tril()
+   local x = torch.rand(msize,msize)
+   local mx = torch.tril(x)
+   local mxx = torch.Tensor()
+   torch.tril(mxx,x)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.tril value')
+end
+-- torch.triu: out-of-place and result-tensor forms must agree.
+function torchtest.triu()
+   local x = torch.rand(msize,msize)
+   local mx = torch.triu(x)
+   local mxx = torch.Tensor()
+   torch.triu(mxx,x)
+   -- Bug fix: failure message previously said 'torch.tril value'
+   -- (copy-paste from the tril test above).
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.triu value')
+end
+-- torch.cat of two tensors along each of the three dims (via transposed,
+-- non-contiguous inputs): each narrow of the result must equal its source,
+-- and the result-tensor form must match.
+function torchtest.cat()
+   for dim = 1, 3 do
+      local x = torch.rand(13, msize, msize):transpose(1, dim)
+      local y = torch.rand(17, msize, msize):transpose(1, dim)
+      local mx = torch.cat(x, y, dim)
+      -- narrow(dim, start, length): x occupies 1..13, y occupies 14..30.
+      mytester:assertTensorEq(mx:narrow(dim, 1, 13), x, 0, 'torch.cat value')
+      mytester:assertTensorEq(mx:narrow(dim, 14, 17), y, 0, 'torch.cat value')
+
+      local mxx = torch.Tensor()
+      torch.cat(mxx, x, y, dim)
+      mytester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+   end
+end
+-- torch.cat on a table of three tensors, per dim: narrows match sources,
+-- empty-table input errors, and float/double result-tensor forms agree.
+function torchtest.catArray()
+   for dim = 1, 3 do
+      local x = torch.rand(13, msize, msize):transpose(1, dim)
+      local y = torch.rand(17, msize, msize):transpose(1, dim)
+      local z = torch.rand(19, msize, msize):transpose(1, dim)
+
+      local mx = torch.cat({x, y, z}, dim)
+      -- narrow(dim, start, length): segments are 1..13, 14..30, 31..49.
+      mytester:assertTensorEq(mx:narrow(dim, 1, 13), x, 0, 'torch.cat value')
+      mytester:assertTensorEq(mx:narrow(dim, 14, 17), y, 0, 'torch.cat value')
+      mytester:assertTensorEq(mx:narrow(dim, 31, 19), z, 0, 'torch.cat value')
+
+      mytester:assertError(function() torch.cat{} end, 'torch.cat empty table')
+
+      local mxx = torch.Tensor()
+      torch.cat(mxx, {x, y, z}, dim)
+      mytester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+      torch.cat(mxx:float(), {x:float(), y:float(), z:float()}, dim)
+      mytester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+      torch.cat(mxx:double(), {x:double(), y:double(), z:double()}, dim)
+      mytester:assertTensorEq(mx, mxx, 0, 'torch.cat value')
+   end
+end
+-- torch.sin: out-of-place and result-tensor forms must agree.
+function torchtest.sin_2()
+   local x = torch.rand(msize,msize,msize)
+   local mx = torch.sin(x)
+   local mxx  = torch.Tensor()
+   torch.sin(mxx,x)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.sin value')
+end
+-- torch.linspace: result-tensor form agrees with out-of-place; n=1 is only
+-- valid with equal endpoints; descending ranges work; filling a
+-- non-contiguous destination respects its strides.
+function torchtest.linspace()
+   local from = math.random()
+   local to = from+math.random()
+   local mx = torch.linspace(from,to,137)
+   local mxx = torch.Tensor()
+   torch.linspace(mxx,from,to,137)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.linspace value')
+   mytester:assertError(function() torch.linspace(0,1,1) end, 'accepted 1 point between 2 distinct endpoints')
+   mytester:assertTensorEq(torch.linspace(0,0,1),torch.zeros(1),1e-16, 'failed to generate for torch.linspace(0,0,1)')
+
+   -- Check linspace for generating with start > end.
+   mytester:assertTensorEq(torch.linspace(2,0,3),
+                           torch.Tensor{2,1,0},
+                           1e-16,
+                           'failed to generate for torch.linspace(2,0,3)')
+
+   -- Check linspace for non-contiguous tensors.
+   local x = torch.zeros(2, 3)
+   local y = x:narrow(2, 2, 2)
+   y:linspace(0, 3, 4)
+   mytester:assertTensorEq(x, torch.Tensor{{0, 0, 1}, {0, 2, 3}}, 1e-16,
+                           'non-contiguous linspace failed')
+end
+-- torch.logspace: result-tensor form agrees with out-of-place; n=1 is only
+-- valid with equal endpoints; descending ranges work; filling a
+-- non-contiguous destination respects its strides.
+function torchtest.logspace()
+   local from = math.random()
+   local to = from+math.random()
+   local mx = torch.logspace(from,to,137)
+   local mxx = torch.Tensor()
+   torch.logspace(mxx,from,to,137)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.logspace value')
+   mytester:assertError(function() torch.logspace(0,1,1) end, 'accepted 1 point between 2 distinct endpoints')
+   -- Bug fix: failure message previously said 'torch.linspace(0,0,1)'
+   -- (copy-paste from the linspace test above).
+   mytester:assertTensorEq(torch.logspace(0,0,1),torch.ones(1),1e-16, 'failed to generate for torch.logspace(0,0,1)')
+
+   -- Check logspace for generating with start > end.
+   mytester:assertTensorEq(torch.logspace(1,0,2),
+                           torch.Tensor{10, 1},
+                           1e-16,
+                           'failed to generate for torch.logspace(1,0,2)')
+
+   -- Check logspace for non-contiguous tensors.
+   local x = torch.zeros(2, 3)
+   local y = x:narrow(2, 2, 2)
+   y:logspace(0, 3, 4)
+   mytester:assertTensorEq(x, torch.Tensor{{0, 1, 10}, {0, 100, 1000}}, 1e-16,
+                           'non-contiguous logspace failed')
+end
+-- torch.rand: with identical seeds, out-of-place and result-tensor forms
+-- must produce the same values.
+function torchtest.rand()
+   torch.manualSeed(123456)
+   local mx = torch.rand(msize,msize)
+   local mxx = torch.Tensor()
+   torch.manualSeed(123456)
+   torch.rand(mxx,msize,msize)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.rand value')
+end
+-- torch.randn: with identical seeds, out-of-place and result-tensor forms
+-- must produce the same values.
+function torchtest.randn()
+   torch.manualSeed(123456)
+   local mx = torch.randn(msize,msize)
+   local mxx = torch.Tensor()
+   torch.manualSeed(123456)
+   torch.randn(mxx,msize,msize)
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.randn value')
+end
+-- torch.gesv (LAPACK linear solve A*X=B): solution satisfies the system, and
+-- the result-tensor and in-place (outputs aliasing inputs) forms all agree.
+-- Skipped when the build has no LAPACK.
+function torchtest.gesv()
+   if not torch.gesv then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+   local b=torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                         {-1.56,  4.00, -8.67,  1.75,  2.86},
+                         {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+   local mx = torch.gesv(b,a)
+   mytester:assertlt(b:dist(a*mx),1e-12,'torch.gesv')
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   local mxx = torch.gesv(tb,ta,b,a)
+   -- In-place form: b,a serve as both outputs and inputs.
+   local mxxx = torch.gesv(b,a,b,a)
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gesv value temp')
+   mytester:asserteq(maxdiff(mx,b),0,'torch.gesv value flag')
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.gesv value out1')
+   mytester:asserteq(maxdiff(mx,mxxx),0,'torch.gesv value out2')
+end
+-- torch.gesv: reusing the same result tensors across calls must give the
+-- same answer (no stale state in the output buffers).
+function torchtest.gesv_reuse()
+   if not torch.gesv then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+   local b=torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                         {-1.56,  4.00, -8.67,  1.75,  2.86},
+                         {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+   local mx = torch.gesv(b,a)
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   torch.gesv(tb,ta,b,a)
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gesv value temp')
+   torch.gesv(tb,ta,b,a)
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gesv value reuse')
+end
+-- torch.trtrs (triangular solve): solve Ux=b / Lx=b and their transposed
+-- variants, checking residuals and agreement with manual transposition.
+-- Skipped when the build has no LAPACK.
+function torchtest.trtrs()
+   if not torch.trtrs then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+   local b=torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                         {-1.56,  4.00, -8.67,  1.75,  2.86},
+                         {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+
+   local U = torch.triu(a)
+   local L = torch.tril(a)
+
+   -- solve Ux = b
+   local x = torch.trtrs(b, U)
+   mytester:assertlt(b:dist(U*x),1e-12,'torch.trtrs')
+   x = torch.trtrs(b, U, 'U', 'N', 'N')
+   mytester:assertlt(b:dist(U*x),1e-12,'torch.trtrs')
+
+   -- solve Lx = b
+   x = torch.trtrs(b, L, 'L')
+   mytester:assertlt(b:dist(L*x),1e-12,'torch.trtrs')
+   x = torch.trtrs(b, L, 'L', 'N', 'N')
+   mytester:assertlt(b:dist(L*x),1e-12,'torch.trtrs')
+
+   -- solve U'x = b
+   x = torch.trtrs(b, U, 'U', 'T')
+   mytester:assertlt(b:dist(U:t()*x),1e-12,'torch.trtrs')
+   x = torch.trtrs(b, U, 'U', 'T', 'N')
+   mytester:assertlt(b:dist(U:t()*x),1e-12,'torch.trtrs')
+
+   -- solve U'x = b by manual transposition
+   -- Bug fix: y was assigned without `local`, leaking a global from the test.
+   local y = torch.trtrs(b, U:t(), 'L', 'N')
+   mytester:assertlt(x:dist(y),1e-12,'torch.trtrs')
+
+   -- solve L'x = b
+   x = torch.trtrs(b, L, 'L', 'T')
+   mytester:assertlt(b:dist(L:t()*x),1e-12,'torch.trtrs')
+   x = torch.trtrs(b, L, 'L', 'T', 'N')
+   mytester:assertlt(b:dist(L:t()*x),1e-12,'torch.trtrs')
+
+   -- solve L'x = b by manual transposition
+   y = torch.trtrs(b, L:t(), 'U', 'N')
+   mytester:assertlt(x:dist(y),1e-12,'torch.trtrs')
+end
+-- torch.trtrs: reusing (and zeroing) the same result tensors across calls
+-- must reproduce the same solution.
+function torchtest.trtrs_reuse()
+   if not torch.trtrs then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+   local b=torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                         {-1.56,  4.00, -8.67,  1.75,  2.86},
+                         {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+   local mx = torch.trtrs(b,a)
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   torch.trtrs(tb,ta,b,a)
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.trtrs value temp')
+   tb:zero()
+   torch.trtrs(tb,ta,b,a)
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.trtrs value reuse')
+end
+-- torch.gels (least squares) on a square, uniquely determined system:
+-- residual norm is zero, inputs are untouched by the non-aliasing forms, and
+-- the result-tensor and in-place forms agree.
+function torchtest.gels_uniquely_determined()
+   if not torch.gels then return end
+   local expectedNorm = 0
+   local a=torch.Tensor({{ 1.44, -9.96, -7.55,  8.34},
+                         {-7.84, -0.28,  3.24,  8.09},
+                         {-4.39, -3.24,  6.27,  5.28},
+                         {4.53,  3.83, -6.64,  2.06}}):t()
+   local b=torch.Tensor({{8.58,  8.26,  8.48, -5.28},
+                         {9.35, -4.43, -0.70, -0.26}}):t()
+   local a_copy = a:clone()
+   local b_copy = b:clone()
+   local mx = torch.gels(b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a,mx)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   local mxx = torch.gels(tb,ta,b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   -- In-place form overwrites b with the solution.
+   local mxxx = torch.gels(b,a,b,a)
+   mytester:assertalmosteq((torch.mm(a_copy,b)-b_copy):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gels value temp')
+   mytester:asserteq(maxdiff(mx,b),0,'torch.gels value flag')
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.gels value out1')
+   mytester:asserteq(maxdiff(mx,mxxx),0,'torch.gels value out2')
+end
+-- torch.gels: three consecutive calls reusing the same result tensors must
+-- each give a zero-residual solution (no stale state in the buffers).
+function torchtest.gels_reuse()
+   if not torch.gels then return end
+   local expectedNorm = 0
+   local a=torch.Tensor({{ 1.44, -9.96, -7.55,  8.34},
+                         {-7.84, -0.28,  3.24,  8.09},
+                         {-4.39, -3.24,  6.27,  5.28},
+                         {4.53,  3.83, -6.64,  2.06}}):t()
+   local b=torch.Tensor({{8.58,  8.26,  8.48, -5.28},
+                         {9.35, -4.43, -0.70, -0.26}}):t()
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   torch.gels(tb,ta,b,a)
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+   torch.gels(tb,ta,b,a)
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+   torch.gels(tb,ta,b,a)
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+end
+-- torch.gels on an overdetermined (6x4) system: the residual norm matches a
+-- precomputed value, inputs are untouched by the non-aliasing forms, and all
+-- calling forms agree.
+function torchtest.gels_overdetermined()
+   if not torch.gels then return end
+   -- Precomputed minimal residual norm for this fixture.
+   local expectedNorm = 17.390200628863
+   local a=torch.Tensor({{ 1.44, -9.96, -7.55,  8.34,  7.08, -5.45},
+                         {-7.84, -0.28,  3.24,  8.09,  2.52, -5.70},
+                         {-4.39, -3.24,  6.27,  5.28,  0.74, -1.19},
+                         {4.53,  3.83, -6.64,  2.06, -2.47,  4.70}}):t()
+   local b=torch.Tensor({{8.58,  8.26,  8.48, -5.28,  5.72,  8.93},
+                         {9.35, -4.43, -0.70, -0.26, -7.36, -2.52}}):t()
+   local a_copy = a:clone()
+   local b_copy = b:clone()
+   local mx = torch.gels(b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a, mx)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   local mxx = torch.gels(tb,ta,b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   -- In-place form overwrites b with the solution.
+   local mxxx = torch.gels(b,a,b,a)
+   mytester:assertalmosteq((torch.mm(a_copy,b)-b_copy):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gels value temp')
+   mytester:asserteq(maxdiff(mx,b),0,'torch.gels value flag')
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.gels value out1')
+   mytester:asserteq(maxdiff(mx,mxxx),0,'torch.gels value out2')
+end
+-- torch.gels on an underdetermined (3x4 after transpose) system: an exact
+-- solution exists (zero residual); all calling forms agree and inputs are
+-- untouched by the non-aliasing forms.
+function torchtest.gels_underdetermined()
+   if not torch.gels then return end
+   local expectedNorm = 0
+   local a=torch.Tensor({{ 1.44, -9.96, -7.55},
+                         {-7.84, -0.28,  3.24},
+                         {-4.39, -3.24,  6.27},
+                         {4.53,  3.83, -6.64}}):t()
+   local b=torch.Tensor({{8.58,  8.26,  8.48},
+                         {9.35, -4.43, -0.70}}):t()
+
+   local a_copy = a:clone()
+   local b_copy = b:clone()
+   local mx = torch.gels(b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a,mx)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   local ta = torch.Tensor()
+   local tb = torch.Tensor()
+   local mxx = torch.gels(tb,ta,b,a)
+   mytester:asserteq(maxdiff(a,a_copy),0,'torch.gels changed a')
+   mytester:asserteq(maxdiff(b,b_copy),0,'torch.gels changed b')
+   mytester:assertalmosteq((torch.mm(a,tb)-b):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+
+   -- In-place form overwrites b with the solution.
+   local mxxx = torch.gels(b,a,b,a)
+   mytester:assertalmosteq((torch.mm(a_copy,b)-b_copy):norm(), expectedNorm, 1e-8, 'torch.gels wrong answer')
+   mytester:asserteq(maxdiff(mx,tb),0,'torch.gels value temp')
+   mytester:asserteq(maxdiff(mx,b),0,'torch.gels value flag')
+   mytester:asserteq(maxdiff(mx,mxx),0,'torch.gels value out1')
+   mytester:asserteq(maxdiff(mx,mxxx),0,'torch.gels value out2')
+end
+-- torch.eig: values-only, values+vectors ('V'), and result-tensor forms must
+-- all agree on a fixed lower-triangular fixture. Skipped without LAPACK.
+function torchtest.eig()
+   if not torch.eig then return end
+   local a=torch.Tensor({{ 1.96,  0.00,  0.00,  0.00,  0.00},
+                         {-6.49,  3.80,  0.00,  0.00,  0.00},
+                         {-0.47, -6.39,  4.17,  0.00,  0.00},
+                         {-7.20,  1.50, -1.51,  5.70,  0.00},
+                         {-0.65, -6.34,  2.67,  1.80, -7.10}}):t():clone()
+   local e = torch.eig(a)
+   local ee,vv = torch.eig(a,'V')
+   local te = torch.Tensor()
+   local tv = torch.Tensor()
+   local eee,vvv = torch.eig(te,tv,a,'V')
+   mytester:assertlt(maxdiff(e,ee),1e-12,'torch.eig value')
+   mytester:assertlt(maxdiff(ee,eee),1e-12,'torch.eig value')
+   mytester:assertlt(maxdiff(ee,te),1e-12,'torch.eig value')
+   mytester:assertlt(maxdiff(vv,vvv),1e-12,'torch.eig value')
+   mytester:assertlt(maxdiff(vv,tv),1e-12,'torch.eig value')
+end
+-- torch.eig on a symmetric PSD matrix with reused result tensors: the
+-- eigendecomposition must reconstruct X both times, and V stays
+-- non-contiguous (LAPACK column-major layout) across calls.
+function torchtest.eig_reuse()
+   if not torch.eig then return end
+   local X = torch.randn(4,4)
+   X = X:t()*X
+   local e, v = torch.zeros(4,2), torch.zeros(4,4)
+   torch.eig(e, v, X,'V')
+   -- e:select(2,1) is the real part of the eigenvalues.
+   local Xhat = v * torch.diag(e:select(2,1)) * v:t()
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'VeV\' wrong')
+   mytester:assert(not v:isContiguous(), 'V is contiguous')
+
+   torch.eig(e, v, X, 'V')
+   local Xhat = torch.mm(v, torch.mm(e:select(2,1):diag(), v:t()))
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'VeV\' wrong')
+   mytester:assert(not v:isContiguous(), 'V is contiguous')
+end
+-- torch.eig with non-contiguous result tensors (middle slices of 3D zeros):
+-- the decomposition must still reconstruct X.
+function torchtest.eig_noncontig()
+   if not torch.eig then return end
+   local X = torch.randn(4,4)
+   X = X:t()*X
+   local e = torch.zeros(4,2,2)[{ {}, 2, {} }]
+   local v = torch.zeros(4,2,4)[{ {}, 2, {} }]
+   mytester:assert(not v:isContiguous(), 'V is contiguous')
+   mytester:assert(not e:isContiguous(), 'E is contiguous')
+   torch.eig(e, v, X,'V')
+   local Xhat = v * torch.diag(e:select(2,1)) * v:t()
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'VeV\' wrong')
+end
+-- torch.symeig called twice with the same result tensors. The inline
+-- PASS/FAIL annotations were kept from upstream: they mark which assertions
+-- held before the contiguity handling was fixed (the second call used to
+-- see a resv made non-contiguous by the first) — the whole test is expected
+-- to pass now; NOTE(review): confirm against the upstream fix this patch
+-- imports.
+function torchtest.test_symeig()
+  if not torch.symeig then return end
+  local xval = torch.rand(100,3)
+  local cov = torch.mm(xval:t(), xval)
+  local rese = torch.zeros(3)
+  local resv = torch.zeros(3,3)
+
+  -- First call to symeig
+  mytester:assert(resv:isContiguous(), 'resv is not contiguous') -- PASS
+  torch.symeig(rese, resv, cov:clone(), 'V')
+  local ahat = resv*torch.diag(rese)*resv:t()
+  mytester:assertTensorEq(cov, ahat, 1e-8, 'VeV\' wrong') -- PASS
+
+  -- Second call to symeig
+  mytester:assert(not resv:isContiguous(), 'resv is contiguous') -- FAIL
+  torch.symeig(rese, resv, cov:clone(), 'V')
+  local ahat = torch.mm(torch.mm(resv, torch.diag(rese)), resv:t())
+  mytester:assertTensorEq(cov, ahat, 1e-8, 'VeV\' wrong') -- FAIL
+end
+-- torch.symeig with non-contiguous result tensors: the decomposition must
+-- still reconstruct X. NOTE(review): X is 5x5 but e/v are built size-4;
+-- this presumably relies on symeig resizing its outputs — confirm upstream.
+function  torchtest.symeig_noncontig()
+  if not torch.symeig then return end
+   local X = torch.rand(5,5)
+   X = X:t()*X
+   local e = torch.zeros(4,2):select(2,2)
+   local v = torch.zeros(4,2,4)[{ {}, 2, {} }]
+   mytester:assert(not v:isContiguous(), 'V is contiguous')
+   mytester:assert(not e:isContiguous(), 'E is contiguous')
+   torch.symeig(e, v, X,'V')
+   local Xhat = v * torch.diag(e) * v:t()
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'VeV\' wrong')
+end
+-- torch.svd: out-of-place and result-tensor forms must produce identical
+-- U, S, V on a fixed fixture. Skipped without LAPACK.
+function torchtest.svd()
+   if not torch.svd then return end
+   local a=torch.Tensor({{8.79,  6.11, -9.15,  9.57, -3.49,  9.84},
+                         {9.93,  6.91, -7.93,  1.64,  4.02,  0.15},
+                         {9.83,  5.04,  4.86,  8.83,  9.80, -8.99},
+                         {5.45, -0.27,  4.85,  0.74, 10.00, -6.02},
+                         {3.16,  7.98,  3.01,  5.80,  4.27, -5.31}}):t():clone()
+   local u,s,v = torch.svd(a)
+   local uu = torch.Tensor()
+   local ss = torch.Tensor()
+   local vv = torch.Tensor()
+   local uuu,sss,vvv = torch.svd(uu,ss,vv,a)
+   mytester:asserteq(maxdiff(u,uu),0,'torch.svd')
+   mytester:asserteq(maxdiff(u,uuu),0,'torch.svd')
+   mytester:asserteq(maxdiff(s,ss),0,'torch.svd')
+   mytester:asserteq(maxdiff(s,sss),0,'torch.svd')
+   mytester:asserteq(maxdiff(v,vv),0,'torch.svd')
+   mytester:asserteq(maxdiff(v,vvv),0,'torch.svd')
+end
+-- torch.svd: reusing U,S,V from a previous call (U non-contiguous) as result
+-- tensors must still reconstruct X.
+function torchtest.svd_reuse()
+   if not torch.svd then return end
+   local X = torch.randn(4,4)
+   local U, S, V = torch.svd(X)
+   local Xhat = torch.mm(U, torch.mm(S:diag(), V:t()))
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'USV\' wrong')
+
+   mytester:assert(not U:isContiguous(), 'U is contiguous')
+   torch.svd(U, S, V, X)
+   local Xhat = torch.mm(U, torch.mm(S:diag(), V:t()))
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'USV\' wrong')
+end
+-- torch.svd with non-contiguous result tensors (middle slices of 3D zeros):
+-- the decomposition must still reconstruct X.
+function torchtest.svd_noncontig()
+   if not torch.svd then return end
+   local X = torch.randn(5,5)
+   local U = torch.zeros(5,2,5)[{ {}, 2, {} }]
+   local S = torch.zeros(5,2)[{ {}, 2 }]
+   local V = torch.zeros(5,2,5)[{ {}, 2, {} }]
+
+   mytester:assert(not U:isContiguous(), 'U is contiguous')
+   mytester:assert(not S:isContiguous(), 'S is contiguous')
+   mytester:assert(not V:isContiguous(), 'V is contiguous')
+   torch.svd(U, S, V, X)
+   local Xhat = torch.mm(U, torch.mm(S:diag(), V:t()))
+   mytester:assertTensorEq(X, Xhat, 1e-8, 'USV\' wrong')
+end
+-- Check torch.inverse: M*inv(M) == I both ways, and that passing a result
+-- tensor (including one left transposed by a prior call) gives the same
+-- values. Note: the result is expected to come back non-contiguous.
+function torchtest.inverse()
+   if not torch.inverse then return end
+   local M = torch.randn(5,5)
+   local MI = torch.inverse(M)
+   local E = torch.eye(5)
+   mytester:assert(not MI:isContiguous(), 'MI is contiguous')
+   mytester:assertalmosteq(maxdiff(E,torch.mm(M,MI)), 0, 1e-8, 'inverse value')
+   mytester:assertalmosteq(maxdiff(E,torch.mm(MI,M)), 0, 1e-8, 'inverse value')
+
+   -- In-place variant with a caller-provided result tensor.
+   local MII = torch.Tensor(5,5)
+   torch.inverse(MII, M)
+   mytester:assert(not MII:isContiguous(), 'MII is contiguous')
+   mytester:asserteq(maxdiff(MII, MI), 0, 'inverse value in-place')
+   -- second call, now that MII is transposed
+   torch.inverse(MII, M)
+   mytester:assert(not MII:isContiguous(), 'MII is contiguous')
+   mytester:asserteq(maxdiff(MII, MI), 0, 'inverse value in-place')
+end
+-- Check 2D convolution/cross-correlation identities on random sizes:
+--  * conv2 default mode == explicit 'V' (valid) mode;
+--  * conv2 with kernel k == xcorr2 with k reversed element-wise;
+--  * the batched (3D input, 3D kernel) forms match the single-image forms.
+function torchtest.conv2()
+   local x = torch.rand(math.floor(torch.uniform(50,100)),math.floor(torch.uniform(50,100)))
+   local k = torch.rand(math.floor(torch.uniform(10,20)),math.floor(torch.uniform(10,20)))
+   local imvc = torch.conv2(x,k)
+   local imvc2 = torch.conv2(x,k,'V')
+   local imfc = torch.conv2(x,k,'F')
+
+   -- Build ki = k with its underlying storage reversed; convolution with k
+   -- equals cross-correlation with the flipped kernel ki.
+   local ki = k:clone();
+   local ks = k:storage()
+   local kis = ki:storage()
+   for i=ks:size(),1,-1 do kis[ks:size()-i+1]=ks[i] end
+   local imvx = torch.xcorr2(x,ki)
+   local imvx2 = torch.xcorr2(x,ki,'V')
+   local imfx = torch.xcorr2(x,ki,'F')
+
+   mytester:asserteq(maxdiff(imvc,imvc2),0,'torch.conv2')
+   mytester:asserteq(maxdiff(imvc,imvx),0,'torch.conv2')
+   mytester:asserteq(maxdiff(imvc,imvx2),0,'torch.conv2')
+   mytester:asserteq(maxdiff(imfc,imfx),0,'torch.conv2')
+   -- xcorr2(x,x) at the top-left valid position equals <x,x>.
+   mytester:assertlt(math.abs(x:dot(x)-torch.xcorr2(x,x)[1][1]),1e-10,'torch.conv2')
+
+   -- Batched form: two identical images and two identical kernels must give
+   -- two identical output planes, each equal to the single-image result.
+   local xx = torch.Tensor(2,x:size(1),x:size(2))
+   xx[1]:copy(x)
+   xx[2]:copy(x)
+   local kk = torch.Tensor(2,k:size(1),k:size(2))
+   kk[1]:copy(k)
+   kk[2]:copy(k)
+
+   local immvc = torch.conv2(xx,kk)
+   local immvc2 = torch.conv2(xx,kk,'V')
+   local immfc = torch.conv2(xx,kk,'F')
+
+   mytester:asserteq(maxdiff(immvc[1],immvc[2]),0,'torch.conv2')
+   mytester:asserteq(maxdiff(immvc[1],imvc),0,'torch.conv2')
+   mytester:asserteq(maxdiff(immvc2[1],imvc2),0,'torch.conv2')
+   mytester:asserteq(maxdiff(immfc[1],immfc[2]),0,'torch.conv2')
+   mytester:asserteq(maxdiff(immfc[1],imfc),0,'torch.conv2')
+end
+
+-- 3D analogue of torchtest.conv2: conv3 default == 'V' mode, conv3 with k
+-- equals xcorr3 with the storage-reversed kernel, and the batched (4D)
+-- forms match the single-volume forms.
+function torchtest.conv3()
+   local x = torch.rand(math.floor(torch.uniform(20,40)),
+                        math.floor(torch.uniform(20,40)),
+                        math.floor(torch.uniform(20,40)))
+   local k = torch.rand(math.floor(torch.uniform(5,10)),
+                        math.floor(torch.uniform(5,10)),
+                        math.floor(torch.uniform(5,10)))
+   local imvc = torch.conv3(x,k)
+   local imvc2 = torch.conv3(x,k,'V')
+   local imfc = torch.conv3(x,k,'F')
+
+   -- ki = k with its underlying storage reversed (full 3D flip).
+   local ki = k:clone();
+   local ks = k:storage()
+   local kis = ki:storage()
+   for i=ks:size(),1,-1 do kis[ks:size()-i+1]=ks[i] end
+   local imvx = torch.xcorr3(x,ki)
+   local imvx2 = torch.xcorr3(x,ki,'V')
+   local imfx = torch.xcorr3(x,ki,'F')
+
+   mytester:asserteq(maxdiff(imvc,imvc2),0,'torch.conv3')
+   mytester:asserteq(maxdiff(imvc,imvx),0,'torch.conv3')
+   mytester:asserteq(maxdiff(imvc,imvx2),0,'torch.conv3')
+   mytester:asserteq(maxdiff(imfc,imfx),0,'torch.conv3')
+   -- xcorr3(x,x) at the first valid position equals <x,x>; tolerance is
+   -- looser than the 2D case because many more terms are summed.
+   mytester:assertlt(math.abs(x:dot(x)-torch.xcorr3(x,x)[1][1][1]),4*1e-10,'torch.conv3')
+
+   -- Batched form with duplicated volume/kernel pairs.
+   local xx = torch.Tensor(2,x:size(1),x:size(2),x:size(3))
+   xx[1]:copy(x)
+   xx[2]:copy(x)
+   local kk = torch.Tensor(2,k:size(1),k:size(2),k:size(3))
+   kk[1]:copy(k)
+   kk[2]:copy(k)
+
+   local immvc = torch.conv3(xx,kk)
+   local immvc2 = torch.conv3(xx,kk,'V')
+   local immfc = torch.conv3(xx,kk,'F')
+
+   mytester:asserteq(maxdiff(immvc[1],immvc[2]),0,'torch.conv3')
+   mytester:asserteq(maxdiff(immvc[1],imvc),0,'torch.conv3')
+   mytester:asserteq(maxdiff(immvc2[1],imvc2),0,'torch.conv3')
+   mytester:asserteq(maxdiff(immfc[1],immfc[2]),0,'torch.conv3')
+   mytester:asserteq(maxdiff(immfc[1],imfc),0,'torch.conv3')
+end
+
+-- Check that a valid-mode 3D cross-correlation can be reproduced as a sum
+-- of 2D cross-correlations over matching depth slices.
+function torchtest.xcorr3_xcorr2_eq()
+    local ix = math.floor(torch.uniform(20,40))
+    local iy = math.floor(torch.uniform(20,40))
+    local iz = math.floor(torch.uniform(20,40))
+    local kx = math.floor(torch.uniform(5,10))
+    local ky = math.floor(torch.uniform(5,10))
+    local kz = math.floor(torch.uniform(5,10))
+
+    local x = torch.rand(ix,iy,iz)
+    local k = torch.rand(kx,ky,kz)
+
+    local o3 = torch.xcorr3(x,k)
+    local o32 = torch.zeros(o3:size())
+
+    -- Output slice i accumulates xcorr2 of input slice (i+j-1) with kernel
+    -- slice j, summed over the kernel depth.
+    for i=1,o3:size(1) do
+        for j=1,k:size(1) do
+            o32[i]:add(torch.xcorr2(x[i+j-1],k[j]))
+        end
+    end
+
+    mytester:assertlt(maxdiff(o3,o32),precision,'torch.conv3_conv2_eq')
+end
+
+-- Full-mode analogue of xcorr3_xcorr2_eq: a full 3D cross-correlation is
+-- rebuilt from full 2D cross-correlations, with the kernel depth index
+-- reversed and results scattered to output slice (i+j-1).
+function torchtest.fxcorr3_fxcorr2_eq()
+    local ix = math.floor(torch.uniform(20,40))
+    local iy = math.floor(torch.uniform(20,40))
+    local iz = math.floor(torch.uniform(20,40))
+    local kx = math.floor(torch.uniform(5,10))
+    local ky = math.floor(torch.uniform(5,10))
+    local kz = math.floor(torch.uniform(5,10))
+
+    local x = torch.rand(ix,iy,iz)
+    local k = torch.rand(kx,ky,kz)
+
+    local o3 = torch.xcorr3(x,k,'F')
+
+    local o32 = torch.zeros(o3:size())
+
+    for i=1,x:size(1) do
+        for j=1,k:size(1) do
+            -- Full mode flips the roles along depth: kernel slice is taken
+            -- in reverse order.
+            o32[i+j-1]:add(torch.xcorr2(x[i],k[k:size(1)-j + 1],'F'))
+        end
+    end
+
+    mytester:assertlt(maxdiff(o3,o32),precision,'torch.conv3_conv2_eq')
+end
+
+-- Check that a valid-mode 3D convolution equals a depth-wise sum of 2D
+-- convolutions; conv flips the kernel, so kernel slices are visited in
+-- reverse order.
+function torchtest.conv3_conv2_eq()
+    local ix = math.floor(torch.uniform(20,40))
+    local iy = math.floor(torch.uniform(20,40))
+    local iz = math.floor(torch.uniform(20,40))
+    local kx = math.floor(torch.uniform(5,10))
+    local ky = math.floor(torch.uniform(5,10))
+    local kz = math.floor(torch.uniform(5,10))
+
+    local x = torch.rand(ix,iy,iz)
+    local k = torch.rand(kx,ky,kz)
+
+    local o3 = torch.conv3(x,k)
+    local o32 = torch.zeros(o3:size())
+
+    for i=1,o3:size(1) do
+        for j=1,k:size(1) do
+            o32[i]:add(torch.conv2(x[i+j-1],k[k:size(1)-j+1]))
+        end
+    end
+
+    mytester:assertlt(maxdiff(o3,o32),precision,'torch.conv3_conv2_eq')
+end
+
+-- Full-mode analogue of conv3_conv2_eq: a full 3D convolution is rebuilt
+-- from full 2D convolutions scattered to output slice (i+j-1); here the
+-- kernel slices are taken in forward order because conv2 itself flips.
+function torchtest.fconv3_fconv2_eq()
+    local ix = math.floor(torch.uniform(20,40))
+    local iy = math.floor(torch.uniform(20,40))
+    local iz = math.floor(torch.uniform(20,40))
+    local kx = math.floor(torch.uniform(5,10))
+    local ky = math.floor(torch.uniform(5,10))
+    local kz = math.floor(torch.uniform(5,10))
+
+    local x = torch.rand(ix,iy,iz)
+    local k = torch.rand(kx,ky,kz)
+
+    local o3 = torch.conv3(x,k,'F')
+
+    local o32 = torch.zeros(o3:size())
+
+    for i=1,x:size(1) do
+        for j=1,k:size(1) do
+            o32[i+j-1]:add(torch.conv2(x[i],k[j],'F'))
+        end
+    end
+
+    mytester:assertlt(maxdiff(o3,o32),precision,'torch.conv3_conv2_eq')
+end
+
+-- Check consistency of the comparison operators: for any threshold,
+-- gt + lt must equal ne, and (gt + lt + eq) must cover every element.
+function torchtest.logical()
+   -- Values in [-1, 1).
+   local x = torch.rand(100,100)*2-1;
+   local xx = x:clone()
+
+   -- NOTE(review): threshold 1 lies at the edge of the data range, so xgt is
+   -- (almost surely) empty and xlt is everything -- the identities below
+   -- still hold, but a mid-range threshold would exercise more; confirm
+   -- whether 1 is intentional.
+   local xgt = torch.gt(x,1)
+   local xlt = torch.lt(x,1)
+
+   local xeq = torch.eq(x,1)
+   local xne = torch.ne(x,1)
+
+   local neqs = xgt+xlt
+   local all = neqs + xeq
+   mytester:asserteq(neqs:sum(), xne:sum(), 'torch.logical')
+   mytester:asserteq(x:nElement(),all:double():sum() , 'torch.logical')
+end
+
+-- Check that getRNGState returns a value (not a live reference into the
+-- generator) and that restoring it replays the same random sequence.
+function torchtest.RNGState()
+   local state = torch.getRNGState()
+   local stateCloned = state:clone()
+   -- Advancing the generator must not mutate the previously captured state.
+   local before = torch.rand(1000)
+
+   mytester:assert(state:ne(stateCloned):long():sum() == 0, 'getRNGState should have value semantics, but appears to have reference semantics')
+
+   torch.setRNGState(state)
+   local after = torch.rand(1000)
+   mytester:assertTensorEq(before, after, 1e-16, 'getRNGState/setRNGState not generating same sequence')
+end
+
+-- Check that a separate torch.Generator seeded from the global generator's
+-- state is independent: advancing the global stream must not perturb the
+-- forked generator's stream.
+function torchtest.RNGStateAliasing()
+    torch.manualSeed(1)
+    local unused = torch.uniform()
+
+    -- Fork the random number stream at this point
+    local gen = torch.Generator()
+    torch.setRNGState(gen, torch.getRNGState())
+
+    local target_value = torch.rand(1000)
+    --Dramatically alter the internal state of the main generator
+    local also_unused = torch.rand(100000)
+    local forked_value = torch.rand(gen, 1000)
+    mytester:assertTensorEq(target_value, forked_value, 1e-16, "RNG has not forked correctly.")
+end
+
+-- Check that torch.serialize/torch.deserialize preserve a Generator's
+-- internal state: the round-tripped generator produces the same next value,
+-- and a differently-seeded generator produces a different one.
+function torchtest.serializeGenerator()
+   local generator = torch.Generator()
+   torch.manualSeed(generator, 123)
+   local differentGenerator = torch.Generator()
+   torch.manualSeed(differentGenerator, 124)
+   local serializedGenerator = torch.serialize(generator)
+   local deserializedGenerator = torch.deserialize(serializedGenerator)
+   local generated = torch.random(generator)
+   local differentGenerated = torch.random(differentGenerator)
+   local deserializedGenerated = torch.random(deserializedGenerator)
+   mytester:asserteq(generated, deserializedGenerated, 'torch.Generator changed internal state after being serialized')
+   mytester:assertne(generated, differentGenerated, 'Generators with different random seed should not produce the same output')
+end
+
+-- Check that the Box-Muller normal sampler's carry state (it generates
+-- values in pairs) is captured by get/setRNGState and reset by manualSeed.
+-- An odd count is used deliberately so one half of a pair is left pending.
+function torchtest.testBoxMullerState()
+    torch.manualSeed(123)
+    local odd_number = 101
+    local seeded = torch.randn(odd_number)
+    local state = torch.getRNGState()
+    local midstream = torch.randn(odd_number)
+    torch.setRNGState(state)
+    local repeat_midstream = torch.randn(odd_number)
+    torch.manualSeed(123)
+    local reseeded = torch.randn(odd_number)
+    mytester:assertTensorEq(midstream, repeat_midstream, 1e-16, 'getRNGState/setRNGState not generating same sequence of normally distributed numbers')
+    mytester:assertTensorEq(seeded, reseeded, 1e-16, 'repeated calls to manualSeed not generating same sequence of normally distributed numbers')
+end
+
+-- Verify that the Cholesky factor from torch.potrf reconstructs the
+-- original symmetric positive-definite matrix, for the default, upper
+-- ('U') and lower ('L') triangular variants.
+function torchtest.testCholesky()
+   local x = torch.rand(10,10)
+   -- A = x * x' is symmetric positive definite (with probability 1).
+   local A = torch.mm(x, x:t())
+
+   ---- Default Case
+   local chol = torch.potrf(A)
+   local rebuilt = torch.mm(chol:t(), chol)
+   mytester:assertTensorEq(A, rebuilt, 1e-14, 'potrf did not allow rebuilding the original matrix')
+
+   ---- Test Upper Triangular
+   local upper = torch.potrf(A, 'U')
+   rebuilt = torch.mm(upper:t(), upper)
+   mytester:assertTensorEq(A, rebuilt, 1e-14, 'potrf (upper) did not allow rebuilding the original matrix')
+
+   ---- Test Lower Triangular
+   local lower = torch.potrf(A, 'L')
+   rebuilt = torch.mm(lower, lower:t())
+   mytester:assertTensorEq(A, rebuilt, 1e-14, 'potrf (lower) did not allow rebuilding the original matrix')
+end
+
+-- Check torch.potrs: solving A*x = b from the Cholesky factor of A must
+-- reproduce b, for both the upper- and lower-triangular factorizations.
+function torchtest.potrs()
+   if not torch.potrs then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+   local b=torch.Tensor({{4.02,  6.19, -8.22, -7.57, -3.03},
+                         {-1.56,  4.00, -8.67,  1.75,  2.86},
+                         {9.81, -4.09, -4.57, -8.61,  8.99}}):t()
+
+   ---- Make sure 'a' is symmetric PSD
+   a = torch.mm(a, a:t())
+
+   ---- Upper Triangular Test
+   local U = torch.potrf(a, 'U')
+   local x = torch.potrs(b, U, 'U')
+   mytester:assertlt(b:dist(a*x),1e-12,"torch.potrs; uplo='U'")
+
+   ---- Lower Triangular Test
+   local L = torch.potrf(a, 'L')
+   x = torch.potrs(b, L, 'L')
+   -- Fixed message typo: closing quote around L was missing ("uplo='L").
+   mytester:assertlt(b:dist(a*x),1e-12,"torch.potrs; uplo='L'")
+end
+
+-- Check torch.potri: the inverse computed from a Cholesky factor must match
+-- torch.inverse, for the default, upper and lower triangular variants.
+function torchtest.potri()
+   -- Guard on the function actually under test (was torch.potrs, a
+   -- copy-paste slip from the previous test).
+   if not torch.potri then return end
+   local a=torch.Tensor({{6.80, -2.11,  5.66,  5.97,  8.23},
+                         {-6.05, -3.30,  5.36, -4.44,  1.08},
+                         {-0.45,  2.58, -2.70,  0.27,  9.04},
+                         {8.32,  2.71,  4.35, -7.17,  2.14},
+                         {-9.67, -5.14, -7.26,  6.08, -6.87}}):t()
+
+   ---- Make sure 'a' is symmetric PSD
+   a = torch.mm(a, a:t())
+
+   ---- Compute inverse directly
+   local inv0 = torch.inverse(a)
+
+   ---- Default case
+   local chol = torch.potrf(a)
+   local inv1 = torch.potri(chol)
+   mytester:assertlt(inv0:dist(inv1),1e-12,"torch.potri; uplo=''")
+
+   ---- Upper Triangular Test
+   chol = torch.potrf(a, 'U')
+   inv1 = torch.potri(chol, 'U')
+   mytester:assertlt(inv0:dist(inv1),1e-12,"torch.potri; uplo='U'")
+
+   ---- Lower Triangular Test
+   chol = torch.potrf(a, 'L')
+   inv1 = torch.potri(chol, 'L')
+   mytester:assertlt(inv0:dist(inv1),1e-12,"torch.potri; uplo='L'")
+end
+
+-- Check torch.pstrf (pivoted Cholesky of a positive semi-definite matrix):
+-- for several sizes, both the allocating and the in-place calling forms,
+-- and both triangle options, the factor must rebuild the pivoted matrix.
+function torchtest.pstrf()
+  -- Runs pstrf on `a` with the given uplo/in-place variant and asserts
+  -- that u'*u (or u*u' for 'L') equals a permuted by the returned pivots.
+  local function checkPsdCholesky(a, uplo, inplace)
+    local u, piv, args, a_reconstructed
+    if inplace then
+      u = torch.Tensor(a:size())
+      piv = torch.IntTensor(a:size(1))
+      args = {u, piv, a}
+    else
+      args = {a}
+    end
+
+    if uplo then table.insert(args, uplo) end
+
+    u, piv = torch.pstrf(unpack(args))
+
+    if uplo == 'L' then
+      a_reconstructed = torch.mm(u, u:t())
+    else
+      a_reconstructed = torch.mm(u:t(), u)
+    end
+
+    -- index() needs a LongTensor; apply the pivot permutation to both
+    -- rows and columns before comparing.
+    piv = piv:long()
+    local a_permuted = a:index(1, piv):index(2, piv)
+    mytester:assertTensorEq(a_permuted, a_reconstructed, 1e-14,
+                            'torch.pstrf did not allow rebuilding the original matrix;' ..
+                            'uplo=' .. tostring(uplo))
+  end
+
+  local dimensions = { {5, 1}, {5, 3}, {5, 5}, {10, 10} }
+  for _, dim in pairs(dimensions) do
+    local m = torch.Tensor(unpack(dim)):uniform()
+    local a = torch.mm(m, m:t())
+    -- add a small number to the diagonal to make the matrix numerically positive semidefinite
+    for i = 1, m:size(1) do
+      a[i][i] = a[i][i] + 1e-7
+    end
+    checkPsdCholesky(a, nil, false)
+    checkPsdCholesky(a, 'U', false)
+    checkPsdCholesky(a, 'L', false)
+    checkPsdCholesky(a, nil, true)
+    checkPsdCholesky(a, 'U', true)
+    checkPsdCholesky(a, 'L', true)
+  end
+end
+
+-- Check that numel() agrees with nElement() (it is an alias).
+function torchtest.testNumel()
+    local b = torch.ByteTensor(3, 100, 100)
+    mytester:asserteq(b:nElement(), 3*100*100, "nElement not right")
+    mytester:asserteq(b:numel(), 3*100*100, "numel not right")
+end
+
+
+-- Generate a tensor of size `size` whose values are ascending integers from
+-- `start` (or 1, if `start` is not given)
+local function consecutive(size, start)
+    -- cumsum over a ones-vector of prod(size) elements yields 1,2,3,...
+    local sequence = torch.ones(torch.Tensor(size):prod(1)[1]):cumsum(1)
+    if start then
+        sequence:add(start - 1)
+    end
+    return sequence:resize(unpack(size))
+end
+
+-- Check __index on tensors: scalar indices, table indices, range indices,
+-- the empty table (whole tensor), and that over-indexing raises an error.
+function torchtest.index()
+    local badIndexMsg = "Lookup with valid index should return correct result"
+    local reference = consecutive{3, 3, 3}
+    mytester:assertTensorEq(reference[1], consecutive{3, 3}, 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[2], consecutive({3, 3}, 10), 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[3], consecutive({3, 3}, 19), 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[{1}], consecutive{3, 3}, 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[{2}], consecutive({3, 3}, 10), 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[{3}], consecutive({3, 3}, 19), 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[{1,2}], consecutive({3}, 4), 1e-16, badIndexMsg)
+    mytester:assertTensorEq(reference[{{1,2}}], consecutive({2, 3, 3}), 1e-16, badIndexMsg)
+    mytester:asserteq(reference[{3, 3, 3}], 27, badIndexMsg)
+    mytester:assertTensorEq(reference[{}], consecutive{3, 3, 3}, 1e-16, badIndexMsg)
+
+    local shouldErrorMsg = "Lookup with too many indices should error"
+    mytester:assertError(function() return reference[{1, 1, 1, 1}] end, shouldErrorMsg)
+    mytester:assertError(function() return reference[{1, 1, 1, {1, 1}}] end, shouldErrorMsg)
+    mytester:assertError(function() return reference[{3, 3, 3, 3, 3, 3, 3, 3}] end, shouldErrorMsg)
+end
+
+-- Check __newindex on tensors: assigning a tensor or scalar through partial
+-- indices, and that over-indexed assignment raises an error.
+function torchtest.newIndex()
+    local badIndexMsg = "Assignment to valid index should produce correct result"
+    local reference = consecutive{3, 3, 3}
+    -- This relies on __index__() being correct - but we have separate tests for that
+    local function checkPartialAssign(index)
+        local reference = torch.zeros(3, 3, 3)
+        reference[index] = consecutive{3, 3, 3}[index]
+        mytester:assertTensorEq(reference[index], consecutive{3, 3, 3}[index], 1e-16, badIndexMsg)
+        -- Scalar assignment through the same index must restore the zeros.
+        reference[index] = 0
+        mytester:assertTensorEq(reference, torch.zeros(3, 3, 3), 1e-16, badIndexMsg)
+    end
+
+    checkPartialAssign{1}
+    checkPartialAssign{2}
+    checkPartialAssign{3}
+    checkPartialAssign{1,2}
+    checkPartialAssign{2,3}
+    checkPartialAssign{1,3}
+    checkPartialAssign{}
+
+    local shouldErrorMsg = "Assignment with too many indices should error"
+    mytester:assertError(function() reference[{1, 1, 1, 1}] = 1 end, shouldErrorMsg)
+    mytester:assertError(function() reference[{1, 1, 1, {1, 1}}] = 1 end, shouldErrorMsg)
+    mytester:assertError(function() reference[{3, 3, 3, 3, 3, 3, 3, 3}] = 1 end, shouldErrorMsg)
+end
+
+-- Check indexCopy against a naive per-row copy, for both a 3D tensor
+-- (row copies) and a 1D tensor (scalar copies).
+function torchtest.indexCopy()
+   local nCopy, nDest = 3, 20
+   local dest = torch.randn(nDest,4,5)
+   local src = torch.randn(nCopy,4,5)
+   -- nCopy distinct destination rows chosen at random.
+   local idx = torch.randperm(nDest):narrow(1, 1, nCopy):long()
+   local dest2 = dest:clone()
+   dest:indexCopy(1, idx, src)
+   for i=1,idx:size(1) do
+      dest2[idx[i]]:copy(src[i])
+   end
+   mytester:assertTensorEq(dest, dest2, 0.000001, "indexCopy tensor error")
+
+   local dest = torch.randn(nDest)
+   local src = torch.randn(nCopy)
+   local idx = torch.randperm(nDest):narrow(1, 1, nCopy):long()
+   local dest2 = dest:clone()
+   dest:indexCopy(1, idx, src)
+   for i=1,idx:size(1) do
+      dest2[idx[i]] = src[i]
+   end
+   mytester:assertTensorEq(dest, dest2, 0.000001, "indexCopy scalar error")
+end
+
+-- Check indexAdd against a naive per-row accumulation, for both a 3D tensor
+-- and a 1D tensor. Mirrors torchtest.indexCopy.
+function torchtest.indexAdd()
+   local nCopy, nDest = 3, 20
+   local dest = torch.randn(nDest,4,5)
+   local src = torch.randn(nCopy,4,5)
+   -- nCopy distinct destination rows chosen at random.
+   local idx = torch.randperm(nDest):narrow(1, 1, nCopy):long()
+   local dest2 = dest:clone()
+   dest:indexAdd(1, idx, src)
+   for i=1,idx:size(1) do
+      dest2[idx[i]]:add(src[i])
+   end
+   mytester:assertTensorEq(dest, dest2, 0.000001, "indexAdd tensor error")
+
+   local dest = torch.randn(nDest)
+   local src = torch.randn(nCopy)
+   local idx = torch.randperm(nDest):narrow(1, 1, nCopy):long()
+   local dest2 = dest:clone()
+   dest:indexAdd(1, idx, src)
+   for i=1,idx:size(1) do
+      dest2[idx[i]] = dest2[idx[i]] + src[i]
+   end
+   mytester:assertTensorEq(dest, dest2, 0.000001, "indexAdd scalar error")
+end
+
+-- Fill idx with valid indices.
+-- Fills `idx` (an index tensor for gather/scatter along dimension `dim`)
+-- with valid, distinct indices in [1, dim_size]: along every line of `idx`
+-- in direction `dim`, the entries are a random sample without replacement.
+local function fillIdx(idx, dim, dim_size, elems_per_row, m, n, o)
+   for i = 1, (dim == 1 and 1 or m) do
+      for j = 1, (dim == 2 and 1 or n) do
+         for k = 1, (dim == 3 and 1 or o) do
+            -- ii selects one whole line along `dim` ({} = full slice).
+            local ii = {i, j, k}
+            ii[dim] = {}
+            idx[ii] = torch.randperm(dim_size)[{{1, elems_per_row}}]
+         end
+      end
+   end
+end
+
+-- Check torch.gather against a naive triple loop on a random 3D tensor with
+-- a random gather dimension, and that an out-of-range index errors.
+function torchtest.gather()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   local src = torch.randn(m, n, o)
+   -- idx has src's shape except elems_per_row along the gather dimension.
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, src:size(dim), elems_per_row, m, n, o)
+
+   local actual = torch.gather(src, dim, idx)
+   local expected = torch.Tensor():resize(unpack(idx_size))
+   for i = 1, idx_size[1] do
+      for j = 1, idx_size[2] do
+         for k = 1, idx_size[3] do
+            local ii = {i, j, k}
+            ii[dim] = idx[i][j][k]
+            expected[i][j][k] = src[ii]
+         end
+      end
+   end
+   mytester:assertTensorEq(actual, expected, 0, "Wrong values for gather")
+
+   -- 23 exceeds every dimension (all are <= 20), so it is always invalid.
+   idx[1][1][1] = 23
+   mytester:assertError(function() torch.gather(src, dim, idx) end,
+                        "Invalid index not detected")
+end
+
+-- Check that gathering with the index tensor returned by max() reproduces
+-- the max values themselves.
+function torchtest.gatherMax()
+   local src = torch.randn(3, 4, 5)
+   local expected, idx = src:max(3)
+   local actual = torch.gather(src, 3, idx)
+   mytester:assertTensorEq(actual, expected, 0, "Wrong values for gather")
+end
+
+-- Check tensor-valued scatter against a naive triple loop (the inverse
+-- addressing of gather), and that an out-of-range index errors.
+function torchtest.scatter()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   -- idx and src share a shape: elems_per_row along the scatter dimension.
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, ({m, n, o})[dim], elems_per_row, m, n, o)
+   local src = torch.Tensor():resize(unpack(idx_size)):normal()
+
+   local actual = torch.zeros(m, n, o):scatter(dim, idx, src)
+   local expected = torch.zeros(m, n, o)
+   for i = 1, idx_size[1] do
+      for j = 1, idx_size[2] do
+         for k = 1, idx_size[3] do
+            local ii = {i, j, k}
+            ii[dim] = idx[i][j][k]
+           expected[ii] = src[i][j][k]
+         end
+      end
+   end
+   mytester:assertTensorEq(actual, expected, 0, "Wrong values for scatter")
+
+   -- 34 exceeds every dimension (all are <= 20), so it is always invalid.
+   idx[1][1][1] = 34
+   mytester:assertError(function() torch.zeros(m, n, o):scatter(dim, idx, src) end,
+                        "Invalid index not detected")
+end
+
+-- Check scalar-valued scatter (fill a constant at the indexed positions)
+-- against a naive triple loop, and that an out-of-range index errors.
+function torchtest.scatterFill()
+   local m, n, o = torch.random(10, 20), torch.random(10, 20), torch.random(10, 20)
+   local elems_per_row = torch.random(10)
+   local dim = torch.random(3)
+
+   local val = torch.uniform()
+   local idx_size = {m, n, o}
+   idx_size[dim] = elems_per_row
+   local idx = torch.LongTensor():resize(unpack(idx_size))
+   fillIdx(idx, dim, ({m, n, o})[dim], elems_per_row, m, n, o)
+
+   local actual = torch.zeros(m, n, o):scatter(dim, idx, val)
+   local expected = torch.zeros(m, n, o)
+   for i = 1, idx_size[1] do
+      for j = 1, idx_size[2] do
+         for k = 1, idx_size[3] do
+            local ii = {i, j, k}
+            ii[dim] = idx[i][j][k]
+            expected[ii] = val
+         end
+      end
+   end
+   mytester:assertTensorEq(actual, expected, 0, "Wrong values for scatter")
+
+   -- 28 exceeds every dimension (all are <= 20), so it is always invalid.
+   idx[1][1][1] = 28
+   mytester:assertError(function() torch.zeros(m, n, o):scatter(dim, idx, val) end,
+                        "Invalid index not detected")
+end
+
+-- Check maskedCopy against a naive loop, and its size rules: a source
+-- larger than the mask's popcount is accepted, a smaller one must error.
+function torchtest.maskedCopy()
+   local nCopy, nDest = 3, 10
+   local dest = torch.randn(nDest)
+   local src = torch.randn(nCopy)
+   -- Mask has exactly nCopy ones.
+   local mask = torch.ByteTensor{0,0,0,0,1,0,1,0,1,0}
+   local dest2 = dest:clone()
+   dest:maskedCopy(mask, src)
+   local j = 1
+   for i=1,nDest do
+      if mask[i] == 1 then
+         dest2[i] = src[j]
+         j = j + 1
+      end
+   end
+   mytester:assertTensorEq(dest, dest2, 0.000001, "maskedCopy error")
+
+   -- make source bigger than number of 1s in mask
+   src = torch.randn(nDest)
+   local ok = pcall(dest.maskedCopy, dest, mask, src)
+   mytester:assert(ok, "maskedCopy incorrect complaint when"
+		      .. " src is bigger than mask's one count")
+
+   src = torch.randn(nCopy - 1) -- make src smaller. this should fail
+   local ok = pcall(dest.maskedCopy, dest, mask, src)
+   mytester:assert(not ok, "maskedCopy not erroring when"
+		      .. " src is smaller than mask's one count")
+end
+
+-- Check maskedSelect against a naive loop that collects masked elements
+-- into a table.
+function torchtest.maskedSelect()
+   local nSrc = 10
+   local src = torch.randn(nSrc)
+   -- Random 0/1 mask.
+   local mask = torch.rand(nSrc):mul(2):floor():byte()
+   local dst = torch.Tensor()
+   dst:maskedSelect(src, mask)
+   local dst2 = {}
+   for i=1,nSrc do
+      if mask[i] == 1 then
+         table.insert(dst2, src[i])
+      end
+   end
+   mytester:assertTensorEq(dst, torch.DoubleTensor(dst2), 0.000001, "maskedSelect error")
+end
+
+-- Check maskedFill against a naive loop that writes the fill value at every
+-- masked position.
+function torchtest.maskedFill()
+   local nDst = 10
+   local dst = torch.randn(nDst)
+   -- Random 0/1 mask.
+   local mask = torch.rand(nDst):mul(2):floor():byte()
+   local val = math.random()
+   local dst2 = dst:clone()
+   dst:maskedFill(mask, val)
+   for i=1,nDst do
+      if mask[i] == 1 then
+         dst2[i] = val
+      end
+   end
+   mytester:assertTensorEq(dst, dst2, 0.000001, "maskedFill error")
+end
+
+-- Check abs() across float and integer tensor types by flipping random
+-- signs and verifying abs restores the original values; also checks that
+-- LongTensor uses a 64-bit abs (a 32-bit one would break for 2^31+1).
+function torchtest.abs()
+   local size = 1000
+   local range = 1000
+   local original = torch.rand(size):mul(range)
+   -- Tensor filled with {-1,1}
+   local switch = torch.rand(size):mul(2):floor():mul(2):add(-1)
+
+   local types = {'torch.DoubleTensor', 'torch.FloatTensor', 'torch.LongTensor', 'torch.IntTensor'}
+   for k,t in ipairs(types) do
+      local data = original:type(t)
+      local switch = switch:type(t)
+      local input = torch.cmul(data, switch)
+      mytester:assertTensorEq(input:abs(), data, 1e-16, 'Error in abs() for '..t)
+   end
+
+   -- Checking that the right abs function is called for LongTensor
+   local bignumber = 2^31 + 1
+   local input = torch.LongTensor{-bignumber}
+   mytester:assertgt(input:abs()[1], 0, 'torch.abs(3)')
+end
+
+-- Check that torch.class can create a class inside a module table
+-- (dotted name); the temporary global module is cleaned up afterwards.
+function torchtest.classInModule()
+    -- Need a global for this module
+    _mymodule123 = {}
+    local x = torch.class('_mymodule123.myclass')
+    mytester:assert(x ~= nil, 'Could not create class in module')
+    -- Remove the global
+    _G['_mymodule123'] = nil
+end
+
+-- Check that torch.class can create a top-level (non-module) class.
+function torchtest.classNoModule()
+    local x = torch.class('_myclass123')
+    mytester:assert(x ~= nil, 'Could not create class in module')
+end
+
+-- Check torch.type on torch objects and plain Lua values (table, nil,
+-- number, string).
+function torchtest.type()
+   local objects = {torch.DoubleTensor(), {}, nil, 2, "asdf"}
+   local types = {'torch.DoubleTensor', 'table', 'nil', 'number', 'string'}
+   for i,obj in ipairs(objects) do
+      mytester:assert(torch.type(obj) == types[i], "wrong type "..types[i])
+   end
+end
+
+-- Check torch.isTypeOf with an inheritance chain A <- B, A <- C: matching
+-- by string name and by constructor, including parent matches and the
+-- non-match of sibling classes.
+function torchtest.isTypeOfInheritance()
+   do
+      -- torch.class registers A/B/C as globals; the locals are discarded.
+      local A = torch.class('A')
+      local B, parB = torch.class('B', 'A')
+      local C, parC = torch.class('C', 'A')
+   end
+   local a, b, c = A(), B(), C()
+
+   mytester:assert(torch.isTypeOf(a, 'A'), 'isTypeOf error, string spec')
+   mytester:assert(torch.isTypeOf(a, A), 'isTypeOf error, constructor')
+   mytester:assert(torch.isTypeOf(b, 'B'), 'isTypeOf error child class')
+   mytester:assert(torch.isTypeOf(b, B), 'isTypeOf error child class ctor')
+   mytester:assert(torch.isTypeOf(b, 'A'), 'isTypeOf error: inheritance')
+   mytester:assert(torch.isTypeOf(b, A), 'isTypeOf error: inheritance')
+   mytester:assert(not torch.isTypeOf(c, 'B'), 'isTypeOf error: common parent')
+   mytester:assert(not torch.isTypeOf(c, B), 'isTypeOf error: common parent')
+end
+
+-- Check that torch.isTypeOf does not treat one class name as a match just
+-- because it is a substring of another (e.g. 'TorchDummy' inside
+-- 'OtherTorchDummy'), across both unrelated and inheriting classes.
+function torchtest.isTypeOfPartial()
+    do
+      -- torch.class registers these names as globals.
+      local TorchDummy = torch.class('TorchDummy')
+      local OtherTorchDummy = torch.class('OtherTorchDummy')
+      local TorchMember = torch.class('TorchMember')
+      local OtherTorchMember = torch.class('OtherTorchMember')
+      local FirstTorchMember = torch.class('FirstTorchMember',
+                                           'TorchMember')
+      local SecondTorchMember = torch.class('SecondTorchMember',
+                                            'TorchMember')
+      local ThirdTorchMember = torch.class('ThirdTorchMember',
+                                           'OtherTorchMember')
+   end
+   local td, otd = TorchDummy(), OtherTorchDummy()
+   local tm, ftm, stm, ttm = TorchMember(), FirstTorchMember(),
+      SecondTorchMember(), ThirdTorchMember()
+
+   mytester:assert(not torch.isTypeOf(td, 'OtherTorchDummy'),
+                   'isTypeOf error: incorrect partial match')
+   mytester:assert(not torch.isTypeOf(otd, 'TorchDummy'),
+                   'isTypeOf error: incorrect partial match')
+   mytester:assert(torch.isTypeOf(tm, 'TorchMember'),
+                   'isTypeOf error, string spec')
+   mytester:assert(torch.isTypeOf(tm, TorchMember),
+                   'isTypeOf error, constructor')
+   mytester:assert(torch.isTypeOf(ftm, 'FirstTorchMember'),
+                   'isTypeOf error child class')
+   mytester:assert(torch.isTypeOf(ftm, FirstTorchMember),
+                   'isTypeOf error child class ctor')
+   mytester:assert(torch.isTypeOf(ftm, 'TorchMember'),
+                   'isTypeOf error: inheritance')
+   mytester:assert(torch.isTypeOf(ftm, TorchMember),
+                   'isTypeOf error: inheritance')
+   mytester:assert(not torch.isTypeOf(stm, 'FirstTorchMember'),
+                   'isTypeOf error: common parent')
+   mytester:assert(not torch.isTypeOf(stm, FirstTorchMember),
+                   'isTypeOf error: common parent')
+   mytester:assert(not torch.isTypeOf(ttm, TorchMember),
+                   'isTypeOf error: inheritance')
+   mytester:assert(not torch.isTypeOf(ttm, 'TorchMember'),
+                   'isTypeOf error: inheritance')
+end
+
+-- Check torch.isTypeOf with Lua pattern type specifications
+-- (e.g. 'torch.*Tensor' matching any tensor type).
+function torchtest.isTypeOfPattern()
+   local t = torch.LongTensor()
+   mytester:assert(torch.isTypeOf(t, torch.LongTensor),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(not torch.isTypeOf(t, torch.IntTensor),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(torch.isTypeOf(t, 'torch.LongTensor'),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(not torch.isTypeOf(t, 'torch.Long'),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(torch.isTypeOf(t, 'torch.*Tensor'),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(torch.isTypeOf(t, '.*Long'),
+                   'isTypeOf error: incorrect match')
+   mytester:assert(not torch.isTypeOf(t, 'torch.IntTensor'),
+                   'isTypeOf error: incorrect match')
+end
+
+-- Check torch.isTensor on a tensor, a sub-tensor view, a scalar element
+-- (not a tensor), and through the torch.Tensor.isTensor alias.
+function torchtest.isTensor()
+   local t = torch.randn(3,4)
+   mytester:assert(torch.isTensor(t), 'error in isTensor')
+   mytester:assert(torch.isTensor(t[1]), 'error in isTensor for subTensor')
+   mytester:assert(not torch.isTensor(t[1][2]), 'false positive in isTensor')
+   mytester:assert(torch.Tensor.isTensor(t), 'alias not working')
+end
+
+-- Check torch.isStorage: true for a storage, false for a tensor.
+function torchtest.isStorage()
+  local t = torch.randn(3,4)
+  mytester:assert(torch.isStorage(t:storage()), 'error in isStorage')
+  mytester:assert(not torch.isStorage(t), 'false positive in isStorage')
+end
+
+-- Check view/viewAs in every calling form (sizes, LongStorage, -1 wildcard,
+-- result-tensor variants) and that views share storage with the original.
+function torchtest.view()
+   local tensor = torch.rand(15)
+   local template = torch.rand(3,5)
+   local target = template:size():totable()
+   mytester:assertTableEq(tensor:viewAs(template):size():totable(), target, 'Error in viewAs')
+   mytester:assertTableEq(tensor:view(3,5):size():totable(), target, 'Error in view')
+   mytester:assertTableEq(tensor:view(torch.LongStorage{3,5}):size():totable(), target, 'Error in view using LongStorage')
+   mytester:assertTableEq(tensor:view(-1,5):size():totable(), target, 'Error in view using dimension -1')
+   mytester:assertTableEq(tensor:view(3,-1):size():totable(), target, 'Error in view using dimension -1')
+   -- Filling through the view must be visible in the original (shared storage).
+   local tensor_view = tensor:view(5,3)
+   tensor_view:fill(torch.rand(1)[1])
+   mytester:asserteq((tensor_view-tensor):abs():max(), 0, 'Error in view')
+
+   local target_tensor = torch.Tensor()
+   mytester:assertTableEq(target_tensor:viewAs(tensor, template):size():totable(), target, 'Error in viewAs')
+   mytester:assertTableEq(target_tensor:view(tensor, 3,5):size():totable(), target, 'Error in view')
+   mytester:assertTableEq(target_tensor:view(tensor, torch.LongStorage{3,5}):size():totable(), target, 'Error in view using LongStorage')
+   mytester:assertTableEq(target_tensor:view(tensor, -1,5):size():totable(), target, 'Error in view using dimension -1')
+   mytester:assertTableEq(target_tensor:view(tensor, 3,-1):size():totable(), target, 'Error in view using dimension -1')
+   target_tensor:fill(torch.rand(1)[1])
+   mytester:asserteq((target_tensor-tensor):abs():max(), 0, 'Error in viewAs')
+end
+
+-- Check expand/expandAs in every calling form (sizes, LongStorage,
+-- result-tensor variants) and that each expanded row repeats the source
+-- column (mean over the expanded dimension equals the original).
+function torchtest.expand()
+   local result = torch.Tensor()
+   local tensor = torch.rand(8,1)
+   local template = torch.rand(8,5)
+   local target = template:size():totable()
+   mytester:assertTableEq(tensor:expandAs(template):size():totable(), target, 'Error in expandAs')
+   mytester:assertTableEq(tensor:expand(8,5):size():totable(), target, 'Error in expand')
+   mytester:assertTableEq(tensor:expand(torch.LongStorage{8,5}):size():totable(), target, 'Error in expand using LongStorage')
+   result:expandAs(tensor,template)
+   mytester:assertTableEq(result:size():totable(), target, 'Error in expandAs using result')
+   result:expand(tensor,8,5)
+   mytester:assertTableEq(result:size():totable(), target, 'Error in expand using result')
+   result:expand(tensor,torch.LongStorage{8,5})
+   mytester:assertTableEq(result:size():totable(), target, 'Error in expand using result and LongStorage')
+   mytester:asserteq((result:mean(2):view(8,1)-tensor):abs():max(), 0, 'Error in expand (not equal)')
+end
+
+function torchtest.repeatTensor()
+   local result = torch.Tensor()
+   local tensor = torch.rand(8,4)
+   local size = {3,1,1}
+   local sizeStorage = torch.LongStorage(size)
+   local target = {3,8,4}
+   mytester:assertTableEq(tensor:repeatTensor(unpack(size)):size():totable(), target, 'Error in repeatTensor')
+   mytester:assertTableEq(tensor:repeatTensor(sizeStorage):size():totable(), target, 'Error in repeatTensor using LongStorage')
+   result:repeatTensor(tensor,unpack(size))
+   mytester:assertTableEq(result:size():totable(), target, 'Error in repeatTensor using result')
+   result:repeatTensor(tensor,sizeStorage)
+   mytester:assertTableEq(result:size():totable(), target, 'Error in repeatTensor using result and LongStorage')
+   mytester:asserteq((result:mean(1):view(8,4)-tensor):abs():max(), 0, 'Error in repeatTensor (not equal)')
+end
+
+function torchtest.isSameSizeAs()
+   local ref     = torch.Tensor(3, 4, 9, 10)
+   local smaller = torch.Tensor(3, 4)
+   local other   = torch.Tensor(1, 9, 3, 3)
+   local same    = torch.Tensor(3, 4, 9, 10)
+
+   -- different number of dimensions
+   mytester:assert(not ref:isSameSizeAs(smaller), "wrong answer ")
+   -- same dimensionality, different extents
+   mytester:assert(not ref:isSameSizeAs(other), "wrong answer ")
+   -- identical shape
+   mytester:assert(ref:isSameSizeAs(same), "wrong answer ")
+end
+
+function torchtest.isSetTo()
+   -- isSetTo is true only when two tensors share the same storage AND the
+   -- same view (size/stride/offset) of it.
+   local t1 = torch.Tensor(3, 4, 9, 10)
+   local t2 = torch.Tensor(3, 4, 9, 10)
+   local t3 = torch.Tensor():set(t1)   -- same storage, same view as t1
+   local t4 = t3:reshape(12, 90)       -- same storage, different view
+   mytester:assert(t1:isSetTo(t2) == false, "tensors do not share storage")
+   mytester:assert(t1:isSetTo(t3) == true, "tensor is set to other")
+   mytester:assert(t3:isSetTo(t1) == true, "isSetTo should be symmetric")
+   mytester:assert(t1:isSetTo(t4) == false, "tensors have different view")
+   -- Two storage-less tensors must not be considered set to each other.
+   mytester:assert(not torch.Tensor():isSetTo(torch.Tensor()),
+                   "Tensors with no storages should not appear to be set " ..
+                   "to each other")
+end
+
+function torchtest.equal()
+  -- Contiguous, 1D
+  local t1 = torch.Tensor{3, 4, 9, 10}
+  local t2 = t1:clone()
+  local t3 = torch.Tensor{1, 9, 3, 10}
+  local t4 = torch.Tensor{3, 4, 9}
+  local t5 = torch.Tensor()
+  mytester:assert(t1:equal(t2) == true, "wrong answer ")
+  mytester:assert(t1:equal(t3) == false, "wrong answer ")
+  mytester:assert(t1:equal(t4) == false, "wrong answer ")
+  mytester:assert(t1:equal(t5) == false, "wrong answer ")
+  mytester:assert(torch.equal(t1, t2) == true, "wrong answer ")
+  mytester:assert(torch.equal(t1, t3) == false, "wrong answer ")
+  mytester:assert(torch.equal(t1, t4) == false, "wrong answer ")
+  mytester:assert(torch.equal(t1, t5) == false, "wrong answer ")
+
+  -- Non contiguous, 2D
+  local s = torch.Tensor({{1, 2, 3, 4}, {5, 6, 7, 8}})
+  local s1 = s[{{}, {2, 3}}]
+  local s2 = s1:clone()
+  local s3 = torch.Tensor({{2, 3}, {6, 7}})
+  local s4 = torch.Tensor({{0, 0}, {0, 0}})
+
+  mytester:assert(not s1:isContiguous(), "wrong answer ")
+  mytester:assert(s1:equal(s2) == true, "wrong answer ")
+  mytester:assert(s1:equal(s3) == true, "wrong answer ")
+  mytester:assert(s1:equal(s4) == false, "wrong answer ")
+  mytester:assert(torch.equal(s1, s2) == true, "wrong answer ")
+  mytester:assert(torch.equal(s1, s3) == true, "wrong answer ")
+  mytester:assert(torch.equal(s1, s4) == false, "wrong answer ")
+end
+
+function torchtest.isSize()
+  local t1 = torch.Tensor(3, 4, 5)
+  local s1 = torch.LongStorage({3, 4, 5})
+  local s2 = torch.LongStorage({5, 4, 3})
+
+   mytester:assert(t1:isSize(s1) == true, "wrong answer ")
+   mytester:assert(t1:isSize(s2) == false, "wrong answer ")
+   mytester:assert(t1:isSize(t1:size()) == true, "wrong answer ")
+end
+
+function torchtest.elementSize()
+  local byte   =   torch.ByteStorage():elementSize()
+  local char   =   torch.CharStorage():elementSize()
+  local short  =  torch.ShortStorage():elementSize()
+  local int    =    torch.IntStorage():elementSize()
+  local long   =   torch.LongStorage():elementSize()
+  local float  =  torch.FloatStorage():elementSize()
+  local double = torch.DoubleStorage():elementSize()
+
+  mytester:asserteq(byte,   torch.ByteTensor():elementSize())
+  mytester:asserteq(char,   torch.CharTensor():elementSize())
+  mytester:asserteq(short,  torch.ShortTensor():elementSize())
+  mytester:asserteq(int,    torch.IntTensor():elementSize())
+  mytester:asserteq(long,   torch.LongTensor():elementSize())
+  mytester:asserteq(float,  torch.FloatTensor():elementSize())
+  mytester:asserteq(double, torch.DoubleTensor():elementSize())
+
+  mytester:assertne(byte, 0)
+  mytester:assertne(char, 0)
+  mytester:assertne(short, 0)
+  mytester:assertne(int, 0)
+  mytester:assertne(long, 0)
+  mytester:assertne(float, 0)
+  mytester:assertne(double, 0)
+
+  -- These tests are portable, not necessarily strict for your system.
+  mytester:asserteq(byte, 1)
+  mytester:asserteq(char, 1)
+  mytester:assert(short >= 2)
+  mytester:assert(int >= 2)
+  mytester:assert(int >= short)
+  mytester:assert(long >= 4)
+  mytester:assert(long >= int)
+  mytester:assert(double >= float)
+end
+
+function torchtest.split()
+   local result = {}
+   local tensor = torch.rand(7,4)
+   local splitSize = 3
+   local targetSize = {{3,4},{3,4},{1,4}}
+   local dim = 1
+   local splits = tensor:split(splitSize, dim)
+   local start = 1
+   for i, split in ipairs(splits) do
+      mytester:assertTableEq(split:size():totable(), targetSize[i], 'Size error in split '..i)
+      mytester:assertTensorEq(tensor:narrow(dim, start, targetSize[i][dim]), split, 0.00001, 'Content error in split '..i)
+      start = start + targetSize[i][dim]
+   end
+   torch.split(result, tensor, splitSize, dim)
+   local start = 1
+   for i, split in ipairs(result) do
+      mytester:assertTableEq(split:size():totable(), targetSize[i], 'Result size error in split '..i)
+      mytester:assertTensorEq(tensor:narrow(dim, start, targetSize[i][dim]), split, 0.000001, 'Result content error in split '..i)
+      start = start + targetSize[i][dim]
+   end
+   mytester:asserteq(#splits, #result, 'Non-consistent output size from split')
+   for i, split in ipairs(splits) do
+      mytester:assertTensorEq(split,result[i], 0, 'Non-consistent outputs from split')
+   end
+end
+
+function torchtest.chunk()
+   -- Tests Tensor:chunk (split into a given number of roughly equal chunks
+   -- along `dim`) and the result-table variant torch.chunk.
+   local result = {}
+   local tensor = torch.rand(4,7)
+   local nChunk = 3
+   local targetSize = {{4,3},{4,3},{4,1}}   -- ceil(7/3)=3 per chunk, remainder 1
+   local dim = 2
+   local splits = tensor:chunk(nChunk, dim)
+   local start = 1
+   for i, split in ipairs(splits) do
+      mytester:assertTableEq(split:size():totable(), targetSize[i], 'Size error in chunk '..i)
+      mytester:assertTensorEq(tensor:narrow(dim, start, targetSize[i][dim]), split, 0.00001, 'Content error in chunk '..i)
+      start = start + targetSize[i][dim]
+   end
+   -- Bugfix: exercise torch.chunk here. The original called
+   -- torch.split(result, tensor, nChunk, dim), which only produced the same
+   -- chunks by coincidence because ceil(7/nChunk) == nChunk.
+   torch.chunk(result, tensor, nChunk, dim)
+   local start = 1
+   for i, split in ipairs(result) do
+      mytester:assertTableEq(split:size():totable(), targetSize[i], 'Result size error in chunk '..i)
+      mytester:assertTensorEq(tensor:narrow(dim, start, targetSize[i][dim]), split, 0.000001, 'Result content error in chunk '..i)
+      start = start + targetSize[i][dim]
+   end
+end
+
+function torchtest.totable()
+  local table0D = {}
+  local tensor0D = torch.Tensor(table0D)
+  mytester:assertTableEq(torch.totable(tensor0D), table0D, 'tensor0D:totable incorrect')
+
+  local table1D = {1, 2, 3}
+  local tensor1D = torch.Tensor(table1D)
+  local storage = torch.Storage(table1D)
+  mytester:assertTableEq(tensor1D:totable(), table1D, 'tensor1D:totable incorrect')
+  mytester:assertTableEq(storage:totable(), table1D, 'storage:totable incorrect')
+  mytester:assertTableEq(torch.totable(tensor1D), table1D, 'torch.totable incorrect for Tensors')
+  mytester:assertTableEq(torch.totable(storage), table1D, 'torch.totable incorrect for Storages')
+
+  local table2D = {{1, 2}, {3, 4}}
+  local tensor2D = torch.Tensor(table2D)
+  mytester:assertTableEq(tensor2D:totable(), table2D, 'tensor2D:totable incorrect')
+
+  local tensor3D = torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})
+  local tensorNonContig = tensor3D:select(2, 2)
+  mytester:assert(not tensorNonContig:isContiguous(), 'invalid test')
+  mytester:assertTableEq(tensorNonContig:totable(), {{3, 4}, {7, 8}},
+                         'totable() incorrect for non-contiguous tensors')
+end
+
+function torchtest.permute()
+  -- Build a zero tensor whose i-th dimension has extent i, permute it with
+  -- a random permutation, and verify the permuted sizes while leaving the
+  -- source tensor's shape untouched.
+  local sizes = {1,2,3,4,5,6,7}
+  local order = torch.randperm(7):totable()
+  local zeros = torch.Tensor(unpack(sizes)):fill(0)
+  local permutedSizes = zeros:permute(unpack(order)):size():totable()
+  mytester:assertTableEq(order, permutedSizes, 'Tensor:permute incorrect')
+  mytester:assertTableEq(zeros:size():totable(), sizes, 'Tensor:permute changes tensor')
+end
+
+function torchtest.serialize()
+   local tableObj = {6, a = 42}
+   local tensObj = torch.randn(3,4,5)
+
+   -- Test serializing a table
+   local serString = torch.serialize(tableObj)
+   local serStorage = torch.serializeToStorage(tableObj)
+   mytester:assertTableEq(tableObj, torch.deserialize(serString))
+   mytester:assertTableEq(tableObj, torch.deserializeFromStorage(serStorage))
+
+   -- Test serializing a Tensor
+   serString = torch.serialize(tensObj)
+   serStorage = torch.serializeToStorage(tensObj)
+   mytester:assertTensorEq(tensObj, torch.deserialize(serString), 1e-10)
+   mytester:assertTensorEq(tensObj, torch.deserializeFromStorage(serStorage), 1e-10)
+end
+
+function torchtest.storageview()
+   -- A storage constructed from another storage plus an offset must alias
+   -- the original data rather than copy it.
+   local base = torch.LongStorage({3, 4, 5})
+   local view = torch.LongStorage(base, 2)
+
+   mytester:assert(view:size() == 2, "should be size 2")
+   mytester:assert(view[1] == base[2], "should have 4 at position 1")
+   mytester:assert(view[2] == base[3], "should have 5 at position 2")
+
+   -- Writing through the view must be visible in the base storage.
+   view[1] = 13
+   mytester:assert(13 == base[2], "should have 13 at position 1")
+end
+
+function torchtest.nonzero()
+  local nSrc = 12
+
+  local types = {
+      'torch.ByteTensor',
+      'torch.CharTensor',
+      'torch.ShortTensor',
+      'torch.IntTensor',
+      'torch.FloatTensor',
+      'torch.DoubleTensor',
+      'torch.LongTensor',
+  }
+
+  local shapes = {
+      torch.LongStorage{12},
+      torch.LongStorage{12, 1},
+      torch.LongStorage{1, 12},
+      torch.LongStorage{6, 2},
+      torch.LongStorage{3, 2, 2},
+  }
+
+  for _, type in ipairs(types) do
+    local tensor = torch.rand(nSrc):mul(2):floor():type(type)
+      for _, shape in ipairs(shapes) do
+        tensor = tensor:reshape(shape)
+        local dst1 = torch.nonzero(tensor)
+        local dst2 = tensor:nonzero()
+        -- Does not work. Torch uses the first argument to determine what
+        -- type the Tensor is expected to be. In our case the second argument
+        -- determines the type of Tensor.
+        --local dst3 = torch.LongTensor()
+        --torch.nonzero(dst3, tensor)
+        -- However, there are workarounds to this issue when it is desired to
+        -- use an existing tensor for the result:
+        local dst4 = torch.LongTensor()
+        tensor.nonzero(dst4, tensor)
+        if shape:size() == 1 then
+          local dst = {}
+          for i = 1 , nSrc do
+            if tensor[i] ~= 0 then
+              table.insert(dst, i)
+            end
+          end
+          mytester:assertTensorEq(dst1:select(2, 1), torch.LongTensor(dst), 0.0,
+                                  "nonzero error")
+          mytester:assertTensorEq(dst2:select(2, 1), torch.LongTensor(dst), 0.0,
+                                  "nonzero error")
+          --mytester:assertTensorEq(dst3:select(2, 1), torch.LongTensor(dst),
+          --                        0.0,  "nonzero error")
+          mytester:assertTensorEq(dst4:select(2, 1), torch.LongTensor(dst), 0.0,
+                                  "nonzero error")
+        elseif shape:size() == 2 then
+          -- This test will allow through some false positives. It only checks
+          -- that the elements flagged positive are indeed non-zero.
+          for i=1,dst1:size()[1] do
+            mytester:assert(tensor[dst1[i][1]][dst1[i][2]] ~= 0)
+          end
+        elseif shape:size() == 3 then
+          -- This test will allow through some false positives. It only checks
+          -- that the elements flagged positive are indeed non-zero.
+          for i=1,dst1:size()[1] do
+            mytester:assert(tensor[dst1[i][1]][dst1[i][2]][dst1[i][3]] ~= 0)
+          end
+        end
+      end
+   end
+
+end
+
+function torchtest.testheaptracking()
+  -- Remember the current setting so it can be restored afterwards; an
+  -- unset flag is treated as disabled.
+  local previous = torch._heaptracking
+  if previous == nil then
+    previous = false
+  end
+
+  -- Toggling the setting must be reflected in torch._heaptracking.
+  torch.setheaptracking(true)
+  mytester:assert(torch._heaptracking == true, 'Heap tracking expected true')
+
+  torch.setheaptracking(false)
+  mytester:assert(torch._heaptracking == false, 'Heap tracking expected false')
+
+  -- put heap tracking to its original state
+  torch.setheaptracking(previous)
+end
+
+function torchtest.bernoulli()
+  -- Draws Bernoulli samples into a ByteTensor, first with a scalar
+  -- probability and then with a per-element probability tensor, and checks
+  -- that every sampled value is strictly 0 or 1.
+  local size = torch.LongStorage{10, 10}
+  local t = torch.ByteTensor(size)
+
+  -- true iff every element of t is either 0 or 1
+  local function isBinary(t)
+    return torch.ne(t, 0):cmul(torch.ne(t, 1)):sum() == 0
+  end
+
+  local p = 0.5
+  t:bernoulli(p)
+  mytester:assert(isBinary(t), 'Sample from torch.bernoulli is not binary')
+
+  -- element-wise probabilities
+  local p = torch.rand(size)
+  t:bernoulli(p)
+  mytester:assert(isBinary(t), 'Sample from torch.bernoulli is not binary')
+end
+
+function torch.test(tests)
+   torch.setheaptracking(true)
+   math.randomseed(os.time())
+   if torch.getdefaulttensortype() == 'torch.FloatTensor' then
+      precision = 1e-4
+   elseif  torch.getdefaulttensortype() == 'torch.DoubleTensor' then
+      precision = 1e-8
+   end
+   mytester = torch.Tester()
+   mytester:add(torchtest)
+   mytester:run(tests)
+   return mytester
+end
diff --git a/test/test_Multinomial.lua b/test/test_Multinomial.lua
new file mode 100644
index 0000000..9069ecb
--- /dev/null
+++ b/test/test_Multinomial.lua
@@ -0,0 +1,25 @@
+-- Test multinomial for rare events (based on https://github.com/torch/torch7/issues/418)
+-- and for performance (cf. https://github.com/torch/torch7/issues/453)
+
+sys.tic()
+do
+   local p = torch.FloatTensor(1001000):fill(1)
+   p:narrow(1, 50001, 50000):fill(1e-3)
+   p:div(p:sum())
+   local N = 1001000
+
+   local n = 0
+   local c = torch.LongTensor(p:nElement()):zero()
+   local c_ptr = c:data() - 1
+   local tmp = torch.LongTensor()
+   for i = 1, 100 do
+      p.multinomial(tmp, p, N, true);
+      n = n + N
+      tmp:apply(function(i) c_ptr[i] = c_ptr[i] + 1 end)
+   end
+
+   local actual = c:narrow(1, 50001, 50000):sum()
+   local expected = n*p:narrow(1, 50001, 50000):sum()
+   print('Actual, Expected: ', actual, expected)
+end
+print('Time spent: ', sys.toc())
diff --git a/test/test_Tester.lua b/test/test_Tester.lua
new file mode 100644
index 0000000..a283360
--- /dev/null
+++ b/test/test_Tester.lua
@@ -0,0 +1,626 @@
+require 'torch'
+
+local tester = torch.Tester()
+
+local MESSAGE = "a really useful informative error message"
+
+local subtester = torch.Tester()
+-- The message only interests us in case of failure
+subtester._success = function(self) return true, MESSAGE end
+subtester._failure = function(self, message) return false, message end
+
+local tests = torch.TestSuite()
+
+local test_name_passed_to_setUp
+local calls_to_setUp = 0
+local calls_to_tearDown = 0
+
+local originalIoWrite = io.write
+local function disableIoWrite()
+   io.write = function() end
+end
+local function enableIoWrite()
+   io.write = originalIoWrite
+end
+
+local function meta_assert_success(success, message)
+   tester:assert(success == true, "assert wasn't successful")
+   tester:assert(string.find(message, MESSAGE) ~= nil, "message doesn't match")
+end
+local function meta_assert_failure(success, message)
+   tester:assert(success == false, "assert didn't fail")
+   tester:assert(string.find(message, MESSAGE) ~= nil, "message doesn't match")
+end
+
+function tests.really_test_assert()
+   assert((subtester:assert(true, MESSAGE)),
+          "subtester:assert doesn't actually work!")
+   assert(not (subtester:assert(false, MESSAGE)),
+          "subtester:assert doesn't actually work!")
+end
+
+function tests.setEarlyAbort()
+   disableIoWrite()
+
+   for _, earlyAbort in ipairs{false, true} do
+      local myTester = torch.Tester()
+
+      local invokedCount = 0
+      local myTests = {}
+      function myTests.t1()
+         invokedCount = invokedCount + 1
+         myTester:assert(false)
+      end
+      myTests.t2 = myTests.t1
+
+      myTester:setEarlyAbort(earlyAbort)
+      myTester:add(myTests)
+      pcall(myTester.run, myTester)
+
+      tester:assert(invokedCount == (earlyAbort and 1 or 2),
+                    "wrong number of tests invoked for use with earlyAbort")
+   end
+
+   enableIoWrite()
+end
+
+function tests.setRethrowErrors()
+   disableIoWrite()
+
+   -- With rethrowing enabled, an error raised inside a test must escape
+   -- from run() instead of being recorded as a test failure.
+   local rethrowTester = torch.Tester()
+   rethrowTester:setRethrowErrors(true)
+   rethrowTester:add(function() error("a throw") end)
+
+   tester:assertErrorPattern(function() rethrowTester:run() end,
+                             "a throw",
+                             "error should be rethrown")
+
+   enableIoWrite()
+end
+
+function tests.disable()
+   disableIoWrite()
+
+   for disableCount = 1, 2 do
+      local myTester = torch.Tester()
+      local tests = {}
+      local test1Invoked = false
+      local test2Invoked = false
+      function tests.test1()
+         test1Invoked = true
+      end
+      function tests.test2()
+         test2Invoked = true
+      end
+      myTester:add(tests)
+
+      if disableCount == 1 then
+         myTester:disable('test1'):run()
+         tester:assert((not test1Invoked) and test2Invoked,
+                       "disabled test shouldn't have been invoked")
+      else
+         myTester:disable({'test1', 'test2'}):run()
+         tester:assert((not test1Invoked) and (not test2Invoked),
+                       "disabled tests shouldn't have been invoked")
+      end
+   end
+
+   enableIoWrite()
+end
+
+function tests.assert()
+   meta_assert_success(subtester:assert(true, MESSAGE))
+   meta_assert_failure(subtester:assert(false, MESSAGE))
+end
+
+local function testEqNe(eqExpected, ...)
+   if eqExpected then
+      meta_assert_success(subtester:eq(...))
+      meta_assert_failure(subtester:ne(...))
+   else
+      meta_assert_failure(subtester:eq(...))
+      meta_assert_success(subtester:ne(...))
+   end
+end
+
+--[[ Test :assertGeneralEq and :assertGeneralNe (also known as :eq and :ne).
+
+Note that in-depth testing of testing of many specific types of data (such as
+Tensor) is covered below, when we test specific functions (such as
+:assertTensorEq). This just does a general check, as well as testing of testing
+of mixed datatypes.
+]]
+function tests.assertGeneral()
+   local one = torch.Tensor{1}
+
+   testEqNe(true, one, one, MESSAGE)
+   testEqNe(false, one, 1, MESSAGE)
+   testEqNe(true, "hi", "hi", MESSAGE)
+   testEqNe(true, {one, 1}, {one, 1}, MESSAGE)
+   testEqNe(true, {{{one}}}, {{{one}}}, MESSAGE)
+   testEqNe(false, {{{one}}}, {{one}}, MESSAGE)
+   testEqNe(true, torch.Storage{1}, torch.Storage{1}, MESSAGE)
+   testEqNe(false, torch.FloatStorage{1}, torch.LongStorage{1}, MESSAGE)
+   testEqNe(false, torch.Storage{1}, torch.Storage{1, 2}, MESSAGE)
+   testEqNe(false, "one", 1, MESSAGE)
+   testEqNe(false, {one}, {one + torch.Tensor{1e-10}}, MESSAGE)
+   testEqNe(true, {one}, {one + torch.Tensor{1e-10}}, 1e-9, MESSAGE)
+end
+
+function tests.assertlt()
+   meta_assert_success(subtester:assertlt(1, 2, MESSAGE))
+   meta_assert_failure(subtester:assertlt(2, 1, MESSAGE))
+   meta_assert_failure(subtester:assertlt(1, 1, MESSAGE))
+end
+
+function tests.assertgt()
+   meta_assert_success(subtester:assertgt(2, 1, MESSAGE))
+   meta_assert_failure(subtester:assertgt(1, 2, MESSAGE))
+   meta_assert_failure(subtester:assertgt(1, 1, MESSAGE))
+end
+
+function tests.assertle()
+   meta_assert_success(subtester:assertle(1, 2, MESSAGE))
+   meta_assert_failure(subtester:assertle(2, 1, MESSAGE))
+   meta_assert_success(subtester:assertle(1, 1, MESSAGE))
+end
+
+function tests.assertge()
+   meta_assert_success(subtester:assertge(2, 1, MESSAGE))
+   meta_assert_failure(subtester:assertge(1, 2, MESSAGE))
+   meta_assert_success(subtester:assertge(1, 1, MESSAGE))
+end
+
+function tests.asserteq()
+   meta_assert_success(subtester:asserteq(1, 1, MESSAGE))
+   meta_assert_failure(subtester:asserteq(1, 2, MESSAGE))
+end
+
+function tests.assertalmosteq()
+   meta_assert_success(subtester:assertalmosteq(1, 1, MESSAGE))
+   meta_assert_success(subtester:assertalmosteq(1, 1 + 1e-17, MESSAGE))
+   meta_assert_success(subtester:assertalmosteq(1, 2, 2, MESSAGE))
+   meta_assert_failure(subtester:assertalmosteq(1, 2, MESSAGE))
+   meta_assert_failure(subtester:assertalmosteq(1, 3, 1, MESSAGE))
+end
+
+function tests.assertne()
+   meta_assert_success(subtester:assertne(1, 2, MESSAGE))
+   meta_assert_failure(subtester:assertne(1, 1, MESSAGE))
+end
+
+-- The `alsoTestEq` flag is provided to test :eq in addition to :assertTensorEq.
+-- The behaviour of the two isn't always the same due to handling of tensors of
+-- different dimensions but the same number of elements.
+local function testTensorEqNe(eqExpected, alsoTestEq, ...)
+   if eqExpected then
+      meta_assert_success(subtester:assertTensorEq(...))
+      meta_assert_failure(subtester:assertTensorNe(...))
+      if alsoTestEq then
+         meta_assert_success(subtester:eq(...))
+         meta_assert_failure(subtester:ne(...))
+      end
+   else
+      meta_assert_failure(subtester:assertTensorEq(...))
+      meta_assert_success(subtester:assertTensorNe(...))
+      if alsoTestEq then
+         meta_assert_failure(subtester:eq(...))
+         meta_assert_success(subtester:ne(...))
+      end
+   end
+end
+
+function tests.assertTensor_types()
+   local allTypes = {
+         torch.ByteTensor,
+         torch.CharTensor,
+         torch.ShortTensor,
+         torch.IntTensor,
+         torch.LongTensor,
+         torch.FloatTensor,
+         torch.DoubleTensor,
+   }
+   for _, tensor1 in ipairs(allTypes) do
+      for _, tensor2 in ipairs(allTypes) do
+         local t1 = tensor1():ones(10)
+         local t2 = tensor2():ones(10)
+         testTensorEqNe(tensor1 == tensor2, true, t1, t2, 1e-6, MESSAGE)
+      end
+   end
+
+   testTensorEqNe(false, true, torch.FloatTensor(), torch.LongTensor(), MESSAGE)
+end
+
+function tests.assertTensor_sizes()
+   local t = torch.Tensor() -- no dimensions
+   local t2 = torch.ones(2)
+   local t3 = torch.ones(3)
+   local t12 = torch.ones(1, 2)
+   assert(subtester._assertTensorEqIgnoresDims == true) -- default state
+   testTensorEqNe(false, false, t, t2, 1e-6, MESSAGE)
+   testTensorEqNe(false, false, t, t3, 1e-6, MESSAGE)
+   testTensorEqNe(false, false, t, t12, 1e-6, MESSAGE)
+   testTensorEqNe(false, false, t2, t3, 1e-6, MESSAGE)
+   testTensorEqNe(true, false, t2, t12, 1e-6, MESSAGE)
+   testTensorEqNe(false, false, t3, t12, 1e-6, MESSAGE)
+   subtester._assertTensorEqIgnoresDims = false
+   testTensorEqNe(false, true, t, t2, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t, t3, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t, t12, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t2, t3, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t2, t12, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t3, t12, 1e-6, MESSAGE)
+   subtester._assertTensorEqIgnoresDims = true -- reset back
+end
+
+function tests.assertTensor_epsilon()
+   local t1 = torch.rand(100, 100)
+   local t2 = torch.rand(100, 100) * 1e-5
+   local t3 = t1 + t2
+   testTensorEqNe(true, true, t1, t3, 1e-4, MESSAGE)
+   testTensorEqNe(false, true, t1, t3, 1e-6, MESSAGE)
+end
+
+function tests.assertTensor_arg()
+   local one = torch.Tensor{1}
+
+   tester:assertErrorPattern(
+         function() subtester:assertTensorEq(one, 2) end,
+         "Second argument should be a Tensor")
+
+   -- Test that assertTensorEq support message and tolerance in either ordering
+   tester:assertNoError(
+         function() subtester:assertTensorEq(one, one, 0.1, MESSAGE) end)
+   tester:assertNoError(
+         function() subtester:assertTensorEq(one, one, MESSAGE, 0.1) end)
+end
+
+function tests.assertTensor()
+   local t1 = torch.randn(100, 100)
+   local t2 = t1:clone()
+   local t3 = torch.randn(100, 100)
+   testTensorEqNe(true, true, t1, t2, 1e-6, MESSAGE)
+   testTensorEqNe(false, true, t1, t3, 1e-6, MESSAGE)
+   testTensorEqNe(true, true, torch.Tensor(), torch.Tensor(), MESSAGE)
+end
+
+-- Check that calling assertTensorEq with two tensors with the same content but
+-- different dimensions gives a warning.
+function tests.assertTensorDimWarning()
+   local myTester = torch.Tester()
+   myTester:add(
+       function()
+          myTester:assertTensorEq(torch.Tensor{{1}}, torch.Tensor{1})
+       end)
+
+   local warningGiven = false
+   io.write = function(s)
+      if string.match(s, 'but different dimensions') then
+         warningGiven = true
+      end
+   end
+
+   myTester:run()
+   enableIoWrite()
+
+   tester:assert(warningGiven,
+                 "Calling :assertTensorEq({{1}}, {1}) should give a warning")
+end
+
+local function testTableEqNe(eqExpected, ...)
+   if eqExpected then
+      meta_assert_success(subtester:assertTableEq(...))
+      meta_assert_failure(subtester:assertTableNe(...))
+      meta_assert_success(subtester:eq(...))
+      meta_assert_failure(subtester:ne(...))
+   else
+      meta_assert_failure(subtester:assertTableEq(...))
+      meta_assert_success(subtester:assertTableNe(...))
+      meta_assert_failure(subtester:eq(...))
+      meta_assert_success(subtester:ne(...))
+   end
+end
+
+function tests.assertTable()
+   testTableEqNe(true, {1, 2, 3}, {1, 2, 3}, MESSAGE)
+   testTableEqNe(false, {1, 2, 3}, {3, 2, 1}, MESSAGE)
+   testTableEqNe(true, {1, 2, {4, 5}}, {1, 2, {4, 5}}, MESSAGE)
+   testTableEqNe(false, {1, 2, 3}, {1,2}, MESSAGE)
+   testTableEqNe(false, {1, 2, 3}, {1, 2, 3, 4}, MESSAGE)
+   testTableEqNe(true, {{1}}, {{1}}, MESSAGE)
+   testTableEqNe(false, {{1}}, {{{1}}}, MESSAGE)
+   testTableEqNe(true, {false}, {false}, MESSAGE)
+   testTableEqNe(false, {true}, {false}, MESSAGE)
+   testTableEqNe(false, {false}, {true}, MESSAGE)
+
+   local tensor = torch.rand(100, 100)
+   local t1 = {1, "a", key = "value", tensor = tensor, subtable = {"nested"}}
+   local t2 = {1, "a", key = "value", tensor = tensor, subtable = {"nested"}}
+   testTableEqNe(true, t1, t2, MESSAGE)
+   for k, v in pairs(t1) do
+      local x = "something else"
+      t2[k] = nil
+      t2[x] = v
+      testTableEqNe(false, t1, t2, MESSAGE)
+      t2[x] = nil
+      t2[k] = x
+      testTableEqNe(false, t1, t2, MESSAGE)
+      t2[k] = v
+      testTableEqNe(true, t1, t2, MESSAGE)
+   end
+end
+
+local function good_fn() end
+local function bad_fn() error("muahaha!") end
+
+function tests.assertError()
+   meta_assert_success(subtester:assertError(bad_fn, MESSAGE))
+   meta_assert_failure(subtester:assertError(good_fn, MESSAGE))
+end
+
+function tests.assertNoError()
+   meta_assert_success(subtester:assertNoError(good_fn, MESSAGE))
+   meta_assert_failure(subtester:assertNoError(bad_fn, MESSAGE))
+end
+
+function tests.assertErrorPattern()
+   meta_assert_success(subtester:assertErrorPattern(bad_fn, "haha", MESSAGE))
+   meta_assert_failure(subtester:assertErrorPattern(bad_fn, "hehe", MESSAGE))
+end
+
+function tests.testSuite_duplicateTests()
+   -- Defining two tests under the same name inside a TestSuite must raise.
+   local function defineTwice()
+      local suite = torch.TestSuite()
+      function suite.testThis() end
+      function suite.testThis() end
+   end
+   tester:assertErrorPattern(defineTwice,
+                             "Test testThis is already defined.")
+end
+
+--[[ Returns a Tester with `numSuccess` success cases, `numFailure` failure
+  cases, and with an error if `hasError` is true.
+  Success and fail tests are evaluated with tester:eq
+]]
+local function genDummyTest(numSuccess, numFailure, hasError)
+   hasError = hasError or false
+
+   local dummyTester = torch.Tester()
+   local dummyTests = torch.TestSuite()
+
+   if numSuccess > 0 then
+      function dummyTests.testDummySuccess()
+         for i = 1, numSuccess do
+           dummyTester:eq({1}, {1}, '', 0)
+         end
+      end
+   end
+
+   if numFailure > 0 then
+      function dummyTests.testDummyFailure()
+         for i = 1, numFailure do
+            dummyTester:eq({1}, {2}, '', 0)
+         end
+      end
+   end
+
+   if hasError then
+      function dummyTests.testDummyError()
+         error('dummy error')
+      end
+   end
+
+   return dummyTester:add(dummyTests)
+end
+
+function tests.runStatusAndAssertCounts()
+   -- Verifies that Tester:run() errors (making pcall fail) exactly when a
+   -- test fails or throws, and that countasserts tallies every :eq call.
+   local emptyTest      = genDummyTest(0, 0, false)
+   local sucTest        = genDummyTest(1, 0, false)
+   local multSucTest    = genDummyTest(4, 0, false)
+   local failTest       = genDummyTest(0, 1, false)
+   local errTest        = genDummyTest(0, 0, true)
+   local errFailTest    = genDummyTest(0, 1, true)
+   local errSucTest     = genDummyTest(1, 0, true)
+   local failSucTest    = genDummyTest(1, 1, false)
+   local failSucErrTest = genDummyTest(1, 1, true)
+
+   disableIoWrite()
+
+   local success, msg = pcall(emptyTest.run, emptyTest)
+   tester:asserteq(success, true, "pcall should succeed for empty tests")
+
+   local success, msg = pcall(sucTest.run, sucTest)
+   tester:asserteq(success, true, "pcall should succeed for 1 successful test")
+
+   local success, msg = pcall(multSucTest.run, multSucTest)
+   tester:asserteq(success, true,
+                   "pcall should succeed for 2+ successful tests")
+
+   local success, msg = pcall(failTest.run, failTest)
+   tester:asserteq(success, false, "pcall should fail for tests with failure")
+
+   local success, msg = pcall(errTest.run, errTest)
+   tester:asserteq(success, false, "pcall should fail for tests with error")
+
+   local success, msg = pcall(errFailTest.run, errFailTest)
+   tester:asserteq(success, false, "pcall should fail for error+fail tests")
+
+   local success, msg = pcall(errSucTest.run, errSucTest)
+   tester:asserteq(success, false, "pcall should fail for error+success tests")
+
+   local success, msg = pcall(failSucTest.run, failSucTest)
+   tester:asserteq(success, false, "pcall should fail for fail+success tests")
+
+   local success, msg = pcall(failSucErrTest.run, failSucErrTest)
+   tester:asserteq(success, false,
+                   "pcall should fail for fail+success+err test")
+
+   enableIoWrite()
+
+   tester:asserteq(emptyTest.countasserts, 0,
+                   "emptyTest should have 0 asserts")
+   tester:asserteq(sucTest.countasserts, 1, "sucTest should have 1 assert")
+   tester:asserteq(multSucTest.countasserts, 4,
+                   "multSucTest should have 4 asserts")
+   tester:asserteq(failTest.countasserts, 1, "failTest should have 1 assert")
+   tester:asserteq(errTest.countasserts, 0, "errTest should have 0 asserts")
+   tester:asserteq(errFailTest.countasserts, 1,
+                   "errFailTest should have 1 assert")
+   -- Bugfix: the message previously claimed "0 asserts" while asserting 1.
+   tester:asserteq(errSucTest.countasserts, 1,
+                   "errSucTest should have 1 assert")
+   tester:asserteq(failSucTest.countasserts, 2,
+                   "failSucTest should have 2 asserts")
+   -- Previously missing: failSucErrTest runs one success and one failure
+   -- assertion (the error test makes no :eq call).
+   tester:asserteq(failSucErrTest.countasserts, 2,
+                   "failSucErrTest should have 2 asserts")
+end
+
+-- Verifies that Tester:add() rejects a nested table of test functions:
+-- a test set must be flat, so adding {{function() end}} must raise an
+-- error whose message matches "Nested sets".
+function tests.checkNestedTestsForbidden()
+   disableIoWrite()  -- silence output from the inner tester (helper defined earlier in this file)
+
+   local myTester = torch.Tester()
+   local myTests = {{function() end}}
+   tester:assertErrorPattern(function() myTester:add(myTests) end,
+                             "Nested sets",
+                             "tester should forbid adding nested test sets")
+
+   enableIoWrite()   -- restore the real io.write
+end
+
+-- Verifies that Tester:assert() called with a non-boolean value (a table)
+-- emits a warning containing 'should only be used for boolean'.
+function tests.checkWarningOnAssertObject()
+   -- This test checks that calling assert with an object generates a warning
+   local myTester = torch.Tester()
+   local myTests = {}
+   function myTests.assertAbuse()
+      myTester:assert({})
+   end
+   myTester:add(myTests)
+
+   local warningGiven = false
+   -- Intercept io.write and flag whether the expected warning text appears.
+   io.write = function(s)
+      if string.match(s, 'should only be used for boolean') then
+         warningGiven = true
+      end
+   end
+
+   myTester:run()
+   enableIoWrite()  -- restore the real io.write
+
+   tester:assert(warningGiven, "Should warn on calling :assert(object)")
+end
+
+-- Verifies that Tester:assertne() called with two tables (reference
+-- comparison on objects) emits a warning about comparing basic types.
+function tests.checkWarningOnAssertNeObject()
+   -- This test checks that calling assertne with two objects generates warning
+   local myTester = torch.Tester()
+   local myTests = {}
+   function myTests.assertAbuse()
+      myTester:assertne({}, {})
+   end
+   myTester:add(myTests)
+
+   local warningGiven = false
+   -- Intercept io.write and flag whether the expected warning text appears.
+   io.write = function(s)
+      if string.match(s, 'assertne should only be used to compare basic') then
+         warningGiven = true
+      end
+   end
+
+   myTester:run()
+   enableIoWrite()  -- restore the real io.write
+
+   tester:assert(warningGiven, "Should warn on calling :assertne(obj, obj)")
+end
+
+-- Verifies that Tester:assert() with a third (unexpected) argument produces
+-- a hard error mentioning 'Unexpected arguments' and makes the run fail.
+function tests.checkWarningOnExtraAssertArguments()
+   -- This test checks that calling assert with extra args gives a lua error
+   local myTester = torch.Tester()
+   local myTests = {}
+   function myTests.assertAbuse()
+      myTester:assert(true, "some message", "extra argument")
+   end
+   myTester:add(myTests)
+
+   local errorGiven = false
+   -- Intercept io.write and flag whether the error text appears.
+   io.write = function(s)
+      if string.match(s, 'Unexpected arguments') then
+         errorGiven = true
+      end
+   end
+   -- run() itself is expected to raise here, hence assertError around it.
+   tester:assertError(function() myTester:run() end)
+   enableIoWrite()  -- restore the real io.write
+
+   tester:assert(errorGiven, ":assert should fail on extra arguments")
+end
+
+-- Verifies that adding a plain Lua table (instead of a torch.TestSuite)
+-- produces a warning recommending TestSuite.
+function tests.checkWarningOnUsingTable()
+   -- Checks that if we don't use a TestSuite then gives a warning
+   local myTester = torch.Tester()
+   local myTests = {}
+   myTester:add(myTests)
+
+   -- (variable is named errorGiven but it captures a warning, not an error)
+   local errorGiven = false
+   io.write = function(s)
+      if string.match(s, 'use TestSuite rather than plain lua table') then
+         errorGiven = true
+      end
+   end
+   myTester:run()
+
+   enableIoWrite()  -- restore the real io.write
+   tester:assert(errorGiven, "Using a plain lua table for testsuite should warn")
+end
+
+-- Verifies that a tester accepts exactly one _setUp and one _tearDown
+-- function: the first add succeeds, a second add of the same name errors
+-- with a message matching "Only one".
+function tests.checkMaxAllowedSetUpAndTearDown()
+   -- Checks can have at most 1 set-up and at most 1 tear-down function
+   local function f() end
+   local myTester = torch.Tester()
+
+   for _, name in ipairs({'_setUp', '_tearDown'}) do
+      tester:assertNoError(function() myTester:add(f, name) end,
+                           "Adding 1 set-up / tear-down should be fine")
+      tester:assertErrorPattern(function() myTester:add(f, name) end,
+                                "Only one",
+                                "Adding second set-up / tear-down should fail")
+   end
+end
+
+-- Checks that _setUp received this test's name (recorded by tests._setUp
+-- below into the global test_name_passed_to_setUp), and that '_setUp'
+-- itself is not listed among the runnable tests.
+function tests.test_setUp()
+   tester:asserteq(test_name_passed_to_setUp, 'test_setUp')
+   for key, value in pairs(tester.tests) do
+      tester:assertne(key, '_setUp')
+   end
+end
+
+-- Checks that '_tearDown' is not listed among the runnable tests.
+function tests.test_tearDown()
+   for key, value in pairs(tester.tests) do
+      tester:assertne(key, '_tearDown')
+   end
+end
+
+-- Set-up hook: records the current test name and counts invocations in the
+-- globals checked by the post-run tests at the bottom of this file.
+function tests._setUp(name)
+   test_name_passed_to_setUp = name
+   calls_to_setUp = calls_to_setUp + 1
+end
+
+-- Tear-down hook: counts invocations (name is unused here).
+function tests._tearDown(name)
+   calls_to_tearDown = calls_to_tearDown + 1
+end
+
+-- Run the main suite, then verify with a second tester that _setUp and
+-- _tearDown each ran exactly once per test.
+tester:add(tests):run()
+
+-- Additional tests to check that _setUp and _tearDown were called.
+local test_count = 0
+for _ in pairs(tester.tests) do
+   test_count = test_count + 1
+end
+local postTests = torch.TestSuite()
+local postTester = torch.Tester()
+
+-- NOTE(review): the 'tester' parameter shadows the file-level tester and is
+-- never used; assertions go through postTester.
+function postTests.test_setUp(tester)
+   postTester:asserteq(calls_to_setUp, test_count,
+                       "Expected " .. test_count .. " calls to _setUp")
+end
+
+function postTests.test_tearDown()
+   postTester:asserteq(calls_to_tearDown, test_count,
+                      "Expected " .. test_count .. " calls to _tearDown")
+end
diff --git a/test/test_qr.lua b/test/test_qr.lua
new file mode 100644
index 0000000..c850c3f
--- /dev/null
+++ b/test/test_qr.lua
@@ -0,0 +1,274 @@
+-- This file contains tests for the QR decomposition functions in torch:
+-- torch.qr(), torch.geqrf() and torch.orgqr().
+local torch = require 'torch'
+local tester = torch.Tester()
+local tests = torch.TestSuite()
+
+-- torch.qr() with result tensors given.
+-- Returns a QR function that calls torch.qr() with preallocated result
+-- tensors (created by tensorFunc). The input is cloned so callers' data
+-- is never modified.
+local function qrInPlace(tensorFunc)
+  return function(x)
+    local q, r = tensorFunc(), tensorFunc()
+    torch.qr(q, r, x:clone())
+    return q, r
+  end
+end
+
+-- torch.qr() without result tensors given.
+-- Returns a QR function that lets torch.qr() allocate its own results.
+-- (tensorFunc is accepted for signature symmetry with qrInPlace but unused.)
+local function qrReturned(tensorFunc)
+  return function(x)
+    return torch.qr(x:clone())
+  end
+end
+
+-- torch.geqrf() with result tensors given.
+-- Returns a geqrf function that passes preallocated result/tau tensors and
+-- additionally asserts torch.geqrf() returned those very same tensors
+-- (in-place contract).
+local function geqrfInPlace(tensorFunc)
+  return function(x)
+    local result = tensorFunc()
+    local tau = tensorFunc()
+    local result_, tau_ = torch.geqrf(result, tau, x)
+    assert(torch.pointer(result) == torch.pointer(result_),
+           'expected result, result_ same tensor')
+    assert(torch.pointer(tau) == torch.pointer(tau_),
+           'expected tau, tau_ same tensor')
+    return result_, tau_
+  end
+end
+
+-- torch.orgqr() with result tensors given.
+-- Returns an orgqr function with a preallocated output tensor; asserts the
+-- in-place contract (returned q is the tensor that was passed in).
+local function orgqrInPlace(tensorFunc)
+  return function(result, tau)
+    local q = tensorFunc()
+    local q_ = torch.orgqr(q, result, tau)
+    assert(torch.pointer(q) == torch.pointer(q_), 'expected q, q_ same tensor')
+    return q
+  end
+end
+
+-- Test a custom QR routine that calls the LAPACK functions manually.
+-- Builds a QR function from the two LAPACK steps: geqrf (Householder
+-- factorization) followed by orgqr (explicit Q). R is the upper triangle of
+-- the first k = min(m, n) rows of the geqrf result; Q is narrowed to its
+-- first k columns so Q (m x k) times R (k x n) reconstructs the input.
+local function qrManual(geqrfFunc, orgqrFunc)
+  return function(x)
+    local m = x:size(1)
+    local n = x:size(2)
+    local k = math.min(m, n)
+    local result, tau = geqrfFunc(x)
+    assert(result:size(1) == m)
+    assert(result:size(2) == n)
+    assert(tau:size(1) == k)
+    local r = torch.triu(result:narrow(1, 1, k))
+    local q = orgqrFunc(result, tau)
+    return q:narrow(2, 1, k), r
+  end
+end
+
+-- Check that Q multiplied with a matrix with ormqr gives the correct result
+-- Checks torch.ormqr() against explicit matrix products with Q for all four
+-- side/transpose combinations (left/right, plain/transposed).
+-- NOTE(review): testOpts and the r from torch.qr are unused here; precision
+-- is hard-coded rather than taken from testOpts.
+local function checkQM(testOpts, mat1, mat2)
+  local q, r = torch.qr(mat1)
+  local m, tau = torch.geqrf(mat1)
+  local requiredPrecision = 1e-5
+  tester:assertTensorEq(torch.mm(q, mat2), torch.ormqr(m, tau, mat2),
+                        requiredPrecision)
+  tester:assertTensorEq(torch.mm(mat2, q), torch.ormqr(m, tau, mat2, 'R'),
+                        requiredPrecision)
+  tester:assertTensorEq(torch.mm(q:t(), mat2),
+                        torch.ormqr(m, tau, mat2, 'L', 'T'), requiredPrecision)
+  tester:assertTensorEq(torch.mm(mat2, q:t()),
+                        torch.ormqr(m, tau, mat2, 'R', 'T'), requiredPrecision)
+end
+
+-- Check that the given `q`, `r` matrices are a valid QR decomposition of `a`.
+-- Checks that `q`, `r` form a valid QR decomposition of `a`: correct shapes
+-- (with k = min(m, n)), Q orthonormal (QᵀQ = I), R upper triangular, and
+-- Q*R reconstructing A within testOpts.precision. If q is not given, the
+-- decomposition is computed with testOpts.qr.
+local function checkQR(testOpts, a, q, r)
+  local qrFunc = testOpts.qr
+  if not q then
+    q, r = qrFunc(a)
+  end
+  local k = math.min(a:size(1), a:size(2))
+  tester:asserteq(q:size(1), a:size(1), "Bad size for q first dimension.")
+  tester:asserteq(q:size(2), k, "Bad size for q second dimension.")
+  tester:asserteq(r:size(1), k, "Bad size for r first dimension.")
+  tester:asserteq(r:size(2), a:size(2), "Bad size for r second dimension.")
+  tester:assertTensorEq(q:t() * q,
+                        torch.eye(q:size(2)):typeAs(testOpts.tensorFunc()),
+                        testOpts.precision,
+                        "Q was not orthogonal")
+  tester:assertTensorEq(r, r:triu(), testOpts.precision,
+                        "R was not upper triangular")
+  tester:assertTensorEq(q * r, a, testOpts.precision, "QR = A")
+end
+
+-- Do a QR decomposition of `a` and check that the result is valid and matches
+-- the given expected `q` and `r`.
+-- Decomposes `a` with testOpts.qr and compares against the expected Q and R.
+-- Both results are first canonicalized (scaled so R's diagonal is
+-- non-negative) since QR is unique only up to those signs; afterwards the
+-- actual q, r also go through the generic checkQR validation.
+local function checkQRWithExpected(testOpts, a, expected_q, expected_r)
+  local qrFunc = testOpts.qr
+  -- Since the QR decomposition is unique only up to the signs of the rows of
+  -- R, we must ensure these are positive before doing the comparison.
+  local function canonicalize(q, r)
+      -- d is a diagonal sign matrix; q*d and d*r flip matching column/row signs.
+      local d = r:diag():sign():diag()
+      return q * d, d * r
+  end
+  local q, r = qrFunc(a)
+  local q_canon, r_canon = canonicalize(q, r)
+  local expected_q_canon, expected_r_canon
+      = canonicalize(expected_q, expected_r)
+  tester:assertTensorEq(q_canon, expected_q_canon, testOpts.precision,
+                        "Q did not match expected")
+  tester:assertTensorEq(r_canon, expected_r_canon, testOpts.precision,
+                        "R did not match expected")
+  checkQR(testOpts, a, q, r)
+end
+
+-- Generate a separate test based on `func` for each of the possible
+-- combinations of tensor type (double or float) and QR function (torch.qr
+-- in-place, torch.qr, and manually calling the geqrf and orgqr from Lua
+-- (both in-place and not).
+--
+-- The tests are added to the given `tests` table, with names generated by
+-- appending a unique string for the specific combination to `name`.
+--
+-- If opts.doubleTensorOnly is true, then the FloatTensor versions of the test
+-- will be skipped.
+-- Registers one test per (tensor type x QR-variant) combination into `tests`.
+-- Each generated test seeds the RNG to 1 for reproducibility and restores
+-- the previous RNG state afterwards. Float runs are skipped when
+-- opts.doubleTensorOnly is set.
+local function addTestVariations(tests, name, func, opts)
+  opts = opts or {}
+  -- Map tensor constructor -> comparison precision for that precision class.
+  local tensorTypes = {
+      [torch.DoubleTensor] = 1e-12,
+      [torch.FloatTensor] = 1e-5,
+  }
+  for tensorFunc, requiredPrecision in pairs(tensorTypes) do
+    local qrFuncs = {
+        ['inPlace'] = qrInPlace(tensorFunc),
+        ['returned'] = qrReturned(tensorFunc),
+        ['manualInPlace'] = qrManual(geqrfInPlace(tensorFunc),
+                                     orgqrInPlace(tensorFunc)),
+        ['manualReturned'] = qrManual(torch.geqrf, torch.orgqr)
+    }
+    for qrName, qrFunc in pairs(qrFuncs) do
+      local testOpts = {
+          tensorFunc=tensorFunc,
+          precision=requiredPrecision,
+          qr=qrFunc,
+      }
+      local tensorType = tensorFunc():type()
+      local fullName = name .. "_" .. qrName .. "_" .. tensorType
+      -- Guard against accidental name collisions between variations.
+      assert(not tests[fullName])
+      if tensorType == 'torch.DoubleTensor' or not opts.doubleTensorOnly then
+        tests[fullName] = function()
+          local state = torch.getRNGState()
+          torch.manualSeed(1)
+          func(testOpts)
+          torch.setRNGState(state)
+        end
+      end
+    end
+  end
+end
+
+-- Decomposing a specific square matrix.
+addTestVariations(tests, 'qrSquare', function(testOpts)
+  return function(testOpts)
+    local tensorFunc = testOpts.tensorFunc
+    local a = tensorFunc{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}
+    local expected_q = tensorFunc{
+        {-1.230914909793328e-01,  9.045340337332914e-01,
+         4.082482904638621e-01},
+        {-4.923659639173310e-01,  3.015113445777629e-01,
+         -8.164965809277264e-01},
+        {-8.616404368553292e-01, -3.015113445777631e-01,
+         4.082482904638634e-01},
+    }
+    local expected_r = tensorFunc{
+        {-8.124038404635959e+00, -9.601136296387955e+00,
+         -1.107823418813995e+01},
+        { 0.000000000000000e+00,  9.045340337332926e-01,
+         1.809068067466585e+00},
+        { 0.000000000000000e+00,  0.000000000000000e+00,
+         -8.881784197001252e-16},
+    }
+    checkQRWithExpected(testOpts, a,  expected_q, expected_r)
+  end
+end, {doubleTensorOnly=true})
+
+-- Decomposing a specific (wide) rectangular matrix.
+-- Wide (3x4) full-rank matrix against precomputed Q (3x3) and R (3x4).
+addTestVariations(tests, 'qrRectFat', function(testOpts)
+  -- The matrix is chosen to be full-rank.
+  local a = testOpts.tensorFunc{
+      {1,  2,  3,  4},
+      {5,  6,  7,  8},
+      {9, 10, 11, 13}
+  }
+  local expected_q = testOpts.tensorFunc{
+      {-0.0966736489045663,  0.907737593658436 ,  0.4082482904638653},
+      {-0.4833682445228317,  0.3157348151855452, -0.8164965809277254},
+      {-0.870062840141097 , -0.2762679632873518,  0.4082482904638621}
+  }
+  local expected_r = testOpts.tensorFunc{
+      { -1.0344080432788603e+01,  -1.1794185166357092e+01,
+        -1.3244289899925587e+01,  -1.5564457473635180e+01},
+      {  0.0000000000000000e+00,   9.4720444555662542e-01,
+         1.8944088911132546e+00,   2.5653453733825331e+00},
+      {  0.0000000000000000e+00,   0.0000000000000000e+00,
+         1.5543122344752192e-15,   4.0824829046386757e-01}
+  }
+  checkQRWithExpected(testOpts, a, expected_q, expected_r)
+end, {doubleTensorOnly=true})
+
+-- Decomposing a specific (thin) rectangular matrix.
+-- Tall (4x3) full-rank matrix against precomputed Q (4x3) and R (3x3).
+addTestVariations(tests, 'qrRectThin', function(testOpts)
+  -- The matrix is chosen to be full-rank.
+  local a = testOpts.tensorFunc{
+      { 1,  2,  3},
+      { 4,  5,  6},
+      { 7,  8,  9},
+      {10, 11, 13},
+  }
+  local expected_q = testOpts.tensorFunc{
+      {-0.0776150525706334, -0.833052161400748 ,  0.3651483716701106},
+      {-0.3104602102825332, -0.4512365874254053, -0.1825741858350556},
+      {-0.5433053679944331, -0.0694210134500621, -0.7302967433402217},
+      {-0.7761505257063329,  0.3123945605252804,  0.5477225575051663}
+  }
+  local expected_r = testOpts.tensorFunc{
+      {-12.8840987267251261, -14.5916298832790581, -17.0753115655393231},
+      {  0,                  -1.0413152017509357,  -1.770235842976589 },
+      {  0,                   0,                    0.5477225575051664}
+  }
+  checkQRWithExpected(testOpts, a, expected_q, expected_r)
+end, {doubleTensorOnly=true})
+
+-- Decomposing a sequence of medium-sized random matrices.
+-- Random matrices of all power-of-two sizes from 1x1 up to 1024x1024.
+addTestVariations(tests, 'randomMediumQR', function(testOpts)
+  for x = 0, 10 do
+    for y = 0, 10 do
+      local m = math.pow(2, x)
+      local n = math.pow(2, y)
+      -- NOTE(review): this local x shadows the outer loop variable x.
+      local x = torch.rand(m, n)
+      checkQR(testOpts, x:typeAs(testOpts.tensorFunc()))
+    end
+  end
+end)
+
+-- Decomposing a sequence of small random matrices.
+-- Random matrices of every size from 1x1 up to 40x40.
+addTestVariations(tests, 'randomSmallQR', function(testOpts)
+  for m = 1, 40 do
+    for n = 1, 40 do
+      checkQR(testOpts, torch.rand(m, n):typeAs(testOpts.tensorFunc()))
+    end
+  end
+end)
+
+-- Decomposing a sequence of small matrices that are not contiguous in memory.
+-- Non-contiguous inputs: random matrices transposed in place (a transposed
+-- view of an m x n tensor with m, n >= 2 is never contiguous).
+addTestVariations(tests, 'randomNonContiguous', function(testOpts)
+  for m = 2, 40 do
+    for n = 2, 40 do
+      local x = torch.rand(m, n):t()
+      tester:assert(not x:isContiguous(), "x should not be contiguous")
+      checkQR(testOpts, x:typeAs(testOpts.tensorFunc()))
+    end
+  end
+end)
+
+-- Single ormqr consistency check on a random square pair (see checkQM).
+-- The rectangular case below is left disabled.
+function tests.testQM()
+  checkQM({}, torch.randn(10, 10), torch.randn(10, 10))
+  -- checkQM({}, torch.randn(20, 10), torch.randn(20, 20))
+end
+
+tester:add(tests)
+tester:run()
diff --git a/test/test_sharedmem.lua b/test/test_sharedmem.lua
new file mode 100644
index 0000000..14cdeaf
--- /dev/null
+++ b/test/test_sharedmem.lua
@@ -0,0 +1,76 @@
+require 'torch'
+
+local tester = torch.Tester()
+local tests = torch.TestSuite()
+
+-- Creates a shared-memory-backed storage (isShared + isSharedMem flags).
+-- Defaults: FloatStorage, a name derived from os.tmpname() with '/'
+-- replaced by '_', and a random element count in [10000, 20000].
+-- Returns the storage and the shm file name (relative to /dev/shm).
+local function createSharedMemStorage(name, size, storageType)
+  local storageType = storageType or 'FloatStorage'
+  local shmName = name or os.tmpname():gsub('/','_')
+  local isShared = true
+  local isSharedMem = true
+  local nElements = size or torch.random(10000, 20000)
+  local storage = torch[storageType](shmName, isShared, nElements, isSharedMem)
+  return storage, shmName
+end
+
+-- Checks the shm file appears under /dev/shm while the storage is alive and
+-- is removed once the storage is garbage-collected.
+-- NOTE(review): relies on the 'paths' package being loaded (not required in
+-- this file; presumably brought in by 'torch' — verify).
+function tests.createSharedMemFile()
+  local storage, shmName = createSharedMemStorage()
+
+  -- check that file is at /dev/shm
+  tester:assert(paths.filep('/dev/shm/' .. shmName),
+                'Shared memory file does not exist')
+
+  -- collect storage and make sure that file is gone
+  storage = nil
+  collectgarbage()
+  collectgarbage()
+  tester:assert(not paths.filep('/dev/shm/' .. shmName),
+                'Shared memory file still exists')
+end
+
+-- Fills a shared storage with random floats through a tensor view, then
+-- re-reads the backing /dev/shm file with a binary DiskFile and compares
+-- every element.
+function tests.checkContents()
+  local storage, shmName = createSharedMemStorage()
+  local tensor = torch.FloatTensor(storage, 1, torch.LongStorage{storage:size()})
+  tensor:copy(torch.rand(storage:size()))
+
+  local sharedFile = torch.DiskFile('/dev/shm/'..shmName, 'r'):binary()
+  for i = 1, storage:size() do
+    tester:assert(sharedFile:readFloat() == storage[i], 'value is not correct')
+  end
+  sharedFile:close()
+end
+
+-- Maps the same /dev/shm file twice from this process and checks writes
+-- through either mapping are visible through the other.
+function tests.testSharing()
+  -- since we are going to cast numbers into double (lua default)
+  -- we specifically generate double storage
+  local storage, shmName = createSharedMemStorage(nil, nil, 'DoubleStorage')
+  local shmFileName = '/dev/shm/' .. shmName
+  local tensor = torch.DoubleTensor(storage, 1, torch.LongStorage{storage:size()})
+  tensor:copy(torch.rand(storage:size()))
+  local tensorCopy = tensor.new():resizeAs(tensor):copy(tensor)
+
+  -- access the same shared memory file as regular mapping from same process
+  local storage2 = torch.DoubleStorage(shmFileName, true, storage:size())
+  local tensor2 = torch.DoubleTensor(storage2, 1,
+                                     torch.LongStorage{storage2:size()})
+  local tensor2Copy = tensor2.new():resizeAs(tensor2):copy(tensor2)
+
+  tester:assertTensorEq(tensorCopy, tensor2Copy, 0, 'contents don\'t match')
+
+  -- fill tensor 1 with a random value and read from 2
+  local rval = torch.uniform()
+  tensor:fill(rval)
+  for i = 1, tensor2:size(1) do
+    tester:asserteq(tensor2[i], rval, 'content is wrong')
+  end
+
+  -- fill tensor 2 with a random value and read from 1
+  local rval = torch.uniform()
+  tensor2:fill(rval)
+  for i = 1, tensor:size(1) do
+    tester:asserteq(tensor[i], rval, 'content is wrong')
+  end
+end
+
+tester:add(tests)
+tester:run()
diff --git a/test/test_writeObject.lua b/test/test_writeObject.lua
new file mode 100644
index 0000000..1013a96
--- /dev/null
+++ b/test/test_writeObject.lua
@@ -0,0 +1,229 @@
+require 'torch'
+
+local myTester = torch.Tester()
+
+local tests = torch.TestSuite()
+
+
+-- checks that an object can be written and unwritten
+-- returns false if an error occurs
+-- Round-trips `obj` through a binary in-memory file: writes it, seeks back
+-- to the start, reads it back. Asserts both the write and the read succeed
+-- and returns the deserialized copy.
+local function serializeAndDeserialize(obj)
+   local file = torch.MemoryFile()
+   file:binary()
+   local ok, msg = pcall (file.writeObject, file, obj)
+   myTester:assert(ok, 'error in writing an object'  )
+   file:seek(1)
+   local ok, copy = pcall(file.readObject, file)
+   if not ok then print(copy) end  -- on failure, copy holds the error message
+   myTester:assert(ok, 'error in reading an object ')
+   return copy
+end
+
+-- A closure capturing an upvalue that is nil must survive serialization
+-- and still produce the same result.
+function tests.test_can_write_a_nil_closure()
+  local a
+  local function closure()
+    if not a then return 1 end
+    return 0
+  end
+
+  local copyClosure = serializeAndDeserialize(closure)
+  myTester:assert(copyClosure() == closure(), 'the closures should give same output')
+end
+
+-- A closure mixing nil and non-nil upvalues must round-trip correctly
+-- (the nil upvalue must not shift the others).
+function tests.test_nil_upvalues_in_closure()
+  local a = 1
+  local b
+  local c = 2
+  local function closure()
+    if not b then return c end
+    return a
+  end
+
+  local copyClosure = serializeAndDeserialize(closure)
+  myTester:assert(copyClosure() == closure(), 'the closures should give same output')
+end
+
+-- A closure that calls a global function (tonumber) and captures a local
+-- string must round-trip and compute the same value.
+function tests.test_global_function_in_closure()
+  local x = "5"
+  local function closure(str)
+    return tonumber(str .. x)
+  end
+
+  local copyClosure = serializeAndDeserialize(closure)
+  myTester:assert(copyClosure("3") == closure("3"), 'the closures should give same output')
+end
+
+-- A self-referential closure (foo captured as its own upvalue) must
+-- round-trip and still recurse correctly.
+function tests.test_a_recursive_closure()
+  local foo
+
+  foo = function (level)
+    if level == 1 then return 1 end
+    return 1+foo(level-1)
+  end
+
+  local copyFoo = serializeAndDeserialize(foo)
+  myTester:assert(copyFoo(42) == foo(42), 'the closures should give same output')
+end
+
+-- A tensor must round-trip; equality is checked via the norm.
+function tests.test_a_tensor()
+   local x = torch.rand(5, 10)
+   local xcopy = serializeAndDeserialize(x)
+   myTester:assert(x:norm() == xcopy:norm(), 'tensors should be the same')
+end
+
+-- Regression test for bug reported in issue 456.
+-- Writing an empty table must not error (regression test for issue 456).
+function tests.test_empty_table()
+   local file = torch.MemoryFile()
+   file:writeObject({})
+end
+
+-- Checks that a serialization failure reports the path to the offending
+-- value: saving evil_func drags in outer -> theinner -> baz -> torch, and
+-- the error message must contain that dotted path.
+function tests.test_error_msg()
+   local torch = torch
+   local inner = {
+       baz = function(a) torch.somefunc() end
+   }
+   local outer = {
+       theinner = inner
+   }
+   local function evil_func()
+      outer.prop = 1
+      image.compress(1)  -- references globals; only upvalue serialization matters here
+   end
+   local ok, msg = pcall(torch.save, 'saved.t7', evil_func)
+   myTester:assert(not ok)
+   -- The path in the message is pattern-escaped ('.', '?' are magic chars).
+   myTester:assert(msg:find('at <%?>%.outer%.theinner%.baz%.torch') ~= nil)
+end
+
+-- An unwritable field (a raw cdata) must not make writeObject error; it is
+-- skipped with only a warning printed to stdout.
+function tests.test_warning_msg()
+  local foo = {}
+  torch.class('Bar', foo)
+
+  local obj = foo.Bar()
+  local tensor = torch.Tensor()
+  obj.data = tensor:cdata() -- pick something NOT writable
+
+  local file = torch.MemoryFile('rw'):binary()
+  local ok, _ = pcall(torch.File.writeObject, file, obj)
+  -- only a warning is printed on STDOUT:
+  --   $ Warning: cannot write object field <data> of <Bar> <?>
+  myTester:assert(ok)
+  file:close()
+end
+
+-- Writing with reference-tracking disabled (referenced(false)) must work.
+function tests.test_referenced()
+   local file = torch.MemoryFile('rw'):binary()
+   file:referenced(false)
+
+   local foo = 'bar'
+   file:writeObject(foo)
+   file:close()
+end
+
+-- Closures sharing upvalues (i, j) must still share them after a
+-- round-trip: incrementing through one deserialized closure must be
+-- observable through the others. Requires debug.upvalueid (Lua 5.2+ /
+-- LuaJIT), so the test is skipped on plain Lua 5.1.
+function tests.test_shared_upvalues()
+  if debug.upvalueid then
+     local i=1
+     local j=2
+
+     local func = {}
+
+     func.increment = function()
+        i=i+1
+        j=j+2
+     end
+     func.get_i = function()
+        return i
+     end
+     func.get_j = function()
+        return j
+     end
+
+     local copyFunc = serializeAndDeserialize(func)
+     myTester:assert(copyFunc.get_i()==1)
+     myTester:assert(copyFunc.get_j()==2)
+     copyFunc.increment()
+     myTester:assert(copyFunc.get_i()==2)
+     myTester:assert(copyFunc.get_j()==4)
+  else
+     print('Not running shared upvalues test, as we are in Lua-5.1')
+  end
+end
+
+
+-- checks that the hook function works properly
+-- returns false if an error occurs
+-- Tests writeObject's hook argument: two torch classes with unique names
+-- are created; the hook replaces instances of the first class by instances
+-- of the second at write time. Reading the un-hooked stream after the
+-- first class is unregistered must fail; reading the hooked stream must
+-- yield second-class instances with identity (s1 == s2) preserved.
+function tests.test_SerializationHook()
+   -- Simple uuid implementation from [https://gist.github.com/jrus/3197011]
+   -- The only goal is to avoid collisions within the scope of tests,
+   -- so more than enough.
+   local random = math.random
+   local function uuid()
+       local template ='xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
+       return string.gsub(template, '[xy]', function (c)
+           local v = (c == 'x') and random(0, 0xf) or random(8, 0xb)
+           return string.format('%x', v)
+       end)
+   end
+   local unique1 = uuid()
+   local unique2 = uuid()
+   local class = {}
+   -- Create 2 classes
+   local spec = torch.class('class.'.. unique1, class)
+   function spec:test()
+      return false
+   end
+   local gen = torch.class('class.' .. unique2, class)
+   function gen:test()
+      return true
+   end
+   -- Hook: swap instances of class[unique1] for fresh class[unique2] ones.
+   local hook = function(object)
+      local class = class
+      local newObject = object
+      if torch.typename(object) == 'class.'..unique1 then
+         newObject = class[unique2]()
+      end
+      return newObject
+   end
+
+   -- Write to 2 files, first without hooking,
+   -- second with hooking
+   local file = torch.MemoryFile('rw')
+   file:binary()
+   local file2 = torch.MemoryFile('rw')
+   file2:binary()
+   local s = class[unique1]()
+   local object = {s1 = s, v = 'test', g = class[unique2](), s2 = s}
+   file:writeObject(object)
+   file2:writeObject(object, nil, hook)
+
+   -- unregister class[unique1] and try to reload the first serialized object
+   if debug and debug.getregistry then
+      -- NOTE(review): classTestSerializationHook1 is an undefined global
+      -- (nil), so the registry assignment indexes with a nil key and raises;
+      -- pcall fails for that reason rather than via readObject. Probably
+      -- intended to delete debug.getregistry()['class.'..unique1] — verify.
+      local ok, res = pcall(function() classTestSerializationHook1 = nil debug.getregistry()[classTestSerializationHook1] = nil file:seek(1) return file:readObject() end)
+      myTester:assert(not ok)
+   else
+      print('Not running serialization hook failure test because debug is missing.')
+   end
+
+   -- Try to reload the second serialized object
+   local ok, clone = pcall(function() file2:seek(1) return file2:readObject()  end)
+
+   -- Test that everything happened smoothly
+   myTester:assert(clone.v == 'test')
+   myTester:assert(torch.typename(clone.s1) == 'class.' .. unique2)
+   myTester:assert(clone.s1:test() and clone.s2:test())
+   -- s1 and s2 referenced the same object before writing; identity must survive.
+   myTester:assert(string.format('%x',torch.pointer(clone.s1)) == string.format('%x',torch.pointer(clone.s2)))
+end
+
+-- In-memory serialization (serializeToStorage) must produce exactly as
+-- many bytes as torch.save writes to disk for the same value.
+-- NOTE(review): leaves foo.t7 behind in the working directory (no cleanup).
+function tests.test_serializeToStorage()
+   torch.save("foo.t7", "foo")
+   local f = io.open("foo.t7", "rb")
+   local size = f:seek("end")
+   f:close()
+   myTester:eq(
+      torch.serializeToStorage("foo"):size(), size,
+      "memory and disk serializations should have the same size"
+   )
+end
+
+myTester:add(tests)
+myTester:run()
+-- Propagate failures to the shell for CI.
+if myTester.errors[1] then os.exit(1) end
diff --git a/test/timeSort.lua b/test/timeSort.lua
new file mode 100644
index 0000000..baa0b7d
--- /dev/null
+++ b/test/timeSort.lua
@@ -0,0 +1,148 @@
+-- gnuplot.figure(2)
+-- Test torch sort, show it suffers from the problems of quicksort
+-- i.e. complexity O(N^2) in worst-case of sorted list
+require 'gnuplot'
+
+local cmd = torch.CmdLine()
+cmd:option('-N', 10^7, 'Maximum array size')
+cmd:option('-p',  50, 'Number of points in logspace')
+cmd:option('-r', 20, 'Number of repetitions')
+
+local options = cmd:parse(arg or {})
+-- Benchmarks the new sort (ascending path) against the old sort (descending
+-- path) over logarithmically spaced sizes up to options.N, on random,
+-- pre-sorted and constant inputs. Experiment order is fully randomized,
+-- results are plotted with gnuplot and saved to benchmark.t7.
+function main()
+    local log10 = math.log10 or function(x) return math.log(x, 10) end
+    local pow10 = torch.linspace(1,log10(options.N), options.p)
+    local num_sizes = options.p
+    local num_reps = options.r
+
+    -- Timing matrices: rows = sizes, columns = repetitions.
+    local old_rnd = torch.zeros(num_sizes, num_reps)
+    local old_srt = torch.zeros(num_sizes, num_reps)
+    local old_cst = torch.zeros(num_sizes, num_reps)
+    local new_rnd = torch.zeros(num_sizes, num_reps)
+    local new_srt = torch.zeros(num_sizes, num_reps)
+    local new_cst = torch.zeros(num_sizes, num_reps)
+    local ratio_rnd = torch.zeros(num_sizes, num_reps)
+    local ratio_srt = torch.zeros(num_sizes, num_reps)
+    local ratio_cst = torch.zeros(num_sizes, num_reps)
+
+    -- Ascending sort uses new sort
+    local function time_sort(x)
+        collectgarbage()
+        local start = os.clock()
+        torch.sort(x,false)
+        return (os.clock()-start)
+    end
+
+    -- Descending sort uses old sort
+    local function time_old_sort(x)
+        collectgarbage()
+        local start = os.clock()
+        torch.sort(x,true)
+        return (os.clock()-start)
+    end
+
+    -- One benchmark per input pattern; (i, j, n) = size index, rep index, size.
+    local benches = {
+        function(i,j,n)
+            -- on random
+            local input = torch.rand(n)
+            new_rnd[i][j] = time_sort(input:clone())
+            old_rnd[i][j] = time_old_sort(input:clone())
+        end,
+
+        function(i,j,n)
+            -- on sorted
+            new_srt[i][j] = time_sort(torch.linspace(0,1,n))
+            old_srt[i][j] = time_old_sort(torch.linspace(0,1,n):add(-1):mul(-1)) -- old_time is called on descending sort, hence the reversed input
+        end,
+
+        function(i,j,n)
+            -- on constant
+            new_cst[i][j] = time_sort(torch.zeros(n))
+            old_cst[i][j] = time_old_sort(torch.zeros(n))
+        end
+    }
+
+    local num_benches = #benches
+    local num_exps = num_sizes * num_benches * num_reps
+
+    -- Full randomization
+    -- Build one random permutation of all (size, rep, bench) triples so
+    -- system noise is spread evenly across configurations.
+    local perm = torch.randperm(num_exps):long()
+    local perm_benches = torch.Tensor(num_exps)
+    local perm_reps = torch.Tensor(num_exps)
+    local perm_sizes = torch.Tensor(num_exps)
+
+    local l = 1
+    for i=1, num_sizes do
+        for j=1, num_reps do
+            for k=1, num_benches do
+                perm_benches[ perm[l] ] = k
+                perm_reps[ perm[l] ] = j
+                perm_sizes[ perm[l] ] = i
+                l = l+1
+            end
+        end
+    end
+
+    -- Run all experiments, printing a dotted percentage progress bar.
+    local pc = 0
+    for j = 1, num_exps do
+        local n = 10^pow10[perm_sizes[j]]
+    --    print(string.format('rep %d / %d, bench %d, size %d, rep %d\n', j, num_exps, perm_benches[j], n, perm_reps[j]))
+        if math.floor(100*j/num_exps) > pc then
+            pc = math.floor(100*j/num_exps)
+            io.write('.')
+            if pc % 10 == 0 then
+                io.write(' ' .. pc .. '%\n')
+             end
+            io.flush()
+        end
+        benches[perm_benches[j]](perm_sizes[j], perm_reps[j], n)
+    end
+
+    -- Speed-up = old time / new time, averaged over repetitions.
+    ratio_rnd = torch.cdiv(old_rnd:mean(2), new_rnd:mean(2))
+    ratio_srt = torch.cdiv(old_srt:mean(2), new_srt:mean(2))
+    ratio_cst = torch.cdiv(old_cst:mean(2), new_cst:mean(2))
+
+    local N = pow10:clone():apply(function(x) return 10^x end)
+
+    -- Figure 1: absolute times; figure 2: speed-up ratios (log-x axes).
+    gnuplot.setterm('x11')
+    gnuplot.figure(1)
+    gnuplot.raw('set log x; set mxtics 10')
+    gnuplot.raw('set grid mxtics mytics xtics ytics')
+    gnuplot.raw('set xrange [' .. N:min() .. ':' .. N:max() .. ']' )
+    gnuplot.plot({'Random - new', N, new_rnd:mean(2)},
+                 {'Sorted - new', N, new_srt:mean(2)},
+                 {'Constant - new', N, new_cst:mean(2)},
+                 {'Random - old', N, old_rnd:mean(2)},
+                 {'Sorted - old', N, old_srt:mean(2)},
+                 {'Constant - old', N, old_cst:mean(2)})
+    gnuplot.xlabel('N')
+    gnuplot.ylabel('Time (s)')
+    gnuplot.figprint('benchmarkTime.png')
+
+    gnuplot.figure(2)
+    gnuplot.raw('set log x; set mxtics 10')
+    gnuplot.raw('set grid mxtics mytics xtics ytics')
+    gnuplot.raw('set xrange [' .. N:min() .. ':' .. N:max() .. ']' )
+    gnuplot.plot({'Random', N, ratio_rnd:mean(2)},
+                 {'Sorted', N, ratio_srt:mean(2)},
+                 {'Constant', N, ratio_cst:mean(2)})
+    gnuplot.xlabel('N')
+    gnuplot.ylabel('Speed-up Factor (s)')
+    gnuplot.figprint('benchmarkRatio.png')
+
+    -- Persist raw data so plots can be regenerated without re-running.
+    torch.save('benchmark.t7', {
+               new_rnd=new_rnd,
+               new_srt=new_srt,
+               new_cst=new_cst,
+               old_rnd=old_rnd,
+               old_srt=old_srt,
+               old_cst=old_cst,
+               ratio_rnd=ratio_rnd,
+               ratio_srt=ratio_srt,
+               ratio_cst=ratio_cst,
+               pow10 = pow10,
+               num_reps = num_reps
+           })
+end
+
+main()
diff --git a/torchcwrap.lua b/torchcwrap.lua
new file mode 100644
index 0000000..ab0df43
--- /dev/null
+++ b/torchcwrap.lua
@@ -0,0 +1,462 @@
+local wrap = require 'cwrap'
+local types = wrap.types
+
-- cwrap type descriptor for the default Tensor type (THTensor, bound to the
-- C macro torch_Tensor).  Each field is a code-generation hook invoked by
-- cwrap with an argument descriptor `arg`; it returns a C code fragment
-- (or nil) emitted into the generated wrapper for that phase.
types.Tensor = {

   -- Name shown in auto-generated usage/error messages; includes the
   -- dimension constraint when one is declared.
   helpname = function(arg)
                 if arg.dim then
                    return string.format("Tensor~%dD", arg.dim)
                 else
                    return "Tensor"
                 end
            end,

   -- Declare the backing C variable; arg%d_idx additionally records the Lua
   -- stack slot of a returned tensor so precall can push it back.
   declare = function(arg)
                local txt = {}
                table.insert(txt, string.format("THTensor *arg%d = NULL;", arg.i))
                if arg.returned then
                   table.insert(txt, string.format("int arg%d_idx = 0;", arg.i));
                end
                return table.concat(txt, '\n')
           end,

   -- C boolean expression checking that stack slot `idx` holds a tensor
   -- (and has the required number of dimensions, if constrained).
   check = function(arg, idx)
              if arg.dim then
                 return string.format("(arg%d = luaT_toudata(L, %d, torch_Tensor)) && (arg%d->nDimension == %d)", arg.i, idx, arg.i, arg.dim)
              else
                 return string.format("(arg%d = luaT_toudata(L, %d, torch_Tensor))", arg.i, idx)
              end
         end,

   -- Remember where a returned tensor was passed on the stack.
   read = function(arg, idx)
             if arg.returned then
                return string.format("arg%d_idx = %d;", arg.i, idx)
             end
          end,

   -- Default-value initialization when the argument was not supplied:
   -- boolean default -> allocate a fresh tensor; numeric default -> alias
   -- another argument by its position.
   init = function(arg)
             if type(arg.default) == 'boolean' then
                return string.format('arg%d = THTensor_(new)();', arg.i)
             elseif type(arg.default) == 'number' then
                return string.format('arg%d = %s;', arg.i, arg.args[arg.default]:carg())
             else
                error('unknown default tensor type value')
             end
          end,

   -- Expression used when passing this argument to the C function.
   carg = function(arg)
             return string.format('arg%d', arg.i)
          end,

   -- Expression used when the C function returns this argument.
   creturn = function(arg)
                return string.format('arg%d', arg.i)
             end,

   -- Before the C call: arrange for the returned tensor to be on the Lua
   -- stack, either re-pushing the caller's tensor or pushing the one we
   -- created/aliased (the aliased default needs a retain).
   precall = function(arg)
                local txt = {}
                if arg.default and arg.returned then
                   table.insert(txt, string.format('if(arg%d_idx)', arg.i)) -- means it was passed as arg
                   table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
                   table.insert(txt, string.format('else'))
                   if type(arg.default) == 'boolean' then -- boolean: we did a new()
                      table.insert(txt, string.format('luaT_pushudata(L, arg%d, torch_Tensor);', arg.i))
                   else  -- otherwise: point on default tensor --> retain
                      table.insert(txt, string.format('{'))
                      table.insert(txt, string.format('THTensor_(retain)(arg%d);', arg.i)) -- so we need a retain
                      table.insert(txt, string.format('luaT_pushudata(L, arg%d, torch_Tensor);', arg.i))
                      table.insert(txt, string.format('}'))
                   end
                elseif arg.default then
                   -- we would have to deallocate the beast later if we did a new
                   -- unlikely anyways, so i do not support it for now
                   if type(arg.default) == 'boolean' then
                      error('a tensor cannot be optional if not returned')
                   end
                elseif arg.returned then
                   table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
                end
                return table.concat(txt, '\n')
             end,

   -- After the C call: push a C-returned tensor (retained so Lua owns a ref).
   postcall = function(arg)
                 local txt = {}
                 if arg.creturned then
                    -- this next line is actually debatable
                    table.insert(txt, string.format('THTensor_(retain)(arg%d);', arg.i))
                    table.insert(txt, string.format('luaT_pushudata(L, arg%d, torch_Tensor);', arg.i))
                 end
                 return table.concat(txt, '\n')
              end
}
+
-- cwrap type descriptor for a THGenerator (random number generator state).
-- When omitted by the caller, the global default generator torch._gen is
-- used instead.
types.Generator = {

   helpname = function(arg)
                 return "Generator"
              end,

   declare = function(arg)
                return string.format("THGenerator *arg%d = NULL;", arg.i)
             end,

   check = function(arg, idx)
              return string.format("(arg%d = luaT_toudata(L, %d, torch_Generator))", arg.i, idx)
           end,

   -- Nothing to do on read: the generator is never a returned argument.
   read = function(arg, idx)
          end,

   -- If no generator is supplied, pull the default out of the torch namespace.
   init = function(arg)
             local text = {}
             table.insert(text, 'lua_getglobal(L,"torch");')
             table.insert(text, string.format('arg%d = luaT_getfieldcheckudata(L, -1, "_gen", torch_Generator);', arg.i))
             table.insert(text, 'lua_pop(L, 2);')
             return table.concat(text, '\n')
          end,

   carg = function(arg)
             return string.format('arg%d', arg.i)
          end,

   creturn = function(arg)
                return string.format('arg%d', arg.i)
             end,

   precall = function(arg)
             end,

   postcall = function(arg)
              end
}
+
-- cwrap type descriptor for index tensors (torch.LongTensor).  Lua indices
-- are 1-based while TH C code is 0-based, so the wrapper subtracts 1 from
-- every element before the call (read) and adds it back afterwards
-- (postcall), unless arg.noreadadd is set.
types.IndexTensor = {

   helpname = function(arg)
               return "LongTensor"
            end,

   declare = function(arg)
                local txt = {}
                table.insert(txt, string.format("THLongTensor *arg%d = NULL;", arg.i))
                if arg.returned then
                   table.insert(txt, string.format("int arg%d_idx = 0;", arg.i));
                end
                return table.concat(txt, '\n')
           end,

   check = function(arg, idx)
              return string.format('(arg%d = luaT_toudata(L, %d, "torch.LongTensor"))', arg.i, idx)
           end,

   -- Convert incoming indices from 1-based (Lua) to 0-based (C) in place.
   read = function(arg, idx)
             local txt = {}
             if not arg.noreadadd then
                table.insert(txt, string.format("THLongTensor_add(arg%d, arg%d, -1);", arg.i, arg.i));
             end
             if arg.returned then
                table.insert(txt, string.format("arg%d_idx = %d;", arg.i, idx))
             end
             return table.concat(txt, '\n')
          end,

   init = function(arg)
             return string.format('arg%d = THLongTensor_new();', arg.i)
          end,

   carg = function(arg)
             return string.format('arg%d', arg.i)
          end,

   creturn = function(arg)
                return string.format('arg%d', arg.i)
             end,

   precall = function(arg)
                local txt = {}
                if arg.default and arg.returned then
                   table.insert(txt, string.format('if(arg%d_idx)', arg.i)) -- means it was passed as arg
                   table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
                   table.insert(txt, string.format('else')) -- means we did a new()
                   table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongTensor");', arg.i))
                elseif arg.default then
                   error('a tensor cannot be optional if not returned')
                elseif arg.returned then
                   table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
                end
                return table.concat(txt, '\n')
             end,

   -- Convert indices back to 1-based before they are seen by Lua.
   postcall = function(arg)
                 local txt = {}
                 if arg.creturned or arg.returned then
                    table.insert(txt, string.format("THLongTensor_add(arg%d, arg%d, 1);", arg.i, arg.i));
                 end
                 if arg.creturned then
                    -- this next line is actually debatable
                    table.insert(txt, string.format('THLongTensor_retain(arg%d);', arg.i))
                    table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongTensor");', arg.i))
                 end
                 return table.concat(txt, '\n')
              end
}
+
+for _,typename in ipairs({"ByteTensor", "CharTensor", "ShortTensor", "IntTensor", "LongTensor",
+                          "FloatTensor", "DoubleTensor"}) do
+
+   types[typename] = {
+
+      helpname = function(arg)
+                    if arg.dim then
+                       return string.format('%s~%dD', typename, arg.dim)
+                    else
+                       return typename
+                    end
+                 end,
+
+      declare = function(arg)
+                   local txt = {}
+                   table.insert(txt, string.format("TH%s *arg%d = NULL;", typename, arg.i))
+                   if arg.returned then
+                      table.insert(txt, string.format("int arg%d_idx = 0;", arg.i));
+                   end
+                   return table.concat(txt, '\n')
+                end,
+
+      check = function(arg, idx)
+                 if arg.dim then
+                    return string.format('(arg%d = luaT_toudata(L, %d, "torch.%s")) && (arg%d->nDimension == %d)', arg.i, idx, typename, arg.i, arg.dim)
+                 else
+                    return string.format('(arg%d = luaT_toudata(L, %d, "torch.%s"))', arg.i, idx, typename)
+                 end
+              end,
+
+      read = function(arg, idx)
+                if arg.returned then
+                   return string.format("arg%d_idx = %d;", arg.i, idx)
+                end
+             end,
+
+      init = function(arg)
+                if type(arg.default) == 'boolean' then
+                   return string.format('arg%d = TH%s_new();', arg.i, typename)
+                elseif type(arg.default) == 'number' then
+                   return string.format('arg%d = %s;', arg.i, arg.args[arg.default]:carg())
+                else
+                   error('unknown default tensor type value')
+                end
+             end,
+
+      carg = function(arg)
+                return string.format('arg%d', arg.i)
+             end,
+
+      creturn = function(arg)
+                   return string.format('arg%d', arg.i)
+             end,
+
+      precall = function(arg)
+                   local txt = {}
+                   if arg.default and arg.returned then
+                      table.insert(txt, string.format('if(arg%d_idx)', arg.i)) -- means it was passed as arg
+                      table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
+                      table.insert(txt, string.format('else'))
+                      if type(arg.default) == 'boolean' then -- boolean: we did a new()
+                         table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.%s");', arg.i, typename))
+                      else  -- otherwise: point on default tensor --> retain
+                         table.insert(txt, string.format('{'))
+                         table.insert(txt, string.format('TH%s_retain(arg%d);', typename, arg.i)) -- so we need a retain
+                         table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.%s");', arg.i, typename))
+                         table.insert(txt, string.format('}'))
+                      end
+                   elseif arg.default then
+                      -- we would have to deallocate the beast later if we did a new
+                      -- unlikely anyways, so i do not support it for now
+                      if type(arg.default) == 'boolean' then
+                         error('a tensor cannot be optional if not returned')
+                      end
+                   elseif arg.returned then
+                      table.insert(txt, string.format('lua_pushvalue(L, arg%d_idx);', arg.i))
+                   end
+                   return table.concat(txt, '\n')
+                end,
+
+      postcall = function(arg)
+                    local txt = {}
+                    if arg.creturned then
+                       -- this next line is actually debatable
+                       table.insert(txt, string.format('TH%s_retain(arg%d);', typename, arg.i))
+                       table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.%s");', arg.i, typename))
+                    end
+                    return table.concat(txt, '\n')
+                 end
+   }
+
+   types[typename .. 'Array'] = {
+
+      helpname = function(arg)
+                    return string.format('{%s+}', typename)
+               end,
+
+      declare = function(arg)
+                   local txt = {}
+                   table.insert(txt, string.format('TH%s **arg%d_data = NULL;', typename, arg.i))
+                   table.insert(txt, string.format('long arg%d_size = 0;', arg.i))
+                   table.insert(txt, string.format('int arg%d_i = 0;', arg.i))
+                   return table.concat(txt, '\n')
+              end,
+
+      check = function(arg, idx)
+                 return string.format('torch_isnonemptytable(L, %d)', idx)
+            end,
+
+      read = function(arg, idx)
+                local txt = {}
+                -- Iterate over the array to find its length, leave elements on stack.
+                table.insert(txt, string.format('do'))
+                table.insert(txt, string.format('{'))
+                table.insert(txt, string.format('  arg%d_size++;', arg.i))
+                table.insert(txt, string.format('  lua_checkstack(L, 1);'))
+                table.insert(txt, string.format('  lua_rawgeti(L, %d, arg%d_size);', idx, arg.i))
+                table.insert(txt, string.format('}'))
+                table.insert(txt, string.format('while (!lua_isnil(L, -1));'))
+                table.insert(txt, string.format('arg%d_size--;', arg.i))
+                -- Pop nil element from stack.
+                table.insert(txt, string.format('lua_pop(L, 1);'))
+                -- Allocate tensor pointers and read values from stack backwards.
+                table.insert(txt, string.format('arg%d_data = (TH%s**)THAlloc(arg%d_size * sizeof(TH%s*));', arg.i, typename, arg.i, typename))
+                table.insert(txt, string.format('for (arg%d_i = arg%d_size - 1; arg%d_i >= 0; arg%d_i--)', arg.i, arg.i, arg.i, arg.i))
+                table.insert(txt, string.format('{'))
+                table.insert(txt, string.format('  if (!(arg%d_data[arg%d_i] = luaT_toudata(L, -1, "torch.%s")))', arg.i, arg.i, typename))
+                table.insert(txt, string.format('    luaL_error(L, "expected %s in tensor array");', typename))
+                table.insert(txt, string.format('  lua_pop(L, 1);'))
+                table.insert(txt, string.format('}'))
+                table.insert(txt, string.format(''))
+                return table.concat(txt, '\n')
+             end,
+
+      init = function(arg)
+             end,
+
+      carg = function(arg)
+                return string.format('arg%d_data,arg%d_size', arg.i, arg.i)
+             end,
+
+      creturn = function(arg)
+                   error('TensorArray cannot be returned.')
+                end,
+
+      precall = function(arg)
+                end,
+
+      postcall = function(arg)
+                    return string.format('THFree(arg%d_data);', arg.i)
+                 end
+   }
+end
+
-- cwrap type descriptor for size/stride-style arguments: accepts either a
-- torch.LongStorage or a variable number of dimension arguments
-- (dim1 [dim2 ...]), collected into a THLongStorage by
-- torch_checklongargs() in utils.c.
types.LongArg = {

   -- Consumes a variable number of Lua arguments.
   vararg = true,

   helpname = function(arg)
               return "(LongStorage | dim1 [dim2...])"
            end,

   declare = function(arg)
              return string.format("THLongStorage *arg%d = NULL;", arg.i)
           end,

   init = function(arg)
             if arg.default then
                error('LongArg cannot have a default value')
             end
          end,

   check = function(arg, idx)
            return string.format("torch_islongargs(L, %d)", idx)
         end,

   -- torch_checklongargs allocates a new storage; postcall frees it unless
   -- ownership was transferred to Lua or to the C return value.
   read = function(arg, idx)
             return string.format("arg%d = torch_checklongargs(L, %d);", arg.i, idx)
          end,

   carg = function(arg, idx)
             return string.format('arg%d', arg.i)
          end,

   creturn = function(arg, idx)
                return string.format('arg%d', arg.i)
             end,

   precall = function(arg)
                local txt = {}
                if arg.returned then
                   table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongStorage");', arg.i))
                end
                return table.concat(txt, '\n')
             end,

   postcall = function(arg)
                 local txt = {}
                 if arg.creturned then
                    -- this next line is actually debatable
                    table.insert(txt, string.format('THLongStorage_retain(arg%d);', arg.i))
                    table.insert(txt, string.format('luaT_pushudata(L, arg%d, "torch.LongStorage");', arg.i))
                 end
                 if not arg.returned and not arg.creturned then
                    table.insert(txt, string.format('THLongStorage_free(arg%d);', arg.i))
                 end
                 return table.concat(txt, '\n')
              end
}
+
-- cwrap type descriptor for single-character string options, e.g. the
-- 'V'/'N'/'A' mode flags of LAPACK-style functions.  Only the FIRST
-- character of the Lua string is compared/passed to C.
-- NOTE(review): check() indexes arg.values without a nil guard, and
-- declare() embeds arg.default into a char literal -- both presume a
-- declared values list and a one-character default; confirm at call sites.
types.charoption = {

   helpname = function(arg)
                 if arg.values then
                    return "(" .. table.concat(arg.values, '|') .. ")"
                 end
              end,

   declare = function(arg)
                local txt = {}
                table.insert(txt, string.format("const char *arg%d = NULL;", arg.i))
                if arg.default then
                   table.insert(txt, string.format("char arg%d_default = '%s';", arg.i, arg.default))
                end
                return table.concat(txt, '\n')
           end,

   init = function(arg)
             return string.format("arg%d = &arg%d_default;", arg.i, arg.i)
          end,

   -- Generates: (arg = lua_tostring(...)) && (*arg == 'a' || *arg == 'b' ...)
   check = function(arg, idx)
              local txt = {}
              local txtv = {}
              table.insert(txt, string.format('(arg%d = lua_tostring(L, %d)) && (', arg.i, idx))
              for _,value in ipairs(arg.values) do
                 table.insert(txtv, string.format("*arg%d == '%s'", arg.i, value))
              end
              table.insert(txt, table.concat(txtv, ' || '))
              table.insert(txt, ')')
              return table.concat(txt, '')
         end,

   read = function(arg, idx)
          end,

   carg = function(arg, idx)
             return string.format('arg%d', arg.i)
          end,

   creturn = function(arg, idx)
             end,

   precall = function(arg)
             end,

   postcall = function(arg)
              end
}
diff --git a/utils.c b/utils.c
new file mode 100644
index 0000000..35fdae4
--- /dev/null
+++ b/utils.c
@@ -0,0 +1,260 @@
+#include "general.h"
+#include "utils.h"
+
+#ifdef WIN32
+# include <time.h>
+#else
+# include <sys/time.h>
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
/* Collect Lua arguments starting at stack slot `index` into a new
 * THLongStorage.  Accepts either a single torch.LongStorage (returned as a
 * copy) or a run of numbers.  The caller owns the returned storage and must
 * free it.  On a non-number argument, the partially built storage is freed
 * before luaL_argerror longjmps out, so nothing leaks. */
THLongStorage* torch_checklongargs(lua_State *L, int index)
{
  THLongStorage *storage;
  int i;
  /* number of values from `index` up to the top of the stack */
  int narg = lua_gettop(L)-index+1;

  if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage"))
  {
    /* single LongStorage argument: hand back a private copy */
    THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage");
    storage = THLongStorage_newWithSize(storagesrc->size);
    THLongStorage_copy(storage, storagesrc);
  }
  else
  {
    storage = THLongStorage_newWithSize(narg);
    for(i = index; i < index+narg; i++)
    {
      if(!lua_isnumber(L, i))
      {
        THLongStorage_free(storage);
        luaL_argerror(L, i, "number expected"); /* longjmps, never returns */
      }
      THLongStorage_set(storage, i-index, lua_tonumber(L, i));
    }
  }
  return storage;
}
+
+int torch_islongargs(lua_State *L, int index)
+{
+  int narg = lua_gettop(L)-index+1;
+
+  if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage"))
+  {
+    return 1;
+  }
+  else
+  {
+    int i;
+
+    for(i = index; i < index+narg; i++)
+    {
+      if(!lua_isnumber(L, i))
+        return 0;
+    }
+    return 1;
+  }
+  return 0;
+}
+
/* torch.isatty(file): push true if the io library file handle is attached
 * to a terminal; always false on Windows. */
static int torch_isatty(lua_State *L)
{
#ifdef _WIN32
  lua_pushboolean(L, 0);
#else
  /* NOTE(review): reads the file handle from the TOP of the stack (-1)
     rather than argument slot 1, so it relies on the file being the last
     argument -- confirm against callers. */
  FILE **fp = (FILE **) luaL_checkudata(L, -1, LUA_FILEHANDLE);
  lua_pushboolean(L, isatty(fileno(*fp)));
#endif
  return 1;
}
+
/* Wall-clock time in seconds: one-second resolution on Windows (time()),
 * microsecond resolution elsewhere (gettimeofday()). */
static double real_time()
{
#ifdef _WIN32
  time_t now;
  time(&now);
  return (double)now;
#else
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
#endif
}
+
+static int torch_lua_tic(lua_State* L)
+{
+  double ttime = real_time();
+  lua_pushnumber(L,ttime);
+  return 1;
+}
+
+static int torch_lua_toc(lua_State* L)
+{
+  double toctime = real_time();
+  lua_Number tictime = luaL_checknumber(L,1);
+  lua_pushnumber(L,toctime-tictime);
+  return 1;
+}
+
/* torch.getdefaulttensortype(): push the default tensor type name
 * (e.g. "torch.DoubleTensor"); returns no values if it cannot be found. */
static int torch_lua_getdefaulttensortype(lua_State *L)
{
  const char* tname = torch_getdefaulttensortype(L);
  if(tname)
  {
    lua_pushstring(L, tname);
    return 1;
  }
  return 0;
}
+
+const char* torch_getdefaulttensortype(lua_State *L)
+{
+  lua_getglobal(L, "torch");
+  if(lua_istable(L, -1))
+  {
+    lua_getfield(L, -1, "Tensor");
+    if(lua_istable(L, -1))
+    {
+      if(lua_getmetatable(L, -1))
+      {
+        lua_pushstring(L, "__index");
+        lua_rawget(L, -2);
+        if(lua_istable(L, -1))
+        {
+          lua_rawget(L, LUA_REGISTRYINDEX);
+          if(lua_isstring(L, -1))
+          {
+            const char *tname = lua_tostring(L, -1);
+            lua_pop(L, 4);
+            return tname;
+          }
+        }
+        else
+        {
+          lua_pop(L, 4);
+          return NULL;
+        }
+      }
+      else
+      {
+        lua_pop(L, 2);
+        return NULL;
+      }
+    }
+    else
+    {
+      lua_pop(L, 2);
+      return NULL;
+    }
+  }
+  else
+  {
+    lua_pop(L, 1);
+    return NULL;
+  }
+  return NULL;
+}
+
/* torch.getnumthreads(): maximum number of OpenMP threads, or 1 when
 * compiled without OpenMP. */
static int torch_getnumthreads(lua_State *L)
{
#ifdef _OPENMP
  lua_pushinteger(L, omp_get_max_threads());
#else
  lua_pushinteger(L, 1);
#endif
  return 1;
}
+
+static int torch_setnumthreads(lua_State *L)
+{
+#ifdef _OPENMP
+  int nth = luaL_checkint(L,1);
+  omp_set_num_threads(nth);
+#endif
+  return 0;
+}
+
/* torch.getnumcores(): number of processors reported by OpenMP, or 1 when
 * compiled without OpenMP. */
static int torch_getnumcores(lua_State *L)
{
#ifdef _OPENMP
  lua_pushinteger(L, omp_get_num_procs());
#else
  lua_pushinteger(L, 1);
#endif
  return 1;
}
+
/* TH garbage-collection hook: run a full Lua GC cycle so that unreferenced
 * tensors release their TH-allocated memory.  `data` is the lua_State
 * registered in torch_setheaptracking(). */
static void luaTorchGCFunction(void *data)
{
  lua_State *L = data;
  lua_gc(L, LUA_GCCOLLECT, 0);
}
+
/* torch.setheaptracking(enabled): when enabled, install a TH GC handler so
 * large TH allocations can trigger a Lua collection; mirrors the flag into
 * torch._heaptracking for Lua-side introspection. */
static int torch_setheaptracking(lua_State *L)
{
  int enabled = luaT_checkboolean(L,1);
  lua_getglobal(L, "torch");
  lua_pushboolean(L, enabled);
  lua_setfield(L, -2, "_heaptracking");
  if(enabled) {
    THSetGCHandler(luaTorchGCFunction, L);
  } else {
    THSetGCHandler(NULL, NULL);
  }
  return 0;
}
+
+static void luaTorchErrorHandlerFunction(const char *msg, void *data)
+{
+  lua_State *L = data;
+  luaL_error(L, msg);
+}
+
/* TH argument-error handler: surface TH argument errors as Lua argument
 * errors (longjmps out).  luaL_argcheck's message parameter is NOT a format
 * string, so passing msg directly is safe here. */
static void luaTorchArgErrorHandlerFunction(int argNumber, const char *msg, void *data)
{
  lua_State *L = data;
  luaL_argcheck(L, 0, argNumber, msg);
}
+
/* torch.updateerrorhandlers(): (re)bind the TH error/argument-error
 * handlers to this lua_State -- needed per-thread since the handlers close
 * over L. */
static int torch_updateerrorhandlers(lua_State *L)
{
  THSetErrorHandler(luaTorchErrorHandlerFunction, L);
  THSetArgErrorHandler(luaTorchArgErrorHandlerFunction, L);
  return 0;
}
+
/* Registration table for the torch utility functions; mixes the local
 * helpers above with luaT_* functions re-exported into the torch table. */
static const struct luaL_Reg torch_utils__ [] = {
  {"getdefaulttensortype", torch_lua_getdefaulttensortype},
  {"isatty", torch_isatty},
  {"tic", torch_lua_tic},
  {"toc", torch_lua_toc},
  {"setnumthreads", torch_setnumthreads},
  {"getnumthreads", torch_getnumthreads},
  {"getnumcores", torch_getnumcores},
  {"factory", luaT_lua_factory},
  {"getconstructortable", luaT_lua_getconstructortable},
  {"typename", luaT_lua_typename},
  {"isequal", luaT_lua_isequal},
  {"getenv", luaT_lua_getenv},
  {"setenv", luaT_lua_setenv},
  {"newmetatable", luaT_lua_newmetatable},
  {"setmetatable", luaT_lua_setmetatable},
  {"getmetatable", luaT_lua_getmetatable},
  {"metatype", luaT_lua_metatype},
  {"pushudata", luaT_lua_pushudata},
  {"version", luaT_lua_version},
  {"pointer", luaT_lua_pointer},
  {"setheaptracking", torch_setheaptracking},
  {"updateerrorhandlers", torch_updateerrorhandlers},
  {NULL, NULL}  /* sentinel */
};
+
/* Module init: hook TH errors into Lua, then register the utility
 * functions into the table currently on top of the stack. */
void torch_utils_init(lua_State *L)
{
  torch_updateerrorhandlers(L);
  luaT_setfuncs(L, torch_utils__, 0);
}
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..f9654d4
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,36 @@
/* Public utility API of libtorch: LongStorage vararg helpers and default
 * tensor type lookup, plus the TORCH_API export/import machinery. */
#ifndef TORCH_UTILS_INC
#define TORCH_UTILS_INC

#include "luaT.h"
#include "TH.h"

#include <lua.h>
#include <lualib.h>

#ifdef _WIN32
#else
#include <unistd.h>
#endif

/* C linkage when this header is consumed from C++. */
#ifdef __cplusplus
# define TORCH_EXTERNC extern "C"
#else
# define TORCH_EXTERNC extern
#endif

/* On Windows, export symbols while building the torch DLL
   (torch_EXPORTS defined by CMake) and import them otherwise. */
#ifdef _WIN32
# ifdef torch_EXPORTS
#  define TORCH_API TORCH_EXTERNC __declspec(dllexport)
# else
#  define TORCH_API TORCH_EXTERNC __declspec(dllimport)
# endif
#else
# define TORCH_API TORCH_EXTERNC
#endif


/* Collect args from `index` into a new THLongStorage (caller frees). */
TORCH_API THLongStorage* torch_checklongargs(lua_State *L, int index);
/* Non-destructive predicate for the above. */
TORCH_API int torch_islongargs(lua_State *L, int index);
/* Registry name of the default Tensor type, or NULL. */
TORCH_API const char* torch_getdefaulttensortype(lua_State *L);

#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-torch7.git



More information about the debian-science-commits mailing list