[lua-torch-nn] 01/06: New upstream version 0~20170726-gf613412+dfsg

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sun Jul 30 06:09:59 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 801c4f9604ed77f480263443046d9da4eba3c458
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Wed Jul 26 15:49:48 2017 +0000

    New upstream version 0~20170726-gf613412+dfsg
---
 Bilinear.lua                                     |   2 +-
 CAddTensorTable.lua                              |  43 ++
 Collapse.lua                                     |  30 +
 ConcatTable.lua                                  |   4 +-
 Constant.lua                                     |  36 ++
 Container.lua                                    |   4 +-
 Convert.lua                                      | 245 ++++++++
 DontCast.lua                                     |   2 +-
 FeatureLPPooling.lua                             |  74 +++
 FlattenTable.lua                                 |  16 +-
 Identity.lua                                     |   2 +-
 IndexLinear.lua                                  |   4 +-
 Kmeans.lua                                       | 215 +++++++
 ModuleCriterion.lua                              |  44 ++
 OneHot.lua                                       |  69 +++
 PrintSize.lua                                    |  36 ++
 SparseLinear.lua                                 |  12 +-
 SpatialFullConvolution.lua                       |  10 +-
 Sum.lua                                          |   2 +-
 UpSampling.lua                                   | 216 +++++++
 VolumetricFullConvolution.lua                    |  10 +-
 WhiteNoise.lua                                   |  40 ++
 ZeroGrad.lua                                     |  14 +
 ZipTable.lua                                     |  34 ++
 ZipTableOneToMany.lua                            |  37 ++
 doc/convolution.md                               |  35 ++
 doc/criterion.md                                 |  17 +-
 doc/simple.md                                    | 264 ++++++++-
 doc/table.md                                     |  56 +-
 hessian.lua                                      |   4 +-
 init.lua                                         |  15 +
 lib/THNN/doc/api_reference.md                    | 123 ++++
 lib/THNN/generic/BCECriterion.c                  |   6 +
 lib/THNN/generic/FeatureLPPooling.c              | 348 +++++++++++
 lib/THNN/generic/FusedRNNKernel.c                |  16 +-
 lib/THNN/generic/GatedLinearUnit.c               |  10 +-
 lib/THNN/generic/SpatialDepthWiseConvolution.c   |  47 +-
 lib/THNN/generic/SpatialGridSamplerBilinear.c    | 204 +++++++
 lib/THNN/generic/THNN.h                          |  81 ++-
 lib/THNN/generic/VolumetricUpSamplingNearest.c   | 226 +++++++
 lib/THNN/generic/VolumetricUpSamplingTrilinear.c | 213 +++++++
 lib/THNN/init.c                                  |  12 +
 test.lua                                         | 715 ++++++++++++++++++++++-
 utils.lua                                        |  21 +-
 44 files changed, 3526 insertions(+), 88 deletions(-)

diff --git a/Bilinear.lua b/Bilinear.lua
index 9350b03..3c0f6db 100644
--- a/Bilinear.lua
+++ b/Bilinear.lua
@@ -2,7 +2,7 @@ local Bilinear, parent = torch.class('nn.Bilinear', 'nn.Module')
 
 local function isint(x) return type(x) == 'number' and x == math.floor(x) end
 function Bilinear:__assertInput(input)
-   assert(input and type(input) == 'table' and #input == 2,
+   assert(input and torch.type(input) == 'table' and #input == 2,
       'input should be a table containing two data Tensors')
    assert(input[1]:nDimension() == 2 and input[2]:nDimension() == 2,
       'input Tensors should be two-dimensional')
diff --git a/CAddTensorTable.lua b/CAddTensorTable.lua
new file mode 100644
index 0000000..16efe44
--- /dev/null
+++ b/CAddTensorTable.lua
@@ -0,0 +1,43 @@
+
+local CAddTensorTable, parent = torch.class('nn.CAddTensorTable', 'nn.Module')
+
+function CAddTensorTable:__init()
+   parent.__init(self)
+   self.gradInput = {}
+end
+
+-- input is a table with 2 entries. input[1] is the vector to be added.
+-- input[2] is the table to which we add the vector
+function CAddTensorTable:updateOutput(input)
+  local currentOutput = {}
+  for i=1,#input[2] do
+    currentOutput[i] = currentOutput[i] or input[1].new()
+    currentOutput[i]:resizeAs(input[1])
+    currentOutput[i]:copy(input[2][i])
+    currentOutput[i]:add(input[1])
+  end
+  for i = #input[2]+1, #currentOutput do
+    currentOutput[i] = nil
+  end
+  self.output = currentOutput
+  return self.output
+end
+
+function CAddTensorTable:updateGradInput(input, gradOutput)
+  self.gradInput[1] = self.gradInput[1] or input[1].new()
+  self.gradInput[1]:resizeAs(input[1])
+  self.gradInput[1]:copy(gradOutput[1])
+  for i=2, #input[2] do
+    self.gradInput[1]:add(gradOutput[i])
+  end
+  self.gradInput[2] = self.gradInput[2] or {}
+  for i=1,#input[2] do
+    self.gradInput[2][i] = self.gradInput[2][i] or input[1].new()
+    self.gradInput[2][i]:resizeAs(input[1])
+    self.gradInput[2][i]:copy(gradOutput[i])
+  end
+  for i=#input[2]+1, #self.gradInput[2] do
+     self.gradInput[2][i] = nil
+  end
+  return self.gradInput
+end
\ No newline at end of file
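
For illustration, a minimal usage sketch of the new nn.CAddTensorTable module (not part of the commit; sizes and values are hypothetical):

```lua
require 'nn'
-- input[1] is the vector to add, input[2] is a table of tensors
local m = nn.CAddTensorTable()
local bias = torch.Tensor{1, 2, 3}
local out = m:forward{bias, {torch.zeros(3), torch.ones(3)}}
-- out[1] = {1, 2, 3}; out[2] = {2, 3, 4}
```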
diff --git a/Collapse.lua b/Collapse.lua
new file mode 100644
index 0000000..a088608
--- /dev/null
+++ b/Collapse.lua
@@ -0,0 +1,30 @@
+local Collapse, parent = torch.class('nn.Collapse', 'nn.Module')
+
+-- collapses non-batch dims
+function Collapse:__init(nInputDim)
+   parent.__init(self)
+   self.nInputDim = nInputDim
+end
+
+function Collapse:updateOutput(input)
+   if not input:isContiguous() then
+      self._input = self._input or input.new()
+      self._input:resize(input:size()):copy(input)
+      input = self._input
+   end
+   if input:dim() > self.nInputDim then
+      self.output:view(input,input:size(1),-1)
+   else
+      self.output:view(input,-1)
+   end
+   return self.output
+end
+
+function Collapse:updateGradInput(input, gradOutput)
+   self.gradInput:view(gradOutput, input:size())
+   return self.gradInput
+end
+
+function Collapse:clearState()
+   self._input = nil
+end
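
A hypothetical sketch of what Collapse does, flattening every non-batch dimension (example sizes are made up):

```lua
require 'nn'
local m = nn.Collapse(3)           -- each sample has 3 non-batch dimensions
local x = torch.randn(8, 4, 5, 5)  -- batch of 8 samples, each 4x5x5
local y = m:forward(x)             -- y is a view of size 8x100
```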
diff --git a/ConcatTable.lua b/ConcatTable.lua
index b1d904f..7427193 100644
--- a/ConcatTable.lua
+++ b/ConcatTable.lua
@@ -44,7 +44,7 @@ local function backward(self, method, input, gradOutput, scale)
             retable(self.gradInput, currentGradInput,
                function(t, k, v)
                   t[k] = t[k] or v:clone()
-                  t[k]:resizeAs(v)
+                  t[k]:resize(v:size())
                   t[k]:copy(v)
                end
             )
@@ -65,7 +65,7 @@ local function backward(self, method, input, gradOutput, scale)
       for i,module in ipairs(self.modules) do
          local currentGradInput = self:rethrowErrors(module, i, method, input, gradOutput[i], scale)
          if i == 1 then
-            self.gradInput:resizeAs(currentGradInput):copy(currentGradInput)
+            self.gradInput:resize(currentGradInput:size()):copy(currentGradInput)
          else
             self.gradInput:add(currentGradInput)
          end
diff --git a/Constant.lua b/Constant.lua
new file mode 100644
index 0000000..07773fe
--- /dev/null
+++ b/Constant.lua
@@ -0,0 +1,36 @@
+------------------------------------------------------------------------
+--[[ Constant ]]--
+-- Outputs a constant value given an input.
+-- If nInputDim is specified, uses the input to determine the size of
+-- the batch. The value is then replicated over the batch.
+-- You can use this with nn.ConcatTable() to append constant inputs to
+-- an input : nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity()) .
+------------------------------------------------------------------------
+local Constant, parent = torch.class("nn.Constant", "nn.Module")
+
+function Constant:__init(value, nInputDim)
+   self.value = value
+   if torch.type(self.value) == 'number' then
+      self.value = torch.Tensor{self.value}
+   end
+   assert(torch.isTensor(self.value), "Expecting number or tensor at arg 1")
+   self.nInputDim = nInputDim
+   parent.__init(self)
+end
+
+function Constant:updateOutput(input)
+   if self.nInputDim and input:dim() > self.nInputDim then
+      local vsize = self.value:size():totable()
+      self.output:resize(input:size(1), table.unpack(vsize))
+      local value = self.value:view(1, table.unpack(vsize))
+      self.output:copy(value:expand(self.output:size()))
+   else
+      self.output:resize(self.value:size()):copy(self.value)
+   end
+   return self.output
+end
+
+function Constant:updateGradInput(input, gradOutput)
+   self.gradInput:resizeAs(input):zero()
+   return self.gradInput
+end
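
A hypothetical usage sketch of nn.Constant, replicating a fixed tensor across the batch when nInputDim is given (values chosen for the example):

```lua
require 'nn'
local const = nn.Constant(torch.Tensor{1, 2}, 1)  -- each sample maps to the vector {1, 2}
local x = torch.randn(4, 10)                      -- batch of 4 samples
local y = const:forward(x)                        -- size 4x2, every row equals {1, 2}
```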
diff --git a/Container.lua b/Container.lua
index 7e264ba..67fac9f 100644
--- a/Container.lua
+++ b/Container.lua
@@ -105,7 +105,7 @@ end
 
 function Container:parameters()
     local function tinsert(to, from)
-        if type(from) == 'table' then
+        if torch.type(from) == 'table' then
             for i=1,#from do
                 tinsert(to,from[i])
             end
@@ -131,7 +131,7 @@ function Container:clearState()
       if self[f] then
          if torch.isTensor(self[f]) then
             self[f] = self[f].new()
-         elseif type(self[f]) == 'table' then
+         elseif torch.type(self[f]) == 'table' then
             self[f] = {}
          else
             self[f] = nil
diff --git a/Convert.lua b/Convert.lua
new file mode 100644
index 0000000..855338d
--- /dev/null
+++ b/Convert.lua
@@ -0,0 +1,245 @@
+------------------------------------------------------------------------
+--[ nn.Convert ]--
+-- Module to convert between different data formats
+-- nn.Convert('bchw', 'bf') or nn.Convert('chw', 'f')
+-- Automatically converts input to same type as self.output
+-- Simplest use is for automatic input type conversions : nn.Convert()
+------------------------------------------------------------------------
+local _ = require 'moses'
+local Convert, parent = torch.class("nn.Convert", "nn.Container")
+
+function Convert:__init(inputShape, outputShape)
+   if outputShape and not inputShape then
+      error"Expecting non-nil arg 1 when arg 2 is provided"
+   end
+   inputShape = inputShape or 'b*'
+   outputShape = outputShape or inputShape
+   self.inputShape = inputShape:find('b') and inputShape or ('b'..inputShape)
+   self.outputShape = outputShape:find('b') and outputShape or ('b'..outputShape)
+   self.inputBatchDim = self.inputShape:find('b')
+   self.outputBatchDim = self.outputShape:find('b')
+   if self.inputShape == 'b*' or self.outputShape == 'b*' then
+      assert(self.inputShape == 'b*' and self.outputShape == 'b*', 'Both or neither shapes must be b*')
+      self.nInputDim = -1
+      self.nOutputDim = -1
+      self.transposition = true
+   else
+      -- number of dims in batch mode
+      self.nInputDim = #self.inputShape
+      self.nOutputDim = #self.outputShape
+      -- is the outputShape just a transposition of the inputShape?
+      if self.nInputDim == self.nOutputDim then
+         self.transposition = true
+         for i=1,self.nInputDim do
+            if not self.outputShape:find(self.inputShape:sub(i,i)) then
+               self.transposition = false
+               break
+            end
+         end
+      end
+   end
+   parent.__init(self)
+end
+
+-- post-initialization
+function Convert:buildConverter(input)
+   if self.transposition then
+      self.converter = self:transpose(self.outputShape)
+   else
+      if (torch.type(self[self.outputShape]) ~= 'function') then
+         error(string.format("Unrecognized conversion of shape %s to %s", self.inputShape, self.outputShape))
+      end
+      self.converter = self[self.outputShape](self, input)
+   end
+   assert(torch.isTensor(self.output), "Expecting Tensor output")
+
+   self.converter:type(torch.type(self.output))
+
+   self.modules[1] = self.converter
+end
+
+function Convert:updateOutput(input)
+   assert(torch.isTensor(input), "expecting Tensor")
+   if not torch.isTypeOf(input, torch.type(self.output)) then
+      -- handle different input type
+      self._input = self._input or self.output.new()
+      self._input:resize(input:size()):copy(input)
+      input = self._input
+   end
+   self.batchMode = true
+   if input:dim() < self.nInputDim then
+      -- handle non-batch mode
+      local inputSize = input:size():totable()
+      table.insert(inputSize, self.inputBatchDim, 1)
+      self.__input = self.__input or input.new()
+      self.__input:set(input):resize(table.unpack(inputSize))
+      input = self.__input
+      self.batchMode = false
+   end
+   if not self.converter then
+      self:buildConverter(input)
+   end
+
+   self.output = self.converter:updateOutput(input)
+
+   if not self.batchMode then
+      local outputSize = self.output:size():totable()
+      table.remove(outputSize, self.outputBatchDim)
+      self.__output = self.__output or self.output.new()
+      self.__output:set(self.output):resize(table.unpack(outputSize))
+      self.output = self.__output
+   end
+   return self.output
+end
+
+function Convert:updateGradInput(input, gradOutput)
+   local input_ = input
+   input = self._input or input
+   if not self.batchMode then
+      input = self.__input
+      self.__gradOutput = self.__gradOutput or gradOutput.new()
+      self.__gradOutput:set(gradOutput):resize(self.converter.output:size())
+      gradOutput = self.__gradOutput
+   end
+
+   local gradInput = self.converter:updateGradInput(input, gradOutput)
+
+   if not self.batchMode then
+      self.__gradInput = self.__gradInput or gradInput.new()
+      self.__gradInput:set(gradInput):resize(input_:size())
+      gradInput = self.__gradInput
+   end
+   if self._input then
+      self._gradInput = self._gradInput or input.new()
+      self._gradInput:resize(input:size()):copy(gradInput)
+      self.gradInput = self._gradInput
+   else
+      self.gradInput = gradInput
+   end
+
+   return self.gradInput
+end
+
+function Convert:accGradParameters(input, gradOutput, scale)
+   input = self.batchMode and self.__input or self._input or input
+   gradOutput = self.batchMode and self.__gradOutput or gradOutput
+   self.converter:accGradParameters(input, gradOutput, scale)
+end
+
+function Convert:accUpdateGradParameters(input, gradOutput, lr)
+   input = self.batchMode and self.__input or self._input or input
+   gradOutput = self.batchMode and self.__gradOutput or gradOutput
+   self.converter:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- batch feature
+function Convert:bf(input)
+   local b_pos = self:findAxis('b', self.inputShape)
+   local dim = #self.inputShape
+   if self.inputShape == 'bt' then
+      error"Conversion of shape bt to bf not supported: open an issue on github"
+   end
+   -- was b
+   if dim == 1 then
+      return nn.Reshape(1)
+   end
+   -- was b...
+   local modula
+   if b_pos ~= 1 then
+      modula = nn.Transpose({1, b_pos})
+   end
+   if dim > 2 then
+      local transpose = modula
+      local sampleSize = input:select(self:findAxis('b'),1):nElement()
+      local reshape = nn.Reshape(sampleSize)
+      if transpose then
+         modula = nn.Sequential()
+         modula:add(transpose)
+         modula:add(reshape)
+      else
+         modula = reshape
+      end
+   end
+   return modula or nn.Identity()
+end
+
+-- each example is a scalar; batch is a vector
+function Convert:b(input)
+   local b_pos = self:findAxis('b')
+   if self.inputShape == 'bt' or self.inputShape == 'tb' then
+      local t_pos = self:findAxis('t')
+      -- select first set of classes
+      return nn.Select(t_pos, 1)
+   elseif self.inputShape == 'bf' or self.inputShape == 'fb' then
+      -- this won't work as expected with size(f) > 1
+      local f_pos = self:findAxis('f')
+      if input:size(f_pos) > 1 then
+         error("Cannot convert shape "..self.inputShape.." to b when feature > 1")
+      end
+      return nn.Select(f_pos, 1)
+   else
+      error("Cannot convert shape "..self.inputShape.." to shape b")
+   end
+end
+
+-- returns the current shape of the data
+function Convert:default()
+   return nn.Identity()
+end
+
+-- multi-class (batch target)
+function Convert:bt()
+   local b_pos = self:findAxis('b')
+   local modula
+   if self.inputShape == 'b' then
+      modula = nn.Reshape(1)
+   else
+      error("cannot convert shape '"..self.inputShape.."' to bt")
+   end
+   return modula
+end
+
+-- a generic function for transposing shape axes
+function Convert:transpose(newShape)
+   if newShape == self.inputShape then
+      return nn.Identity()
+   end
+   local inputShape = {}
+   for i=1,#self.inputShape do
+      table.insert(inputShape, self.inputShape:sub(i,i))
+   end
+   local transpositions = {}
+   for i=1,#newShape do
+      local j = _.indexOf(inputShape, newShape:sub(i,i))
+      if i ~= j then
+         local char = inputShape[i]
+         inputShape[i] = inputShape[j]
+         inputShape[j] = char
+         table.insert(transpositions, {j, i})
+      end
+   end
+   return nn.Transpose(table.unpack(transpositions))
+end
+
+function Convert:findAxis(axis_char, shape, silent)
+   shape = shape or self.inputShape
+   local axis_pos = shape:find(axis_char)
+   if (not silent) and (not axis_pos) then
+      error("Provided shape '"..shape.."' has no axis '"..axis_char.."'", 2)
+   end
+   return axis_pos
+end
+
+function Convert:clearState()
+   self._input = nil
+   self._gradInput = nil
+   self.__input = nil
+   self.__output = nil
+   self.__gradInput = nil
+   self.__gradOutput =  nil
+end
+
+function Convert:type(type)
+   self:clearState()
+   return parent.type(self, type)
+end
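
A hypothetical sketch of the two common uses of nn.Convert, shape conversion and automatic type casting (it depends on the 'moses' rock required above; sizes are made up):

```lua
require 'nn'
-- flatten a batch of images from b x c x h x w to b x f
local flatten = nn.Convert('bchw', 'bf')
local y = flatten:forward(torch.randn(2, 3, 4, 4))  -- size 2 x 48
-- with no arguments the module only casts inputs to its own tensor type
local cast = nn.Convert()
```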
diff --git a/DontCast.lua b/DontCast.lua
index b89f543..eaa39b6 100644
--- a/DontCast.lua
+++ b/DontCast.lua
@@ -19,7 +19,7 @@ local function recursiveTypeCopy(dst, src, type_str)
 end
 
 local function tableTensorType(src)
-   if type(src) == 'table' then
+   if type(src) == 'table' then -- Note: don't use torch.type here
       local type_str, found
       for k,v in pairs(src) do
          type_str, found = tableTensorType(v)
diff --git a/FeatureLPPooling.lua b/FeatureLPPooling.lua
new file mode 100644
index 0000000..5de4656
--- /dev/null
+++ b/FeatureLPPooling.lua
@@ -0,0 +1,74 @@
+
+local FeatureLPPooling, parent =
+   torch.class('nn.FeatureLPPooling', 'nn.Module')
+
+--[[
+   Possible inputs that we handle:
+
+   #### `batch_mode = false`
+   The dimensionality of the input chooses between the following modes:
+
+   ```
+   [feature dim]
+   [feature dim][opt dim 1]
+   [feature dim][opt dim 1][opt dim 2]
+   ```
+
+   #### `batch_mode = true`
+   The dimensionality of the input chooses between the following modes:
+   ```
+   [batch dim][feature dim]
+   [batch dim][feature dim][opt dim 1]
+   [batch dim][feature dim][opt dim 1][opt dim 2]
+   ```
+
+   The output has the same number of dimensions as the input, except the feature
+   dimension size is reduced to ((`input` - `width`) / `stride`) + 1
+]]
+function FeatureLPPooling:__init(width, stride, power, batch_mode)
+   parent.__init(self)
+
+   if (width < 2 or width > 16) then
+      error('width must be within 2 to 16')
+   end
+
+   if (stride < 1 or stride > 4) then
+      error('stride must be within 1 to 4')
+   end
+
+   self.width = width
+   self.stride = stride
+   self.power = power
+   self.batch_mode = batch_mode
+
+   self.output = torch.Tensor()
+   self.gradInput = torch.Tensor()
+end
+
+function FeatureLPPooling:updateOutput(input)
+   input.THNN.FeatureLPPooling_updateOutput(input:cdata(),
+                                            self.output:cdata(),
+                                            self.power,
+                                            self.width,
+                                            self.stride,
+                                            self.batch_mode)
+   return self.output
+end
+
+function FeatureLPPooling:updateGradInput(input, gradOutput)
+   input.THNN.FeatureLPPooling_updateGradInput(gradOutput:cdata(),
+                                               input:cdata(),
+                                               self.output:cdata(),
+                                               self.gradInput:cdata(),
+                                               self.power,
+                                               self.width,
+                                               self.stride,
+                                               self.batch_mode)
+   return self.gradInput
+end
+
+function FeatureLPPooling:__tostring__()
+   return string.format('%s(w%d s%d power %d batch %s)',
+                        torch.type(self),
+                        self.width, self.stride, self.power, tostring(self.batch_mode))
+end
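
A hypothetical batch-mode example; the output feature dimension follows the ((input - width) / stride) + 1 formula from the comment above:

```lua
require 'nn'
-- L2 pooling over the feature dimension: width 2, stride 2, power 2, batch mode
local m = nn.FeatureLPPooling(2, 2, 2, true)
local x = torch.randn(8, 16)   -- batch of 8 samples with 16 features each
local y = m:forward(x)         -- size 8 x ((16 - 2) / 2 + 1) = 8 x 8
```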
diff --git a/FlattenTable.lua b/FlattenTable.lua
index 1c18255..3fe2fd5 100644
--- a/FlattenTable.lua
+++ b/FlattenTable.lua
@@ -12,7 +12,7 @@ end
 local function flatten(output, input)
   local input_map  -- has the same structure as input, but stores the
                    -- indices to the corresponding output
-  if type(input) == 'table' then
+  if torch.type(input) == 'table' then
     input_map = {}
     -- forward DFS order
     for i = 1, #input do
@@ -30,8 +30,8 @@ local function checkMapping(output, input, input_map)
   if input_map == nil or output == nil or input == nil then
     return false
   end
-  if type(input) == 'table' then
-    if type(input_map) ~= 'table' then
+  if torch.type(input) == 'table' then
+    if torch.type(input_map) ~= 'table' then
       return false
     end
     if #input ~= #input_map then
@@ -46,7 +46,7 @@ local function checkMapping(output, input, input_map)
     end
     return true
   else
-    if type(input_map) ~= 'number' then
+    if torch.type(input_map) ~= 'number' then
       return false
     end
     return output[input_map] == input
@@ -56,7 +56,7 @@ end
 -- During BPROP we have to build a gradInput with the same shape as the
 -- input.  This is a recursive function to build up a gradInput
 local function inverseFlatten(gradOutput, input_map)
-  if type(input_map) == 'table' then
+  if torch.type(input_map) == 'table' then
     local gradInput = {}
     for i = 1, #input_map do
       gradInput[#gradInput + 1] = inverseFlatten(gradOutput, input_map[i])
@@ -68,7 +68,7 @@ local function inverseFlatten(gradOutput, input_map)
 end
 
 function FlattenTable:updateOutput(input)
-  assert(type(input) == 'table', 'input must be a table')
+  assert(torch.type(input) == 'table', 'input must be a table')
   -- to avoid updating rebuilding the flattened table every updateOutput call
   -- we will do a DFS pass over the existing output table and the inputs to
   -- see if it needs to be rebuilt.
@@ -80,8 +80,8 @@ function FlattenTable:updateOutput(input)
 end
 
 function FlattenTable:updateGradInput(input, gradOutput)
-  assert(type(input) == 'table', 'input must be a table')
-  assert(type(input) == 'table', 'gradOutput must be a table')
+  assert(torch.type(input) == 'table', 'input must be a table')
+  assert(torch.type(input) == 'table', 'gradOutput must be a table')
   -- If the input changes between the updateOutput and updateGradInput call,
   -- then we may have to rebuild the input_map!  However, let's assume that
   -- the input_map is valid and that forward has already been called.
diff --git a/Identity.lua b/Identity.lua
index 5e6ccb6..647aee3 100644
--- a/Identity.lua
+++ b/Identity.lua
@@ -17,7 +17,7 @@ function Identity:clearState()
       if self[f] then
          if torch.isTensor(self[f]) then
             self[f] = self[f].new()
-         elseif type(self[f]) == 'table' then
+         elseif torch.type(self[f]) == 'table' then
             self[f] = {}
          else
             self[f] = nil
diff --git a/IndexLinear.lua b/IndexLinear.lua
index 2ddbcbd..6b6b200 100644
--- a/IndexLinear.lua
+++ b/IndexLinear.lua
@@ -73,7 +73,7 @@ function IndexLinear:reset(stdv)
 end
 
 function IndexLinear:reshapeInput(input)
-   assert(type(input) == 'table')
+   assert(torch.type(input) == 'table')
 
    local ninputs = 0
    for _, v in ipairs(input) do
@@ -108,7 +108,7 @@ function IndexLinear:reshapeInput(input)
    --   { torch.LongTensor(size1), torch.LongTensor(size2), ..., torch.LongTensor(sizeN) }, -- batch of keys
    --   { torch.Tensor(size1), torch.Tensor(size2), ..., torch.Tensor(sizeN) }, -- batch of values,
    -- }
-   if type(keys) == 'table' and type(values) == 'table' then
+   if torch.type(keys) == 'table' and torch.type(values) == 'table' then
       lkeys, lvalues = keys, values
       self.isFlat = false
       self.noBatch = false
diff --git a/Kmeans.lua b/Kmeans.lua
new file mode 100644
index 0000000..56066b6
--- /dev/null
+++ b/Kmeans.lua
@@ -0,0 +1,215 @@
+-- Online (Hard) Kmeans layer.
+local Kmeans, parent = torch.class('nn.Kmeans', 'nn.Module')
+
+function Kmeans:__init(k, dim, scale)
+   parent.__init(self)
+   self.k = k
+   self.dim = dim
+
+   -- scale for online kmean update
+   self.scale = scale
+
+   assert(k > 0, "Clusters cannot be 0 or negative.")
+   assert(dim > 0, "Dimensionality cannot be 0 or negative.")
+
+   -- Kmeans centers -> self.weight
+   self.weight = torch.Tensor(self.k, self.dim)
+
+   self.gradWeight = torch.Tensor(self.weight:size())
+   self.loss = 0 -- within cluster error of the last forward
+
+   self.clusterSampleCount = torch.Tensor(self.k)
+
+   self:reset()
+end
+
+-- Reset
+function Kmeans:reset(stdev)
+   stdev = stdev or 1
+   self.weight:uniform(-stdev, stdev)
+end
+
+-- Initialize Kmeans weight with random samples from input.
+function Kmeans:initRandom(input)
+   local inputDim = input:nDimension()
+   assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+   local noOfSamples = input:size(1)
+   local dim = input:size(2)
+   assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+   assert(noOfSamples >= self.k, "Need at least k samples for initialization.")
+
+   local indices = torch.zeros(self.k)
+   indices:random(1, noOfSamples)
+
+   for i=1, self.k do
+      self.weight[i]:copy(input[indices[i]])
+   end
+end
+
+-- Initialize using Kmeans++
+function Kmeans:initKmeansPlus(input, p)
+   self.p = p or self.p or 0.95
+   assert(self.p>=0 and self.p<=1, "P value should be between 0-1.")
+
+   local inputDim = input:nDimension()
+   assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+   local noOfSamples = input:size(1)
+
+   local pcount = math.ceil((1-self.p)*noOfSamples)
+   if pcount <= 0 then pcount = 1 end
+
+   local initializedK = 1
+   self.weight[initializedK]:copy(input[torch.random(noOfSamples)])
+   initializedK = initializedK + 1
+
+   local clusters = self.weight.new()
+   local clusterDistances = self.weight.new()
+   local temp = self.weight.new()
+   local expandedSample = self.weight.new()
+   local distances = self.weight.new()
+   distances:resize(noOfSamples):fill(math.huge)
+   local maxScores = self.weight.new()
+   local maxIndx = self.weight.new()
+
+   for k=initializedK, self.k do
+      clusters = self.weight[{{initializedK-1, initializedK-1}}]
+      for i=1, noOfSamples do
+         temp:expand(input[{{i}}], 1, self.dim)
+         expandedSample:resize(temp:size()):copy(temp)
+
+         -- Squared Euclidean distance
+         expandedSample:add(-1, clusters)
+         clusterDistances:norm(expandedSample, 2, 2)
+         clusterDistances:pow(2)
+         distances[i] = math.min(clusterDistances:min(), distances[i])
+      end
+      maxScores, maxIndx = distances:sort(true)
+      local tempIndx = torch.random(pcount)
+      local indx = maxIndx[tempIndx]
+      self.weight[initializedK]:copy(input[indx])
+      initializedK = initializedK + 1
+   end
+end
+
+local function isCudaTensor(tensor)
+   local typename = torch.typename(tensor)
+   if typename and typename:find('torch.Cuda*Tensor') then
+      return true
+   end
+   return false
+end
+
+-- Kmeans updateOutput (forward)
+function Kmeans:updateOutput(input)
+   local inputDim = input:nDimension()
+   assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+   local batchSize = input:size(1)
+   local dim = input:size(2)
+   assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+
+   assert(input:isContiguous(), "Input is not contiguous.")
+
+   -- a sample copied k times to compute distance between sample and weight
+   self._expandedSamples = self._expandedSamples or self.weight.new()
+
+   -- distance between a sample and weight
+   self._clusterDistances = self._clusterDistances or self.weight.new()
+
+   self._temp = self._temp or input.new()
+   self._tempExpanded = self._tempExpanded or input.new()
+
+   -- Expanding inputs
+   self._temp:view(input, 1, batchSize, self.dim)
+   self._tempExpanded:expand(self._temp, self.k, batchSize, self.dim)
+   self._expandedSamples:resize(self.k, batchSize, self.dim)
+                        :copy(self._tempExpanded)
+
+   -- Expanding weights
+   self._tempWeight = self._tempWeight or self.weight.new()
+   self._tempWeightExp = self._tempWeightExp or self.weight.new()
+   self._expandedWeight = self._expandedWeight or self.weight.new()
+   self._tempWeight:view(self.weight, self.k, 1, self.dim)
+   self._tempWeightExp:expand(self._tempWeight, self._expandedSamples:size())
+   self._expandedWeight:resize(self.k, batchSize, self.dim)
+                       :copy(self._tempWeightExp)
+
+   -- x-c
+   self._expandedSamples:add(-1, self._expandedWeight)
+   -- Squared Euclidean distance
+   self._clusterDistances:norm(self._expandedSamples, 2, 3)
+   self._clusterDistances:pow(2)
+   self._clusterDistances:resize(self.k, batchSize)
+
+   self._minScore = self._minScore or self.weight.new()
+   self._minIndx = self._minIndx or (isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor())
+   self._minScore:min(self._minIndx, self._clusterDistances, 1)
+   self._minIndx:resize(batchSize)
+
+   self.output:resize(batchSize):copy(self._minIndx)
+   self.loss = self._minScore:sum()
+
+   return self.output
+end
+
+-- Kmeans has its own criterion hence gradInput are zeros
+function Kmeans:updateGradInput(input, gradOutput)
+   self.gradInput:resize(input:size()):zero()
+
+   return self.gradInput
+end
+
+-- We define kmeans update rule as c -> c + scale * 1/n * sum_i (x-c).
+-- n is no. of x's belonging to c.
+-- Under gradient descent (weight <- weight - lr * gradWeight), gradWeight is therefore the negative of this update.
+function Kmeans:accGradParameters(input, gradOutput, scale)
+   local scale = self.scale or scale or 1
+   assert(scale > 0 , " Scale has to be positive.")
+
+   -- Update cluster sample count
+   local batchSize = input:size(1)
+   self._cscAdder = self._cscAdder or self.weight.new()
+   self._cscAdder:resize(batchSize):fill(1)
+   self.clusterSampleCount:zero()
+   self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder)
+
+   -- scale * (x[k]-c[k]) where k is nearest cluster to x
+   self._gradWeight = self._gradWeight or self.gradWeight.new()
+   self._gradWeight:index(self.weight, 1, self._minIndx)
+   self._gradWeight:mul(-1)
+   self._gradWeight:add(input)
+   self._gradWeight:mul(-scale)
+
+   self._gradWeight2 = self._gradWeight2 or self.gradWeight.new()
+   self._gradWeight2:resizeAs(self.gradWeight):zero()
+   self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight)
+
+   -- scale/n * sum_i (x-c)
+   self._ccounts = self._ccounts or self.clusterSampleCount.new()
+   self._ccounts:resize(self.k):copy(self.clusterSampleCount)
+   self._ccounts:add(0.0000001) -- prevent division by zero errors
+
+   self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight))
+
+   self.gradWeight:add(self._gradWeight2)
+end
+
+function Kmeans:clearState()
+   -- prevent premature memory allocations
+   self._expandedSamples = nil
+   self._clusterDistances = nil
+   self._temp = nil
+   self._tempExpanded = nil
+   self._tempWeight = nil
+   self._tempWeightExp = nil
+   self._expandedWeight = nil
+   self._minScore = nil
+   self._minIndx = nil
+   self._cscAdder = nil
+end
+
+function Kmeans:type(type, tensorCache)
+   self:clearState()
+   return parent.type(self, type, tensorCache)
+end
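
A hypothetical training loop for the new nn.Kmeans layer; the cluster count, dimensionality and learning rate are chosen only for illustration:

```lua
require 'nn'
local k, dim = 3, 2
local km = nn.Kmeans(k, dim)
local data = torch.randn(100, dim)
km:initRandom(data)                      -- or km:initKmeansPlus(data)
for epoch = 1, 10 do
   km:zeroGradParameters()
   local assignments = km:forward(data)  -- cluster index per sample
   km:backward(data, torch.zeros(100))   -- gradOutput is ignored
   km:updateParameters(1.0)              -- applies the kmeans update rule
   print(epoch, km.loss)                 -- within-cluster squared error
end
```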
diff --git a/ModuleCriterion.lua b/ModuleCriterion.lua
new file mode 100644
index 0000000..bfc79ef
--- /dev/null
+++ b/ModuleCriterion.lua
@@ -0,0 +1,44 @@
+local ModuleCriterion, parent = torch.class("nn.ModuleCriterion", "nn.Criterion")
+
+function ModuleCriterion:__init(criterion, inputModule, targetModule, castTarget)
+   self.inputModule = inputModule
+   self.targetModule = targetModule
+   self.castTarget = (castTarget == nil) and true or castTarget
+   if self.inputModule then
+      local params = self.inputModule:parameters()
+      if params and #params > 0 then
+         print"Warning: nn.ModuleCriterion doesn't support parameter updates"
+      end
+   end
+   self.criterion = criterion
+end
+
+function ModuleCriterion:updateOutput(input, target)
+   if self.inputModule then
+      self.input = self.inputModule:forward(input)
+   end
+   if self.targetModule then
+      self.target = self.targetModule:forward(target)
+   end
+   self.output = self.criterion:forward(self.input or input, self.target or target)
+   return self.output
+end
+
+function ModuleCriterion:updateGradInput(input, target)
+   self.gradInput = self.criterion:backward(self.input or input, self.target or target)
+   if self.inputModule then
+      self.gradInput = self.inputModule:backward(input, self.gradInput)
+   end
+   return self.gradInput
+end
+
+function ModuleCriterion:type(type, typecache)
+   if self.inputModule then
+      self.inputModule:type(type, typecache)
+   end
+   if self.castTarget and self.targetModule then
+      self.targetModule:type(type, typecache)
+   end
+   self.criterion:type(type, typecache)
+   return parent.type(self, type, typecache)
+end
diff --git a/OneHot.lua b/OneHot.lua
new file mode 100644
index 0000000..d1dc1b5
--- /dev/null
+++ b/OneHot.lua
@@ -0,0 +1,69 @@
+local OneHot, parent = torch.class('nn.OneHot', 'nn.Module')
+
+-- adapted from https://github.com/karpathy/char-rnn
+-- and https://github.com/hughperkins/char-lstm
+
+function OneHot:__init(outputSize)
+   parent.__init(self)
+   self.outputSize = outputSize
+end
+
+function OneHot:updateOutput(input)
+   local size
+   if type(input) == 'number' then
+      if self:type() == 'torch.CudaTensor' then
+         self._single = self._single or torch.CudaTensor():resize(1);
+      else
+         self._single = self._single or torch.LongTensor():resize(1);
+      end
+      self._single[1] = input
+      input = self._single;
+      size = {}
+   else
+      size = input:size():totable()
+   end
+   table.insert(size, self.outputSize)
+
+   self.output:resize(table.unpack(size)):zero()
+
+   size[#size] = 1
+   local input_ = input:view(table.unpack(size))
+
+   if torch.type(input) == 'torch.CudaTensor' or torch.type(input) == 'torch.ClTensor' then
+      self.output:scatter(self.output:dim(), input_, 1)
+   else
+      if torch.type(self.output) == 'torch.CudaTensor' then
+         -- input is not cuda, module is, cast input to cuda
+         self._input = self._input or torch.CudaTensor()
+         self._input:resize(input_:size()):copy(input_)
+         input_ = self._input
+      elseif torch.type(input) ~= 'torch.LongTensor' then
+         -- input is not long, module is not cuda, cast input to long
+         self._input = self._input or torch.LongTensor()
+         self._input:resize(input_:size()):copy(input_)
+         input_ = self._input
+      end
+      self.output:scatter(self.output:dim(), input_, 1)
+   end
+
+   return self.output
+end
+
+function OneHot:updateGradInput(input, gradOutput)
+   if type(input) == 'number' then
+      return 0
+   else
+      self.gradInput:resize(input:size()):zero()
+      return self.gradInput
+   end
+end
+
+function OneHot:clearState()
+   self._single = nil
+   self._input = nil
+end
+
+function OneHot:type(type, typecache)
+   self:clearState()
+   return parent.type(self, type, typecache)
+end
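
A hypothetical example of the new nn.OneHot module expanding class indices into one-hot vectors:

```lua
require 'nn'
local oh = nn.OneHot(5)                 -- class count of 5
local idx = torch.LongTensor{3, 1, 5}   -- three class indices
local y = oh:forward(idx)               -- 3x5 tensor with a single 1 per row
```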
diff --git a/PrintSize.lua b/PrintSize.lua
new file mode 100644
index 0000000..d8dc91b
--- /dev/null
+++ b/PrintSize.lua
@@ -0,0 +1,36 @@
+local PrintSize, parent = torch.class('nn.PrintSize', 'nn.Module')
+
+function PrintSize:__init(prefix)
+   parent.__init(self)
+   self.prefix = prefix or "PrintSize"
+end
+
+function PrintSize:updateOutput(input)
+   self.output = input
+   local size
+   if torch.type(input) == 'table' then
+      size = input
+   elseif torch.type(input) == 'nil' then
+      size = 'missing size'
+   else
+      size = input:size()
+   end
+   print(self.prefix..":input\n", size)
+   return self.output
+end
+
+
+function PrintSize:updateGradInput(input, gradOutput)
+   local size
+   if torch.type(gradOutput) == 'table' then
+      size = gradOutput
+   elseif torch.type(gradOutput) == 'nil' then
+      size = 'missing size'
+   else
+      size = gradOutput:size()
+   end
+   print(self.prefix..":gradOutput\n", size)
+   self.gradInput = gradOutput
+   return self.gradInput
+end
+
diff --git a/SparseLinear.lua b/SparseLinear.lua
index 7c3edad..4888fc1 100644
--- a/SparseLinear.lua
+++ b/SparseLinear.lua
@@ -15,7 +15,7 @@ function SparseLinear:__init(inputSize, outputSize, doGradInput)
    self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
    self.gradBias = torch.Tensor(outputSize):zero()
 
-   assert(type(self.doGradInput) == type(true))
+   assert(type(self.doGradInput) == 'boolean')
 
    self.lastInput = nil
    self.sparseUpdate = NO_LAST_INPUT
@@ -39,7 +39,7 @@ function SparseLinear:reset(stdv)
 end
 
 function SparseLinear:reshapeInput(input)
-   if type(input) == 'table' then
+   if torch.type(input) == 'table' then
       return input, true, false
    else
       if input:dim() == 2 then
@@ -57,7 +57,7 @@ function SparseLinear:updateOutput(input)
    local input, batchMode, legacyMode = self:reshapeInput(input)
    self.legacyMode = legacyMode
 
-   if legacyMode then 
+   if legacyMode then
       input.THNN.SparseLinear_legacyUpdateOutput(
          input:cdata(),
          self.output:cdata(),
@@ -149,8 +149,8 @@ function SparseLinear:accGradParameters(input, gradOutput, scale)
 end
 
 function SparseLinear:updateGradInput(input, gradOutput)
-   if self.legacyMode then 
-      if type(self.gradInput) ~= type(gradOutput) then self.gradInput = gradOutput.new() end
+   if self.legacyMode then
+      if torch.type(self.gradInput) ~= torch.type(gradOutput) then self.gradInput = gradOutput.new() end
       self.gradInput:resizeAs(input)
    else
       self.gradInput = {}
@@ -185,7 +185,7 @@ function SparseLinear:updateGradInput(input, gradOutput)
    return self.gradInput
 end
 
--- These functions do sparse updates / zeros. However, if we accumulated 
+-- These functions do sparse updates / zeros. However, if we accumulated
 -- gradients multiple times, we can't depend on the last input to do sparse
 -- updates.
 function SparseLinear:updateParameters(learningRate)
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index e6019bc..d28579b 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -72,7 +72,7 @@ function SpatialFullConvolution:updateOutput(input)
 
   -- The input can be a table where the second element indicates the target
   -- output size, in which case the adj factors are computed automatically
-  if type(inputTensor) == 'table' then
+  if torch.type(inputTensor) == 'table' then
     inputTensor = input[1]
     local targetTensor = input[2]
     local tDims = targetTensor:dim()
@@ -113,7 +113,7 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
 
     -- The input can be a table where the second element indicates the target
     -- output size, in which case the adj factors are computed automatically
-    if type(inputTensor) == 'table' then
+    if torch.type(inputTensor) == 'table' then
       inputTensor = input[1]
       local targetTensor = input[2]
       local tDims = targetTensor:dim()
@@ -122,7 +122,7 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
       adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
       adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
       -- Momentarily extract the gradInput tensor
-      if type(self.gradInput) == 'table' then
+      if torch.type(self.gradInput) == 'table' then
         self.gradInput = self.gradInput[1] or inputTensor.new()
       end
     end
@@ -139,7 +139,7 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
       adjW, adjH
     )
 
-    if type(input) == 'table' then
+    if torch.type(input) == 'table' then
      -- Create a zero tensor to be expanded and used as gradInput[2].
       self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
       self.ones:resize(input[2]:dim()):fill(1)
@@ -162,7 +162,7 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
 
   -- The input can be a table where the second element indicates the target
   -- output size, in which case the adj factors are computed automatically
-  if type(inputTensor) == 'table' then
+  if torch.type(inputTensor) == 'table' then
     inputTensor = input[1]
     local targetTensor = input[2]
     local tDims = targetTensor:dim()
diff --git a/Sum.lua b/Sum.lua
index 8dc8305..7fe8a1a 100644
--- a/Sum.lua
+++ b/Sum.lua
@@ -34,7 +34,7 @@ function Sum:updateOutput(input)
    if self.sizeAverage then
       self.output:div(input:size(dimension))
    end
-   if self.squeeze and self.output:nDimension() > 1 then
+   if (self.squeeze == nil or self.squeeze) and self.output:nDimension() > 1 then
       self.output:set(self.output:select(dimension, 1))
    end
    return self.output
diff --git a/UpSampling.lua b/UpSampling.lua
new file mode 100644
index 0000000..9ad666f
--- /dev/null
+++ b/UpSampling.lua
@@ -0,0 +1,216 @@
+require 'nn.THNN'
+local UpSampling, parent =
+   torch.class('nn.UpSampling', 'nn.Module')
+
+--[[
+Upsamples a given 2D (spatial) or 3D (volumetric) input using either nearest neighbor, or linear
+interpolation.
+
+The input data is assumed to be of the form `minibatch x channels x [depth] x height x width`.
+Hence, for spatial inputs, we expect a 4D Tensor and for volumetric inputs, we expect a 5D Tensor.
+
+The input parameter scale_factor specifies the amount of upsampling, and is assumed to be a positive
+integer. An optional mode parameter specifies either 'nearest' (the default) or 'linear'. Linear refers
+to either bilinear for spatial (4D) tensors, or trilinear for volumetric (5D) tensors.
+
+For nearest neighbour, output size will be:
+
+odepth  = depth*scale_factor
+owidth  = width*scale_factor
+oheight  = height*scale_factor
+
+For linear interpolation:
+
+odepth  = (depth-1)*(scale_factor-1) + depth
+owidth  = (width-1)*(scale_factor-1) + width
+oheight  = (height-1)*(scale_factor-1) + height
+
+Alternatively for bilinear or trilinear, [odepth], owidth and oheight can be directly provided as input
+--]]
+
+function UpSampling:__init(params, mode)
+   parent.__init(self)
+
+   -- Any ambiguous mode will default to nearest
+   if mode ~= nil and (mode == 'linear' or mode == 'bilinear' or mode == 'trilinear') then
+      self.mode = 'linear'
+   else
+      self.mode = 'nearest'
+   end
+
+   self.odepth, self.owidth, self.oheight, self.scale_factor = nil, nil, nil, nil
+   if torch.type(params) == 'table' then
+      if self.mode == 'nearest' then
+         error ('Nearest neighbour upsampling requires a scale_factor')
+      end
+      self.odepth, self.owidth, self.oheight = params.odepth, params.owidth, params.oheight
+      if self.owidth == nil or self.oheight == nil then
+         error('Output height and width parameters are required')
+      end
+   else
+      self.scale_factor = params   
+      if self.scale_factor < 1 then
+         error('scale_factor must be at least 1')
+      end
+      if math.floor(self.scale_factor) ~= self.scale_factor then
+         error('scale_factor must be integer')
+      end
+   end
+
+   self.inputSize = torch.LongStorage(5):fill(0)
+   self.outputSize = torch.LongStorage(5):fill(0)
+end
+
+function UpSampling:setSize(input)
+   local xdim = input:dim()
+   local ydim = xdim - 1
+
+   local zdim = nil
+   if xdim > 4 then
+      zdim = xdim - 2
+   end
+
+   for i = 1, input:dim() do
+      self.inputSize[i] = input:size(i)
+      self.outputSize[i] = input:size(i)
+   end
+   if self.scale_factor ~= nil then
+      if zdim ~= nil then
+         self.outputSize[zdim] = self.outputSize[zdim] * self.scale_factor
+      end
+      self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
+      self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
+   else
+      if zdim ~= nil then
+         -- Runtime check that a depth was supplied for the received 5D input
+         if self.odepth == nil then
+            error ('No output depth dimension was supplied for volumetric upsampling')
+         end
+         self.outputSize[zdim] = self.odepth
+      end
+      self.outputSize[ydim] = self.oheight
+      self.outputSize[xdim] = self.owidth
+   end
+end
+
+function UpSampling:updateOutput(input)
+   local nDim = input:dim()
+   if nDim < 4 or nDim > 5 then
+      error('UpSampling only supports 4D or 5D tensors')
+   end
+   local xdim = nDim
+   local ydim = xdim - 1
+   local zdim
+   if nDim == 5 then
+      zdim = xdim - 2
+   end   
+   self:setSize(input)
+   if nDim == 4 then
+      if self.mode == 'nearest' then
+         input.THNN.SpatialUpSamplingNearest_updateOutput(
+            input:cdata(),
+            self.output:cdata(),
+            self.scale_factor
+         )
+      else
+         input.THNN.SpatialUpSamplingBilinear_updateOutput(
+            input:cdata(),
+            self.output:cdata(),
+            self.outputSize[ydim],
+            self.outputSize[xdim]
+         )
+      end
+   else
+      if self.mode == 'nearest' then
+         input.THNN.VolumetricUpSamplingNearest_updateOutput(
+            input:cdata(),
+            self.output:cdata(),
+            self.scale_factor
+         )
+      else
+         input.THNN.VolumetricUpSamplingTrilinear_updateOutput(
+            input:cdata(),
+            self.output:cdata(),
+            self.outputSize[zdim],
+            self.outputSize[ydim],
+            self.outputSize[xdim]
+         )
+      end
+   end
+   return self.output
+end
+
+function UpSampling:updateGradInput(input, gradOutput)
+   local nDim = input:dim()
+   if nDim < 4 or nDim > 5 then
+      error('UpSampling only supports 4D or 5D tensors')
+   end
+   if nDim ~= gradOutput:dim() then
+      error('Input and gradOutput should be of same dimension')
+   end
+   local xdim = nDim
+   local ydim = xdim - 1
+   local zdim
+   if nDim == 5 then
+      zdim = xdim - 2
+   end   
+   self.gradInput:resizeAs(input) 
+   if nDim == 4 then
+      if self.mode == 'nearest' then
+         input.THNN.SpatialUpSamplingNearest_updateGradInput(
+            input:cdata(),
+            gradOutput:cdata(),
+            self.gradInput:cdata(),
+            self.scale_factor
+         )
+      else
+         input.THNN.SpatialUpSamplingBilinear_updateGradInput(
+            gradOutput:cdata(),
+            self.gradInput:cdata(),
+            input:size(1),
+            input:size(2),
+            input:size(3),
+            input:size(4),
+            self.outputSize[ydim],
+            self.outputSize[xdim]
+         )
+      end
+   else
+      if self.mode == 'nearest' then
+         input.THNN.VolumetricUpSamplingNearest_updateGradInput(
+            input:cdata(),
+            gradOutput:cdata(),
+            self.gradInput:cdata(),
+            self.scale_factor
+         )
+      else
+         input.THNN.VolumetricUpSamplingTrilinear_updateGradInput(
+            gradOutput:cdata(),
+            self.gradInput:cdata(),
+            input:size(1),
+            input:size(2),
+            input:size(3),
+            input:size(4),
+            input:size(5),
+            self.outputSize[zdim],
+            self.outputSize[ydim],
+            self.outputSize[xdim]
+         )
+      end
+   end
+   return self.gradInput
+end
+
+function UpSampling:__tostring__()
+   local s
+   if self.scale_factor ~= nil then
+      s = string.format('%s(%dx, %s)', torch.type(self), self.scale_factor, self.mode)
+   else
+      if self.odepth ~= nil then
+         s = string.format('%s(%dx%dx%d, %s)', torch.type(self), self.odepth, self.oheight, self.owidth, self.mode)
+      else
+         s = string.format('%s(%dx%d, %s)', torch.type(self), self.oheight, self.owidth, self.mode)
+      end
+   end
+   return s
+end
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 0ce2340..60843e7 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -93,7 +93,7 @@ function VolumetricFullConvolution:updateOutput(input)
 
   -- The input can be a table where the second element indicates the target
   -- output size, in which case the adj factors are computed automatically
-  if type(inputTensor) == 'table' then
+  if torch.type(inputTensor) == 'table' then
     inputTensor = input[1]
     local targetTensor = input[2]
     local tDims = targetTensor:dim()
@@ -128,7 +128,7 @@ function VolumetricFullConvolution:updateGradInput(input, gradOutput)
 
     -- The input can be a table where the second element indicates the target
     -- output size, in which case the adj factors are computed automatically
-    if type(inputTensor) == 'table' then
+    if torch.type(inputTensor) == 'table' then
       inputTensor = input[1]
       local targetTensor = input[2]
       local tDims = targetTensor:dim()
@@ -139,7 +139,7 @@ function VolumetricFullConvolution:updateGradInput(input, gradOutput)
       adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
       adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
       -- Momentarily extract the gradInput tensor
-      if type(self.gradInput) == 'table' then
+      if torch.type(self.gradInput) == 'table' then
         self.gradInput = self.gradInput[1]
       end
     end
@@ -156,7 +156,7 @@ function VolumetricFullConvolution:updateGradInput(input, gradOutput)
       adjT, adjW, adjH
    )
 
-    if type(input) == 'table' then
+    if torch.type(input) == 'table' then
      -- Create a zero tensor to be expanded and used as gradInput[2].
       self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
       self.ones:resize(input[2]:dim()):fill(1)
@@ -177,7 +177,7 @@ function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
 
   -- The input can be a table where the second element indicates the target
   -- output size, in which case the adj factors are computed automatically
-  if type(inputTensor) == 'table' then
+  if torch.type(inputTensor) == 'table' then
     inputTensor = input[1]
     local targetTensor = input[2]
     local tDims = targetTensor:dim()
diff --git a/WhiteNoise.lua b/WhiteNoise.lua
new file mode 100644
index 0000000..f1defb6
--- /dev/null
+++ b/WhiteNoise.lua
@@ -0,0 +1,40 @@
+local WhiteNoise, parent = torch.class('nn.WhiteNoise', 'nn.Module')
+
+function WhiteNoise:__init(mean, std)
+   parent.__init(self)
+   self.mean = mean or 0
+   self.std = std or 0.1
+   self.noise = torch.Tensor()
+end
+
+function WhiteNoise:updateOutput(input)
+   self.output:resizeAs(input):copy(input)
+   if self.train ~= false then
+      self.noise:resizeAs(input)
+      self.noise:normal(self.mean, self.std)
+      self.output:add(self.noise)
+   else
+      if self.mean ~= 0 then
+         self.output:add(self.mean)
+      end
+   end
+   return self.output
+end
+
+function WhiteNoise:updateGradInput(input, gradOutput)
+   if self.train ~= false then
+      -- Simply return the gradients.
+      self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+   else
+      error('backprop only defined while training')
+   end
+   return self.gradInput
+end
+
+function WhiteNoise:clearState()
+   self.noise:set()
+end
+
+function WhiteNoise:__tostring__()
+  return string.format('%s mean: %f, std: %f', torch.type(self), self.mean, self.std)
+end
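
A hypothetical sketch showing that noise is only injected in training mode:

```lua
require 'nn'
local noisy = nn.WhiteNoise(0, 0.1)   -- zero-mean Gaussian noise, std 0.1
local x = torch.zeros(2, 4)
noisy:training()
local yTrain = noisy:forward(x)       -- x plus freshly sampled noise
noisy:evaluate()
local yTest = noisy:forward(x)        -- x unchanged (plus the mean, 0 here)
```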
diff --git a/ZeroGrad.lua b/ZeroGrad.lua
new file mode 100644
index 0000000..7c941ce
--- /dev/null
+++ b/ZeroGrad.lua
@@ -0,0 +1,14 @@
+local ZeroGrad, parent = torch.class('nn.ZeroGrad', 'nn.Module')
+
+function ZeroGrad:updateOutput(input)
+   self.output:set(input)
+   return self.output
+end
+
+-- the gradient is simply zeroed.
+-- useful when you don't want to backpropagate through certain paths.
+function ZeroGrad:updateGradInput(input, gradOutput)
+   self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+   self.gradInput = nn.utils.recursiveFill(self.gradInput, 0)
+   return self.gradInput
+end
diff --git a/ZipTable.lua b/ZipTable.lua
new file mode 100644
index 0000000..7b18619
--- /dev/null
+++ b/ZipTable.lua
@@ -0,0 +1,34 @@
+local ZipTable, parent = torch.class('nn.ZipTable', 'nn.Module')
+
+-- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+-- output : { {a1,b1,c1}, {a2,b2,c2} }
+function ZipTable:__init()
+   parent.__init(self)
+   self.output = {}
+   self.gradInput = {}
+end
+
+function ZipTable:updateOutput(inputTable)
+   self.output = {}
+   for i,inTable in ipairs(inputTable) do
+      for j,input in ipairs(inTable) do
+         local output = self.output[j] or {}
+         output[i] = input
+         self.output[j] = output
+      end
+   end
+   return self.output
+end
+
+function ZipTable:updateGradInput(inputTable, gradOutputTable)
+   self.gradInput = {}
+   for i,gradOutTable in ipairs(gradOutputTable) do
+      for j,gradOutput in ipairs(gradOutTable) do
+         local gradInput = self.gradInput[j] or {}
+         gradInput[i] = gradOutput
+         self.gradInput[j] = gradInput
+      end
+   end
+   return self.gradInput
+end
+
diff --git a/ZipTableOneToMany.lua b/ZipTableOneToMany.lua
new file mode 100644
index 0000000..d4a80fe
--- /dev/null
+++ b/ZipTableOneToMany.lua
@@ -0,0 +1,37 @@
+local ZipTableOneToMany, parent = torch.class('nn.ZipTableOneToMany', 'nn.Module')
+
+-- based on ZipTable in dpnn
+
+-- input : { v, {a, b, c} }
+-- output : { {v,a}, {v,b}, {v,c} }
+function ZipTableOneToMany:__init()
+   parent.__init(self)
+   self.output = {}
+   self.gradInput = {}
+   -- make buffer to update during forward/backward
+   self.gradInputEl = torch.Tensor()
+end
+
+function ZipTableOneToMany:updateOutput(input)
+   assert(#input == 2, "input must be table of element and table")
+   local inputEl, inputTable = input[1], input[2]
+   self.output = {}
+   for i,v in ipairs(inputTable) do
+      self.output[i] = {inputEl, v}
+   end
+   return self.output
+end
+
+function ZipTableOneToMany:updateGradInput(input, gradOutput)
+   assert(#input == 2, "input must be table of element and table")
+   local inputEl, inputTable = input[1], input[2]
+   self.gradInputEl:resizeAs(inputEl):zero()
+   local gradInputTable = {}
+   for i,gradV in ipairs(gradOutput) do
+      self.gradInputEl:add(gradV[1])
+      gradInputTable[i] = gradV[2]
+   end
+   self.gradInput = {self.gradInputEl, gradInputTable}
+   return self.gradInput
+end
+
diff --git a/doc/convolution.md b/doc/convolution.md
index 82d890e..99b19b7 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -45,6 +45,7 @@ a kernel for computing the weighted average in a neighborhood ;
     * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
     * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation.
     * [VolumetricReplicationPadding](#nn.VolumetricReplicationPadding) : Pads a volumetric feature map with the value at the edge of the input borders. ;
+    * [UpSampling](#nn.UpSampling): Upsampling for either spatial or volumetric inputs using nearest neighbor or linear interpolation.   
 
 
 <a name="nn.TemporalModules"></a>
@@ -1250,3 +1251,37 @@ module = nn.VolumetricReplicationPadding(padLeft, padRight, padTop, padBottom,
 ```
 
 Each feature map of a given input is padded with the replication of the input boundary.
+
+<a name="nn.UpSampling"></a>
+### UpSampling ###
+
+```lua
+module = nn.UpSampling(scale, 'nearest')
+module = nn.UpSampling(scale, 'linear')
+module = nn.UpSampling({[odepth=D,] oheight=H, owidth=W}, 'linear')
+```
+
+Applies a 2D (spatial) or 3D (volumetric) up-sampling over an input image composed of several input planes. Available interpolation modes are nearest neighbor or linear (i.e. bilinear or trilinear depending on the input dimensions).  The `input` tensor in `forward(input)` is expected to be of the form `minibatch x channels x [depth] x height x width`. I.e. for 4D input the final two dimensions will be upsampled, for 5D input the final three dimensions will be upsampled. The number of output planes will be the same.
+
+The parameters are the following:
+  * `scale`: the upscale ratio. Must be a positive integer. Required when using nearest neighbor interpolation.
+  * Or a table `{[odepth=D,] oheight=H, owidth=W}`: the required output depth, height and width; these should be positive integers.
+  * `mode`: the interpolation method, either `'nearest'` or `'linear'`. Default is `'nearest'`.
+
+If `scale` is specified then, given an input of depth `iD`, height `iH` and width `iW`, the output depth, height and width for nearest neighbor interpolation will be:
+
+```lua
+oD = iD * scale
+oH = iH * scale
+oW = iW * scale
+```
+
+For linear interpolation:
+
+```lua
+oD = (iD - 1) * (scale - 1) + iD
+oH = (iH - 1) * (scale - 1) + iH
+oW = (iW - 1) * (scale - 1) + iW
+```
+
+There are no learnable parameters.
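+
+For illustration, a minimal sketch of the nearest-neighbor case (the tensor sizes here are arbitrary):
+
+```lua
+module = nn.UpSampling(2, 'nearest')
+input = torch.randn(1, 3, 4, 4)   -- minibatch x channels x height x width
+output = module:forward(input)
+print(#output)                    -- 1x3x8x8 : height and width doubled
+```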
diff --git a/doc/criterion.md b/doc/criterion.md
index a3e1b2e..06d97dc 100644
--- a/doc/criterion.md
+++ b/doc/criterion.md
@@ -29,6 +29,7 @@ target, they compute a gradient according to a given loss function.
     * [`MultiCriterion`](#nn.MultiCriterion) : a weighted sum of other criterions each applied to the same input and target;
     * [`ParallelCriterion`](#nn.ParallelCriterion) : a weighted sum of other criterions each applied to a different input and target;
     * [`MarginRankingCriterion`](#nn.MarginRankingCriterion): ranks two inputs;
+    * [`ModuleCriterion`](#nn.ModuleCriterion) : adds an optional `inputModule` and `targetModule` before a decorated criterion;
 
 <a name="nn.Criterion"></a>
 ## Criterion ##
@@ -180,7 +181,7 @@ crit.nll.sizeAverage = false
 ```
 The losses are averaged across observations for each minibatch.
 
-<a name="nn.ClassSimplexCriterion"/>
+<a name="nn.ClassSimplexCriterion"></a>
 ## ClassSimplexCriterion ##
 
 ```lua
@@ -877,3 +878,17 @@ for i = 1, 100 do
    end
 end
 ```
+
+<a name='nn.ModuleCriterion'></a>
+## ModuleCriterion ##
+
+```lua
+criterion = nn.ModuleCriterion(criterion [, inputModule, targetModule, castTarget])
+```
+
+This criterion decorates a `criterion` by allowing the `input` and `target` to be
+fed through an optional `inputModule` and `targetModule` before being passed to the
+`criterion`. The `inputModule` must not contain parameters as these would not be updated.
+
+When `castTarget = true` (the default), the `targetModule` is cast along with the `inputModule` and
+`criterion`. Otherwise, the `targetModule` isn't.
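+
+For illustration, a small sketch that uses `nn.Convert` as the `inputModule` so that the decorated
+`nn.MSECriterion` accepts inputs of any tensor type (the values are arbitrary):
+
+```lua
+criterion = nn.ModuleCriterion(nn.MSECriterion(), nn.Convert())
+input = torch.FloatTensor{1, 2, 3}   -- cast to the criterion's type by nn.Convert
+target = torch.Tensor{1, 2, 4}
+loss = criterion:forward(input, target)
+```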
diff --git a/doc/simple.md b/doc/simple.md
index 7d19fd4..e18e15d 100755
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -16,6 +16,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [Euclidean](#nn.Euclidean) : the euclidean distance of the input to `k` mean centers ;
     * [WeightedEuclidean](#nn.WeightedEuclidean) : similar to [Euclidean](#nn.Euclidean), but additionally learns a diagonal covariance matrix ;
     * [Cosine](#nn.Cosine) : the cosine similarity of the input to `k` mean centers ;
+    * [Kmeans](#nn.Kmeans) : [Kmeans](https://en.wikipedia.org/wiki/K-means_clustering) clustering layer;
   * Modules that adapt basic Tensor methods :
     * [Copy](#nn.Copy) : a [copy](https://github.com/torch/torch7/blob/master/doc/tensor.md#torch.Tensor.copy) of the input with [type](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-or-string-typetype) casting ;
     * [Narrow](#nn.Narrow) : a [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation over a given dimension ;
@@ -57,6 +58,13 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ;
     * [GPU](#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
     * [TemporalDynamicKMaxPooling](#nn.TemporalDynamicKMaxPooling) : selects the k highest values in a sequence. k can be calculated based on sequence length ;
+    * [Constant](#nn.Constant) : outputs a constant value given an input (which is ignored);
+    * [WhiteNoise](#nn.WhiteNoise) : adds isotropic Gaussian noise to the signal when in training mode;
+    * [OneHot](#nn.OneHot) : transforms a tensor of indices into [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding;
+    * [PrintSize](#nn.PrintSize) : prints the size of `input` and `gradOutput` (useful for debugging);
+    * [ZeroGrad](#nn.ZeroGrad) : forwards the `input` as-is, yet zeros the `gradInput`;
+    * [Collapse](#nn.Collapse) : just like `nn.View(-1)`;
+    * [Convert](#nn.Convert) : convert between different tensor types or shapes;
 
 <a name="nn.Linear"></a>
 ## Linear ##
@@ -675,6 +683,54 @@ Outputs the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
 
 The distance `y_j` between center `j` and input `x` is formulated as `y_j = (x · w_j) / ( || w_j || * || x || )`.
 
+<a name='nn.Kmeans'></a>
+## Kmeans ##
+
+```lua
+km = nn.Kmeans(k, dim)
+```
+
+`k` is the number of centroids and `dim` is the dimensionality of samples.
+The `forward` pass computes distances with respect to the centroids and returns the index of the closest centroid for each sample.
+Centroids can be updated using gradient descent.
+Centroids can be initialized randomly or using the [kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B) algorithm:
+
+```lua
+km:initRandom(samples) -- Randomly initialize centroids from input samples.
+km:initKmeansPlus(samples) -- Use Kmeans++ to initialize centroids.
+```
+
+An example showing how to use the Kmeans module to do standard Kmeans clustering:
+
+```lua
+attempts = 10
+iter = 100 -- Number of iterations
+bestKm = nil
+bestLoss = math.huge
+learningRate = 1
+for j=1, attempts do
+   local km = nn.Kmeans(k, dim)
+   km:initKmeansPlus(samples)
+   for i=1, iter do
+      km:zeroGradParameters()
+      km:forward(samples) -- sets km.loss
+      km:backward(samples, gradOutput) -- gradOutput is ignored
+
+      -- Gradient Descent weight/centroids update
+      km:updateParameters(learningRate)
+   end
+
+   if km.loss < bestLoss then
+      bestLoss = km.loss
+      bestKm = km:clone()
+   end
+end
+```
+The `nn.Kmeans()` module maintains the loss only for the latest forward pass. If you want to track the loss over the whole dataset, you need to accumulate the module's loss yourself after each forward.
+
+You can also use `nn.Kmeans()` as an auxiliary layer in your network.
+A call to `forward` will generate an `output` containing the index of the nearest cluster for each sample in the batch.
+The `gradInput` generated by `updateGradInput` will be zero.
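+
+A minimal sketch of this auxiliary usage (assuming `samples` is an `n x 2` tensor; the number of centroids is arbitrary):
+
+```lua
+km = nn.Kmeans(4, 2)                   -- 4 centroids over 2-dimensional samples
+km:initRandom(samples)                 -- samples : n x 2 tensor (assumed)
+clusterIndices = km:forward(samples)   -- one centroid index per sample
+```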
 
 <a name="nn.Identity"></a>
 ## Identity ##
@@ -1024,6 +1080,8 @@ Example 2:
 [torch.LongStorage of size 2]
 ```
 
+For collapsing non-batch dims, check out [nn.Collapse](#nn.Collapse).
+
 <a name="nn.Contiguous"></a>
 ## Contiguous ##
 
@@ -1490,7 +1548,7 @@ C = model:forward(A)  -- C will be of size `b x m`
 ## PixelShuffle ##
 ```module = nn.PixelShuffle(r)```
 
-Rearranges elements in a tensor of shape `[C*r, H, W]` to a tensor of shape `[C, H*r, W*r]`. This is useful for implementing efficient sub-pixel convolution with a stride of `1/r` (see [Shi et. al](https://arxiv.org/abs/1609.05158)). Below we show how the `PixelShuffle` module can be used to learn upscaling filters to transform a low-resolution input to a high resolution one, with a 3x upscale factor. This is useful for tasks such as super-resolution, see ["Real-Time Single Image and Vid [...]
+Rearranges elements in a tensor of shape `[C*r*r, H, W]` to a tensor of shape `[C, H*r, W*r]`. This is useful for implementing efficient sub-pixel convolution with a stride of `1/r` (see [Shi et. al](https://arxiv.org/abs/1609.05158)). Below we show how the `PixelShuffle` module can be used to learn upscaling filters to transform a low-resolution input to a high resolution one, with a 3x upscale factor. This is useful for tasks such as super-resolution, see ["Real-Time Single Image and V [...]
 
 ```
 upscaleFactor = 3
@@ -1663,3 +1721,207 @@ If `factor` is not provided, `k = minK`, else the value of k is calculated with:
 ```lua
 k = math.max(minK, math.ceil(factor*nInputFrame)))
 ```
+
+<a name='nn.Constant'></a>
+## Constant ##
+
+```lua
+module = nn.Constant(value, nInputDim)
+```
+
+This module outputs a constant value given an input.
+If `nInputDim` is specified, it uses the input to determine the size of the batch.
+The `value` is then replicated over the batch.
+Otherwise, the `value` Tensor is output as is.
+During `backward`, the returned `gradInput` is a zero Tensor of the same size as the `input`.
+This module has no trainable parameters.
+
+You can use this with nn.ConcatTable() to append constant inputs to an input :
+
+```lua
+nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity())
+```
+
+This is useful when you want to output a value that is independent of the
+input to the neural network.
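+
+A small sketch of the batch replication (the sizes here are arbitrary):
+
+```lua
+module = nn.Constant(torch.Tensor{1, 2, 3}, 1) -- constant vector, inputs have 1 non-batch dim
+print(module:forward(torch.randn(2, 5)))       -- batch of 2 inputs of size 5
+ 1  2  3
+ 1  2  3
+[torch.DoubleTensor of size 2x3]
+```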
+
+<a name='nn.WhiteNoise'></a>
+## WhiteNoise ##
+
+```lua
+module = nn.WhiteNoise([mean, stdev])
+```
+
+This module adds isotropic Gaussian noise to the `input`.
+This can be useful for training [Denoising Autoencoders](http://arxiv.org/pdf/1507.02672v1.pdf).
+Takes `mean` and `stdev` of the normal distribution as constructor arguments.
+Default values for mean and standard deviation are 0 and 0.1 respectively.
+With `module:training()`, Gaussian noise is added during `forward`.
+During `backward` gradients are passed as is.
+With `module:evaluate()` the `mean` is added to the input.
+
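+A minimal sketch of the training/evaluation behaviour (the values are illustrative):
+
+```lua
+module = nn.WhiteNoise(0, 0.1)
+input = torch.zeros(2, 3)
+module:training()
+noisy = module:forward(input)   -- zero-mean Gaussian noise with stdev 0.1 is added
+module:evaluate()
+clean = module:forward(input)   -- deterministic : only the mean (here 0) is added
+```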
+
+<a name = 'nn.OneHot'></a>
+## OneHot ##
+
+```lua
+module = nn.OneHot(outputSize)
+```
+
+Transforms a tensor of `input` indices having integer values between 1 and `outputSize` into
+a tensor of one-hot vectors of size `outputSize`.
+
+Forward an index to get a one-hot vector :
+
+```lua
+> module = nn.OneHot(5) -- 5 classes
+> module:forward(torch.LongTensor{3})
+ 0  0  1  0  0
+[torch.DoubleTensor of size 1x5]
+```
+
+Forward a batch of 3 indices. Notice that these need not be stored as `torch.LongTensor` :
+
+```lua
+> module:forward(torch.Tensor{3,2,1})
+ 0  0  1  0  0
+ 0  1  0  0  0
+ 1  0  0  0  0
+[torch.DoubleTensor of size 3x5]
+```
+
+Forward batch of `2 x 3` indices :
+
+```lua
+> module:forward(torch.Tensor{{3,2,1},{1,2,3}})
+(1,.,.) =
+  0  0  1  0  0
+  0  1  0  0  0
+  1  0  0  0  0
+
+(2,.,.) =
+  1  0  0  0  0
+  0  1  0  0  0
+  0  0  1  0  0
+[torch.DoubleTensor of size 2x3x5]
+```
+
+<a name='nn.PrintSize'></a>
+## PrintSize ##
+
+```lua
+module = nn.PrintSize(name)
+```
+
+This module is useful for debugging complicated module composites.
+It prints the size of the `input` and `gradOutput` during `forward`
+and `backward` propagation respectively.
+The `name` is a string used to identify the module alongside the printed size.
+
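+For example, a sketch of dropping a `PrintSize` between two layers of a `Sequential` (the layer sizes are arbitrary):
+
+```lua
+model = nn.Sequential()
+   :add(nn.Linear(10, 20))
+   :add(nn.PrintSize('after linear'))   -- prints the sizes flowing through this point
+   :add(nn.Tanh())
+output = model:forward(torch.randn(4, 10))
+```
+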
+<a name='nn.ZeroGrad'></a>
+## ZeroGrad ##
+
+```lua
+module = nn.ZeroGrad()
+input = torch.Tensor{1,2}
+gradOutput = torch.Tensor{3,4}
+print(module:forward(input))
+ 1
+ 2
+[torch.DoubleTensor of size 2]
+
+print(module:backward(input, gradOutput))
+ 0
+ 0
+[torch.DoubleTensor of size 2]
+```
+
+The module zeros the `gradInput` but forwards the `input` as-is.
+
+<a name='nn.Collapse'></a>
+## Collapse ##
+
+```lua
+module = nn.Collapse(nInputDim)
+```
+
+This module is the equivalent of:
+```lua
+view = nn.View(-1)
+view:setNumInputDim(nInputDim)
+```
+
+It collapses all non-batch dimensions. This is useful for converting
+a spatial feature map to the single dimension required by a dense
+hidden layer like Linear.
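+
+A minimal sketch (assuming a batch of 3D feature maps):
+
+```lua
+module = nn.Collapse(3)           -- collapse the last 3 (non-batch) dimensions
+input = torch.randn(8, 3, 4, 5)   -- batch of 8 feature maps
+print(#module:forward(input))     -- 8 x 60
+```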
+
+<a name='nn.Convert'></a>
+## Convert ##
+
+```lua
+module = nn.Convert([inputShape, outputShape])
+```
+Module to convert between different data formats.
+For example, we can flatten images by using :
+```lua
+module = nn.Convert('bchw', 'bf')
+```
+or equivalently
+```lua
+module = nn.Convert('chw', 'f')
+```
+Let's try it with an input:
+```lua
+print(module:forward(torch.randn(3,2,3,1)))
+ 0.5692 -0.0190  0.5243  0.7530  0.4230  1.2483
+-0.9142  0.6013  0.5608 -1.0417 -1.4014  1.0177
+-1.5207 -0.1641 -0.4166  1.4810 -1.1725 -1.0037
+[torch.DoubleTensor of size 3x6]
+```
+You could also try:
+
+```lua
+module = nn.Convert('chw', 'hwc')
+input = torch.randn(1,2,3,2)
+input:select(2,1):fill(1)
+input:select(2,2):fill(2)
+print(input)
+(1,1,.,.) =
+  1  1
+  1  1
+  1  1
+(1,2,.,.) =
+  2  2
+  2  2
+  2  2
+[torch.DoubleTensor of size 1x2x3x2]
+print(module:forward(input))
+(1,1,.,.) =
+  1  2
+  1  2
+
+(1,2,.,.) =
+  1  2
+  1  2
+
+(1,3,.,.) =
+  1  2
+  1  2
+[torch.DoubleTensor of size 1x3x2x2]
+```
+
+
+Furthermore, it automatically converts the `input` to have the same type as `self.output`
+(i.e. the type of the module).
+So you can also just use it for automatic input type conversions:
+```lua
+module = nn.Convert()
+print(module.output) -- type of module
+[torch.DoubleTensor with no dimension]
+input = torch.FloatTensor{1,2,3}
+print(module:forward(input))
+ 1
+ 2
+ 3
+[torch.DoubleTensor of size 3]
+```
diff --git a/doc/table.md b/doc/table.md
index b3e2e5f..8734bf3 100644
--- a/doc/table.md
+++ b/doc/table.md
@@ -15,6 +15,8 @@ This allows one to build very rich architectures:
     * [`SelectTable`](#nn.SelectTable): select one element from a `table`;
     * [`NarrowTable`](#nn.NarrowTable): select a slice of elements from a `table`;
     * [`FlattenTable`](#nn.FlattenTable): flattens a nested `table` hierarchy;
+    * [`ZipTable`](#nn.ZipTable) : zip a table of tables into a table of tables;
+    * [`ZipTableOneToMany`](#nn.ZipTableOneToMany) : zip a single element with each element of a table;
   * Pair Modules compute a measure like distance or similarity from a pair (`table`) of input `Tensor`s:
     * [`PairwiseDistance`](#nn.PairwiseDistance): outputs the `p`-norm. distance between inputs;
     * [`DotProduct`](#nn.DotProduct): outputs the dot product (similarity) between inputs;
@@ -26,6 +28,7 @@ This allows one to build very rich architectures:
     * [`CDivTable`](#nn.CDivTable): division of input `Tensor`s;
     * [`CMaxTable`](#nn.CMaxTable): max of input `Tensor`s;
     * [`CMinTable`](#nn.CMinTable): min of input `Tensor`s;
+    * [`CAddTensorTable`](#nn.CAddTensorTable): adds a tensor to a table of tensors of the same size;
   * `Table` of Criteria:
     * [`CriterionTable`](#nn.CriterionTable): wraps a [Criterion](criterion.md#nn.Criterion) so that it can accept a `table` of inputs.
 
@@ -692,7 +695,7 @@ Forwarding a batch of 2 examples gives us something like this:
 
 `module` = `SelectTable(index)`
 
-Creates a module that takes a (nested) `table` as input and outputs the element at index `index`. `index` can be strings or integers (positive or negative). 
+Creates a module that takes a (nested) `table` as input and outputs the element at index `index`. `index` can be strings or integers (positive or negative).
 This can be either a `table` or a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor).
 
 The gradients of the non-`index` elements are zeroed `Tensor`s of the same size. This is true regardless of the
@@ -731,7 +734,7 @@ Exmaple 2:
 
 > gradInput = nn.SelectTable("A"):backward(input, torch.randn(2, 3))
 
-> gradInput 
+> gradInput
 {
   A : DoubleTensor - size: 2x3
   B : DoubleTensor - size: 2x1
@@ -811,11 +814,11 @@ Example 3:
 
 `module` = `NarrowTable(offset [, length])`
 
-Creates a module that takes a `table` as input and outputs the subtable 
+Creates a module that takes a `table` as input and outputs the subtable
 starting at index `offset` having `length` elements (defaults to 1 element).
 The elements can be either a `table` or a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor).
 
-The gradients of the elements not included in the subtable are zeroed `Tensor`s of the same size. 
+The gradients of the elements not included in the subtable are zeroed `Tensor`s of the same size.
 This is true regardless of the depth of the encapsulated `Tensor` as the function used internally to do so is recursive.
 
 Example:
@@ -883,6 +886,36 @@ gives the output:
 }
 ```
 
+<a name='nn.ZipTable'></a>
+## ZipTable ##
+
+```lua
+module = nn.ZipTable()
+```
+
+Zips a table of tables into a table of tables, grouping the i-th elements of each inner table together (a transpose of the nesting).
+
+Example:
+```lua
+print(module:forward{ {'a1','a2'}, {'b1','b2'}, {'c1','c2'} })
+{ {'a1','b1','c1'}, {'a2','b2','c2'} }
+```
+
+<a name='nn.ZipTableOneToMany'></a>
+## ZipTableOneToMany ##
+
+```lua
+module = nn.ZipTableOneToMany()
+```
+
+Zips an element `el` and a table of elements `tab` into a table of pairs, where the i-th pair contains the element `el` and the i-th element of `tab`.
+
+Example:
+```lua
+print(module:forward{ 'el', {'a','b','c'} })
+{ {'el','a'}, {'el','b'}, {'el','c'} }
+```
+
 <a name="nn.PairwiseDistance"></a>
 ## PairwiseDistance ##
 
@@ -1319,3 +1352,18 @@ m = nn.CMinTable()
  1
 [torch.DoubleTensor of size 3]
 ```
+
+<a name='nn.CAddTensorTable'></a>
+## CAddTensorTable ##
+
+```lua
+module = nn.CAddTensorTable()
+```
+
+Adds the first element `el` of the input table to each tensor contained in the second element, which is itself a table of tensors of the same size as `el`.
+
+Example:
+```lua
+print(module:forward{ (0,1,1), {(0,0,0),(1,1,1)} })
+{ (0,1,1), (1,2,2) }
+```
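+
+The same example as a runnable sketch with actual tensors:
+
+```lua
+module = nn.CAddTensorTable()
+el  = torch.Tensor{0, 1, 1}
+tab = {torch.Tensor{0, 0, 0}, torch.Tensor{1, 1, 1}}
+output = module:forward{el, tab}   -- output[1] = el + tab[1], output[2] = el + tab[2]
+```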
diff --git a/hessian.lua b/hessian.lua
index 33ef2b0..7518e1a 100644
--- a/hessian.lua
+++ b/hessian.lua
@@ -216,7 +216,7 @@ function nn.hessian.enable()
    function nn.SpatialConvolution.initDiagHessianParameters(self)
       initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
    end
-   
+
    ----------------------------------------------------------------------
    -- SpatialConvolutionLocal
    ----------------------------------------------------------------------
@@ -361,7 +361,7 @@ function nn.hessian.enable()
 
    function nn.Sequential.parameters(self)
       local function tinsert(to, from)
-         if type(from) == 'table' then
+         if torch.type(from) == 'table' then
             for i=1,#from do
                tinsert(to,from[i])
             end
diff --git a/init.lua b/init.lua
index 18c3c8c..21ac789 100755
--- a/init.lua
+++ b/init.lua
@@ -56,9 +56,14 @@ require('nn.MulConstant')
 require('nn.CAdd')
 require('nn.Add')
 require('nn.AddConstant')
+require('nn.Constant')
 require('nn.Dropout')
 require('nn.SpatialDropout')
 require('nn.VolumetricDropout')
+require('nn.WhiteNoise')
+require('nn.OneHot')
+require('nn.PrintSize')
+require('nn.ZeroGrad')
 
 require('nn.CAddTable')
 require('nn.CDivTable')
@@ -66,6 +71,7 @@ require('nn.CMulTable')
 require('nn.CSubTable')
 require('nn.CMaxTable')
 require('nn.CMinTable')
+require('nn.CAddTensorTable')
 
 require('nn.Euclidean')
 require('nn.WeightedEuclidean')
@@ -74,6 +80,7 @@ require('nn.CosineDistance')
 require('nn.DotProduct')
 require('nn.Normalize')
 require('nn.Cosine')
+require('nn.Kmeans')
 
 require('nn.Exp')
 require('nn.Log')
@@ -140,6 +147,7 @@ require('nn.SpatialReplicationPadding')
 require('nn.SpatialUpSamplingNearest')
 require('nn.SpatialUpSamplingBilinear')
 require('nn.SpatialBatchNormalization')
+require('nn.UpSampling')
 
 require('nn.VolumetricConvolution')
 require('nn.VolumetricFullConvolution')
@@ -152,6 +160,8 @@ require('nn.VolumetricAveragePooling')
 require('nn.VolumetricBatchNormalization')
 require('nn.VolumetricReplicationPadding')
 
+require('nn.FeatureLPPooling')
+
 require('nn.GPU')
 
 require('nn.ParallelTable')
@@ -165,6 +175,10 @@ require('nn.CriterionTable')
 require('nn.FlattenTable')
 require('nn.NarrowTable')
 require('nn.MapTable')
+require('nn.ZipTable')
+require('nn.ZipTableOneToMany')
+require('nn.Collapse')
+require('nn.Convert')
 
 require('nn.Criterion')
 require('nn.MSECriterion')
@@ -192,6 +206,7 @@ require('nn.BCECriterion')
 require('nn.CrossEntropyCriterion')
 require('nn.ParallelCriterion')
 require('nn.DistanceRatioCriterion')
+require('nn.ModuleCriterion')
 
 require('nn.PixelShuffle')
 
diff --git a/lib/THNN/doc/api_reference.md b/lib/THNN/doc/api_reference.md
index 830cc3d..70c5c79 100644
--- a/lib/THNN/doc/api_reference.md
+++ b/lib/THNN/doc/api_reference.md
@@ -59,7 +59,10 @@ These are all modules implemented in THNN:
 * [SpatialMaxPooling](#spatialmaxpooling)
 * [SpatialMaxUnpooling](#spatialmaxunpooling)
 * [SpatialSubSampling](#spatialsubsampling)
+* [SpatialReflectionPadding](#spatialreflectionpadding)
+* [SpatialReplicationPadding](#spatialreplicationpadding)
 * [SpatialUpSamplingNearest](#spatialupsamplingnearest)
+* [SpatialUpSamplingBilinear](#spatialupsamplingbilinear)
 * [Sqrt](#sqrt)
 * [Square](#square)
 * [Tanh](#tanh)
@@ -70,6 +73,9 @@ These are all modules implemented in THNN:
 * [VolumetricFullConvolution](#volumetricfullconvolution)
 * [VolumetricMaxPooling](#volumetricmaxpooling)
 * [VolumetricMaxUnpooling](#volumetricmaxunpooling)
+* [VolumetricReplicationPadding](#volumetricreplicationpadding)
+* [VolumetricUpSamplingNearest](#volumetricupsamplingnearest)
+* [VolumetricUpSamplingTrilinear](#volumetricupsamplingtrilinear)
 
 ## Abs
 ```C
@@ -1254,6 +1260,42 @@ void THNN_SpatialSubSampling_accGradParameters(
           int dW, int dH,
           real scale);
 ```
+## SpatialReflectionPadding
+```C
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+```
+```C
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+```
+## SpatialReplicationPadding
+```C
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+```
+```C
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pad_l, int pad_r,
+          int pad_t, int pad_b);
+```
 ## SpatialUpSamplingNearest
 ```C
 void THNN_SpatialUpSamplingNearest_updateOutput(
@@ -1270,6 +1312,27 @@ void THNN_SpatialUpSamplingNearest_updateGradInput(
           THTensor *gradInput,
           int scale_factor);
 ```
+## SpatialUpSamplingBilinear
+```C
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+	        int outputHeight,
+          int outputWidth);
+```
+```C
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+          THNNState *state,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int nbatch,
+          int nchannels,
+          int inputHeight,
+          int inputWidth,
+          int outputHeight,
+          int outputWidth);
+```
 ## Sqrt
 ```C
 void THNN_Sqrt_updateOutput(
@@ -1507,3 +1570,63 @@ void THNN_VolumetricMaxUnpooling_updateGradInput(
           int dT, int dW, int dH,
           int pT, int pW, int pH);
 ```
+## VolumetricReplicationPadding
+```C
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int pleft, int pright,
+          int ptop, int pbottom,
+          int pfront, int pback);
+```
+```C
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int pleft, int pright,
+          int ptop, int pbottom,
+          int pfront, int pback);
+```
+## VolumetricUpSamplingNearest
+```C
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int scale_factor);
+```
+```C
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int scale_factor);
+```
+## VolumetricUpSamplingTrilinear
+```C
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int outputDepth,
+          int outputHeight,
+          int outputWidth);
+```
+```C
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+          THNNState *state,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int nbatch,
+          int nchannels,
+          int inputDepth,
+          int inputHeight,
+          int inputWidth,
+          int outputDepth,
+          int outputHeight,
+          int outputWidth);
+```
diff --git a/lib/THNN/generic/BCECriterion.c b/lib/THNN/generic/BCECriterion.c
index 55909ba..637a406 100644
--- a/lib/THNN/generic/BCECriterion.c
+++ b/lib/THNN/generic/BCECriterion.c
@@ -18,12 +18,18 @@ void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input,
       real x = *input_data;
       real y = *target_data;
       real w = *weights_data;
+      THAssertMsg(x >= 0. && x <= 1.,
+        "input value should be between 0~1, but got %f",
+		  (double) x);
       sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w;
     )
   else
     TH_TENSOR_APPLY2(real, input, real, target,
       real x = *input_data;
       real y = *target_data;
+      THAssertMsg(x >= 0. && x <= 1.,
+        "input value should be between 0~1, but got %f",
+		  (double) x);
       sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. - y);
     );
 
diff --git a/lib/THNN/generic/FeatureLPPooling.c b/lib/THNN/generic/FeatureLPPooling.c
new file mode 100644
index 0000000..25a58db
--- /dev/null
+++ b/lib/THNN/generic/FeatureLPPooling.c
@@ -0,0 +1,348 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/FeatureLPPooling.c"
+#else
+
+#ifndef FEATURE_LP_DEFS
+#define FEATURE_LP_DEFS
+
+typedef struct {
+  size_t size[4];
+  size_t stride[4];
+} FeatureLPPoolingSizes;
+
+inline size_t flpGetOffset(FeatureLPPoolingSizes* s,
+                           size_t batch,
+                           size_t feature,
+                           size_t opt1,
+                           size_t opt2) {
+  return s->stride[0] * batch +
+    s->stride[1] * feature +
+    s->stride[2] * opt1 +
+    s->stride[3] * opt2;
+}
+
+inline size_t flpOutputSize(size_t inputSize,
+                            size_t width,
+                            size_t stride) {
+  return ((inputSize - width) / stride) + 1;
+}
+
+#endif // FEATURE_LP_DEFS
+
+FeatureLPPoolingSizes
+THNN_(FeatureLPPooling_upcastCPU)(THTensor* t, bool batchMode) {
+  int dim = THTensor_(nDimension)(t);
+
+  // Upcast to [batch dim][feature dim][opt dim 1][opt dim 2]
+  FeatureLPPoolingSizes s;
+  for (int i = 0; i < 4; ++i) {
+    s.size[i] = 1;
+    s.stride[i] = 1;
+  }
+
+  if (dim == 1) {
+    THAssert(!batchMode);
+    // [feature dim]
+    s.size[1] = THTensor_(size)(t, 0);
+    s.stride[1] = THTensor_(stride)(t, 0);
+  } else if (dim == 2) {
+    if (batchMode) {
+      // [batch dim][feature dim]
+      for (int i = 0; i < 2; ++i) {
+        s.size[i] = THTensor_(size)(t, i);
+        s.stride[i] = THTensor_(stride)(t, i);
+      }
+    } else {
+      // [feature dim][opt dim 1]
+      s.size[1] = THTensor_(size)(t, 0);
+      s.stride[1] = THTensor_(stride)(t, 0);
+      s.size[2] = THTensor_(size)(t, 1);
+      s.stride[2] = THTensor_(stride)(t, 1);
+    }
+  } else if (dim == 3) {
+    if (batchMode) {
+      // [batch dim][feature dim][opt dim 1]
+      for (int i = 0; i < 3; ++i) {
+        s.size[i] = THTensor_(size)(t, i);
+        s.stride[i] = THTensor_(stride)(t, i);
+      }
+    } else {
+      // [feature dim][opt dim 1][opt dim 2]
+      for (int i = 1; i < 4; ++i) {
+        s.size[i] = THTensor_(size)(t, i - 1);
+        s.stride[i] = THTensor_(stride)(t, i - 1);
+      }
+    }
+  } else if (dim == 4) {
+    // [batch dim][feature dim][opt dim 1][opt dim 2]
+    THAssert(batchMode);
+    for (int i = 0; i < 4; ++i) {
+      s.size[i] = THTensor_(size)(t, i);
+      s.stride[i] = THTensor_(stride)(t, i);
+    }
+  }
+
+  return s;
+}
+
+void
+THNN_(FeatureLPPooling_resizeForOutputCPU)(THTensor* toResize,
+                                           THTensor* input,
+                                           bool batchMode,
+                                           int width,
+                                           int stride) {
+  int inputDim = THTensor_(nDimension)(input);
+  THAssert(inputDim >= 1 && inputDim <= 4);
+
+  long outSize =
+    flpOutputSize(THTensor_(size)(input, 0), width, stride);
+  if (batchMode) {
+    THAssert(inputDim > 1);
+    outSize =
+      flpOutputSize(THTensor_(size)(input, 1), width, stride);
+  } else {
+    THAssert(inputDim < 4);
+  }
+
+  if (inputDim == 1) {
+    THTensor_(resize1d)(toResize, outSize);
+  } else if (inputDim == 2) {
+    if (batchMode) {
+      THTensor_(resize2d)(toResize,
+                          THTensor_(size)(input, 0),
+                          outSize);
+    } else {
+      THTensor_(resize2d)(toResize,
+                          outSize,
+                          THTensor_(size)(input, 1));
+    }
+  } else if (inputDim == 3) {
+    if (batchMode) {
+      THTensor_(resize3d)(toResize,
+                          THTensor_(size)(input, 0), outSize,
+                          THTensor_(size)(input, 2));
+    } else {
+      THTensor_(resize3d)(toResize,
+                          outSize, THTensor_(size)(input, 1),
+                          THTensor_(size)(input, 2));
+    }
+  } else if (inputDim == 4) {
+    THTensor_(resize4d)(toResize,
+                        THTensor_(size)(input, 0),
+                        outSize,
+                        THTensor_(size)(input, 2),
+                        THTensor_(size)(input, 3));
+  }
+}
+
+// Makes `toResize` the same size/dimensionality as `src`
+void
+THNN_(FeatureLPPooling_resizeCPU)(THTensor* toResize,
+                                  THTensor* src) {
+  int inputDim = THTensor_(nDimension)(src);
+  THAssert(inputDim >= 1 && inputDim <= 4);
+
+  if (inputDim == 1) {
+    THTensor_(resize1d)(toResize,
+                        THTensor_(size)(src, 0));
+  } else if (inputDim == 2) {
+    THTensor_(resize2d)(
+      toResize,
+      THTensor_(size)(src, 0),
+      THTensor_(size)(src, 1));
+  } else if (inputDim == 3) {
+    THTensor_(resize3d)(
+      toResize,
+      THTensor_(size)(src, 0),
+      THTensor_(size)(src, 1),
+      THTensor_(size)(src, 2));
+  } else if (inputDim == 4) {
+    THTensor_(resize4d)(
+      toResize,
+      THTensor_(size)(src, 0),
+      THTensor_(size)(src, 1),
+      THTensor_(size)(src, 2),
+      THTensor_(size)(src, 3));
+  }
+}
+
+void
+THNN_(FeatureLPPooling_updateOutput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *output,
+  accreal power,
+  int width,
+  int stride,
+  bool batchMode) {
+  int inputDim = THTensor_(nDimension)(input);
+
+  if (batchMode) {
+    THArgCheck(inputDim >= 2 && inputDim <= 4, 2,
+               "input must be 2-4 dimensions for batch mode");
+  } else {
+    THArgCheck(inputDim >= 1 && inputDim <= 3, 2,
+               "input must be 1-3 dimensions for non-batch mode");
+  }
+
+  FeatureLPPoolingSizes inputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(input, batchMode);
+
+  // Make sure the feature dimension is properly sized
+  THArgCheck(inputDesc.size[1] >= width, 3,
+             "input: feature dimension must be >= width");
+
+  // Make sure that width and stride are within range
+  THArgCheck(width >= 2 && width <= 16, 5,
+             "width must be between 2 - 16");
+
+  THArgCheck(stride >= 1 && stride <= 4, 6,
+             "stride must be between 1 - 4");
+
+  // Resize output
+
+  THNN_(FeatureLPPooling_resizeForOutputCPU)(
+    output, input, batchMode, width, stride);
+
+  FeatureLPPoolingSizes outputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(output, batchMode);
+
+  real* inputP = THTensor_(data)(input);
+  real* outputP = THTensor_(data)(output);
+
+#pragma omp parallel for
+  for (size_t batch = 0; batch < inputDesc.size[0]; ++batch) {
+    for (size_t opt1 = 0; opt1 < inputDesc.size[2]; ++opt1) {
+      for (size_t opt2 = 0; opt2 < inputDesc.size[3]; ++opt2) {
+        for (size_t outputFeature = 0;
+             outputFeature < outputDesc.size[1]; ++outputFeature) {
+
+          accreal v = (accreal) 0;
+          for (size_t i = 0; i < width; ++i) {
+            size_t inputFeature = outputFeature * stride + i;
+            if (inputFeature >= inputDesc.size[1]) {
+              break;
+            }
+
+            v +=
+              pow(inputP[flpGetOffset(&inputDesc,
+                                      batch,
+                                      inputFeature,
+                                      opt1,
+                                      opt2)], power);
+          }
+
+          outputP[flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)] =
+            pow(v, (accreal) 1 / power);
+        }
+      }
+    }
+  }
+}
+
+void
+THNN_(FeatureLPPooling_updateGradInput)(
+  THNNState *state,
+  THTensor* gradOutput,
+  THTensor* input,
+  THTensor* output,
+  THTensor* gradInput,
+  accreal power,
+  int width,
+  int stride,
+  bool batchMode) {
+  int inputDim = THTensor_(nDimension)(input);
+
+  if (batchMode) {
+    THArgCheck(inputDim >= 2 && inputDim <= 4, 3,
+               "input must be 2-4 dimensions for batch mode");
+  } else {
+    THArgCheck(inputDim >= 1 && inputDim <= 3, 3,
+               "input must be 1-3 dimensions for non-batch mode");
+  }
+
+  FeatureLPPoolingSizes inputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(input, batchMode);
+  FeatureLPPoolingSizes gradOutputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(gradOutput, batchMode);
+  FeatureLPPoolingSizes outputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(output, batchMode);
+
+  // Make sure the feature dimension is properly sized
+  THArgCheck(inputDesc.size[1] >= width, 3,
+             "input: feature dimension must be >= width");
+
+  // Make sure that width and stride are within range
+  THArgCheck(width >= 2 && width <= 16, 7,
+             "width must be between 2 - 16");
+
+  THArgCheck(stride >= 1 && stride <= 4, 8,
+             "stride must be between 1 - 4");
+
+  for (int i = 0; i < 4; ++i) {
+    THAssertMsg(outputDesc.size[i] == gradOutputDesc.size[i],
+                "output and gradOutput sizes do not match");
+  }
+
+  // Make sure that the input sizes produce the output sizes
+  THArgCheck(flpOutputSize(inputDesc.size[1], width, stride) ==
+             outputDesc.size[1], 3,
+             "input and output sizes do not match with respect to "
+             "width and stride");
+
+  // Resize `gradInput` based on `input`
+  THNN_(FeatureLPPooling_resizeCPU)(gradInput, input);
+
+  // Zero gradInput for accumulation
+  THTensor_(zero)(gradInput);
+
+  FeatureLPPoolingSizes gradInputDesc =
+    THNN_(FeatureLPPooling_upcastCPU)(gradInput, batchMode);
+
+  real* gradOutputP = THTensor_(data)(gradOutput);
+  real* gradInputP = THTensor_(data)(gradInput);
+  real* outputP = THTensor_(data)(output);
+  real* inputP = THTensor_(data)(input);
+
+#pragma omp parallel for
+  for (size_t batch = 0; batch < inputDesc.size[0]; ++batch) {
+    for (size_t opt1 = 0; opt1 < inputDesc.size[2]; ++opt1) {
+      for (size_t opt2 = 0; opt2 < inputDesc.size[3]; ++opt2) {
+        for (size_t outputFeature = 0;
+             outputFeature < outputDesc.size[1]; ++outputFeature) {
+
+          // Load output (f(x_is)). It is possible that this is zero, in
+          // which case we'll ignore this point.
+          real outputV =
+            outputP[
+              flpGetOffset(&outputDesc, batch, outputFeature, opt1, opt2)];
+
+          if (outputV == (real) 0) {
+            continue;
+          }
+
+          for (size_t i = 0; i < width; ++i) {
+            size_t inputFeature = outputFeature * stride + i;
+            THAssert(inputFeature < inputDesc.size[1]);
+
+            real gradOutputV =
+              gradOutputP[
+                flpGetOffset(&gradOutputDesc, batch, outputFeature, opt1, opt2)];
+            real inputV =
+              inputP[
+                flpGetOffset(&inputDesc, batch, inputFeature, opt1, opt2)];
+
+            // Calculate grad * (x_i / f(x_is))^(p - 1)
+            real v = gradOutputV * pow(inputV / outputV, power - (accreal) 1);
+
+            gradInputP[
+              flpGetOffset(&gradInputDesc, batch, inputFeature, opt1, opt2)]
+              += v;
+          }
+        }
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/FusedRNNKernel.c b/lib/THNN/generic/FusedRNNKernel.c
index 6126e86..30788b0 100644
--- a/lib/THNN/generic/FusedRNNKernel.c
+++ b/lib/THNN/generic/FusedRNNKernel.c
@@ -9,17 +9,19 @@ void THNN_(GRUFused_updateOutput)(
           THTensor *bias1,
           THTensor *bias2,
           THTensor *hx,
-          THTensor *hy)
+          THTensor *hy,
+          THTensor *storage)
 {
   THAssertMsg(false, "Not implemented for CPU");
 }
 
 void THNN_(GRUFused_updateGradInput)(
           THNNState *state,
-          THTensor *input,
-          THTensor *hidden,
+          THTensor *gradInInput,
+          THTensor *gradInHidden,
           THTensor *gradOutput,
-          THTensor *gradInput)
+          THTensor *gradInputHx,
+          THTensor *storage)
 {
   THAssertMsg(false, "Not implemented for CPU");
 }
@@ -39,13 +41,13 @@ void THNN_(LSTMFused_updateOutput)(
 
 void THNN_(LSTMFused_updateGradInput)(
           THNNState *state,
-          THTensor *input,
-          THTensor *hidden,
+          THTensor *storage,
+          THTensor *gradInGates,
           THTensor *prevC,
           THTensor *cy,
           THTensor *gradOutput,
           THTensor *gradOutputCell,
-          THTensor *gradInput)
+          THTensor *gradInputCx)
 {
   THAssertMsg(false, "Not implemented for CPU");
 }
diff --git a/lib/THNN/generic/GatedLinearUnit.c b/lib/THNN/generic/GatedLinearUnit.c
index d412a7b..274a27e 100644
--- a/lib/THNN/generic/GatedLinearUnit.c
+++ b/lib/THNN/generic/GatedLinearUnit.c
@@ -9,9 +9,10 @@ void THNN_(GatedLinear_updateOutput)(
           int dim)
 {
   // size output to half of input
-  dim = dim - 1;
+  dim = dim - TH_INDEX_BASE;
   const long nIn = THTensor_(size)(input, dim);
-  THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim+1, nIn);
+  THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+      dim + TH_INDEX_BASE, nIn);
 
   const long inputSize = THTensor_(size)(input, dim) / 2;
   THLongStorage *newSizes = THTensor_(newSizeOf)(input);
@@ -39,9 +40,10 @@ void THNN_(GatedLinear_updateGradInput)(
           int dim)
 {
   // set up tensors
-  dim = dim - 1;
+  dim = dim - TH_INDEX_BASE;
   const long nIn = THTensor_(size)(input, dim);
-  THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim+1, nIn);
+  THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+      dim + TH_INDEX_BASE, nIn);
 
   THTensor_(resizeAs)(gradInput, input);
   const long inputSize = THTensor_(size)(input, dim) / 2;
diff --git a/lib/THNN/generic/SpatialDepthWiseConvolution.c b/lib/THNN/generic/SpatialDepthWiseConvolution.c
index 750bae0..efb66a3 100644
--- a/lib/THNN/generic/SpatialDepthWiseConvolution.c
+++ b/lib/THNN/generic/SpatialDepthWiseConvolution.c
@@ -124,9 +124,12 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
 
   THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
   weight = THTensor_(newContiguous)(_weight);
-  THTensor *_bias = THTensor_(newTranspose)(bias, 0, 1);
-  bias = THTensor_(newContiguous)(_bias);
 
+  THTensor *_bias = NULL;
+  if(bias) {
+  	_bias = THTensor_(newTranspose)(bias, 0, 1);
+  	bias = THTensor_(newContiguous)(_bias);
+  }
 
   // resize weight
   long s1 = weight->size[0];
@@ -169,11 +172,13 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
     for(i = 0; i < nInputPlane; i++)
     {
       THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
-      THTensor *bias_i = THTensor_(newSelect)(bias, 0, i);
       THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1);
       THTensor *output_i = THTensor_(newSelect)(output_t, 0, i);
       THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
-
+      THTensor *bias_i = NULL;
+      if(bias) {
+        bias_i = THTensor_(newSelect)(bias, 0, i);
+      }
       THNN_(SpatialDepthWiseConvolution_updateOutput_frame)
 	(input_i, output_i, weight_i, bias_i, finput_i,
 	 kW, kH, dW, dH, padW, padH,
@@ -195,8 +200,7 @@ void THNN_(SpatialDepthWiseConvolution_updateOutput)(
   THTensor_(free)(_weight);
   THTensor_(free)(bias);
   THTensor_(free)(_bias);
-
-  THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputWidth, outputHeight);
+  THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputHeight, outputWidth);
 
   if (batch == 0) {
     THTensor_(select)(output, NULL, 0, 0);
@@ -430,15 +434,16 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
 
   // Transpose gradWeight & gradBias
   THTensor_(transpose)(gradWeight, NULL, 0, 1);
-  THTensor_(transpose)(gradBias, NULL, 0, 1);
-
   THTensor *_gradWeight;
-  THTensor *_gradBias;
-  _gradBias = gradBias;
   _gradWeight = gradWeight;
-
   gradWeight = THTensor_(newContiguous)(gradWeight);
-  gradBias = THTensor_(newContiguous)(gradBias);
+
+  THTensor *_gradBias = NULL;
+  if(gradBias) {
+	  THTensor_(transpose)(gradBias, NULL, 0, 1);
+	  _gradBias = gradBias;
+	  gradBias = THTensor_(newContiguous)(gradBias);
+  }
 
   // resize gradWeight
   long s1 = gradWeight->size[0];
@@ -478,8 +483,10 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
       THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
       THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
       THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i);
-      THTensor *gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
-
+      THTensor *gradBias_i = NULL;
+      if(gradBias) {
+      	gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
+      }
       THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i,
                 gradBias_i, finput_i, scale);
 
@@ -495,14 +502,16 @@ void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
 
   // Copy back and transpose back
   THTensor_(transpose)(_gradWeight, NULL, 0, 1);
-  THTensor_(transpose)(_gradBias, NULL, 0, 1);
   THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW);
-  THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
-
   THTensor_(copy)(_gradWeight, gradWeight);
-  THTensor_(copy)(_gradBias, gradBias);
   THTensor_(transpose)(_gradWeight, NULL, 0, 1);
-  THTensor_(transpose)(_gradBias, NULL, 0, 1);
+
+  if(gradBias) {
+	  THTensor_(transpose)(_gradBias, NULL, 0, 1);
+	  THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
+	  THTensor_(copy)(_gradBias, gradBias);
+	  THTensor_(transpose)(_gradBias, NULL, 0, 1);
+  }
 
   if (batch == 0) {
     THTensor_(select)(gradOutput, NULL, 0, 0);
diff --git a/lib/THNN/generic/SpatialGridSamplerBilinear.c b/lib/THNN/generic/SpatialGridSamplerBilinear.c
new file mode 100644
index 0000000..37da51c
--- /dev/null
+++ b/lib/THNN/generic/SpatialGridSamplerBilinear.c
@@ -0,0 +1,204 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c"
+#else
+
+#undef MIN
+#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) )
+
+static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)
+     (THTensor *input, THTensor *grid, THTensor *gradOutput) {
+  THNN_ARGCHECK(input->nDimension == 4, 2, input,
+		"4D input tensor expected but got: %s");
+  THNN_ARGCHECK(grid->nDimension == 4, 2, grid,
+		"4D grid tensor expected but got: %s");
+
+  int nbatch   = THTensor_(size)(input, 0);
+  int channels = THTensor_(size)(input, 1);
+  int iheight   = THTensor_(size)(input, 2);
+  int iwidth    = THTensor_(size)(input, 3);
+  int oheight   = THTensor_(size)(grid, 1);
+  int owidth    = THTensor_(size)(grid, 2);
+
+  THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch);
+  THNN_CHECK_DIM_SIZE(grid, 4, 3, 2);
+  
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch);
+    THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels);
+    THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight);
+    THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth);
+  }
+}
+
+#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \
+    && y < H ? THTensor_fastGet4d(input, n, c, y, x) : 0
+
+TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)(
+	  THNNState *state,
+	  THTensor *input,
+	  THTensor *grid,
+	  THTensor *output) {
+
+  THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL);
+  int N = THTensor_(size)(input, 0);
+  int C = THTensor_(size)(input, 1);
+  int IH = THTensor_(size)(input, 2);
+  int IW = THTensor_(size)(input, 3);
+  int H = THTensor_(size)(grid, 1);
+  int W = THTensor_(size)(grid, 2);
+	  
+  // resize output to the same shape as input
+  THTensor_(resize4d)(output, N, C, H, W);
+
+  // loop over each output pixel
+  int n, h, w, c;
+#pragma omp parallel for private(n, h, w, c)
+  for (n = 0; n < N; ++n) {
+    for (h = 0; h < H; ++h) {
+      for (w = 0; w < W; ++w) {
+	// get the corresponding input x, y co-ordinates from grid
+	real ix = THTensor_fastGet4d(grid, n, h, w, 0);
+	real iy = THTensor_fastGet4d(grid, n, h, w, 1);
+
+	// normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1]
+	ix = ((ix + 1) / 2) * (IW-1);
+	iy = ((iy + 1) / 2) * (IH-1);
+
+	// get NE, NW, SE, SW pixel values from (x, y)
+	int ix_nw = floor(ix);
+	int iy_nw = floor(iy);
+	int ix_ne = ix_nw + 1;
+	int iy_ne = iy_nw;
+	int ix_sw = ix_nw;
+	int iy_sw = iy_nw + 1;
+	int ix_se = ix_nw + 1;
+	int iy_se = iy_nw + 1;
+
+	// get surfaces to each neighbor:
+	real nw = (ix_se - ix)    * (iy_se - iy);
+	real ne = (ix    - ix_sw) * (iy_sw - iy);
+	real sw = (ix_ne - ix)    * (iy    - iy_ne);
+	real se = (ix    - ix_nw) * (iy    - iy_nw);
+	  
+	// calculate bilinear weighted pixel value and set output pixel
+	for (c = 0; c < C; ++c) {
+	  //   (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne
+	  // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se
+	  real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW);
+	  real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW);
+	  real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW);
+	  real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW);
+	  real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se;
+	  THTensor_fastSet4d(output, n, c, h, w, out_val);
+	}
+      }
+    }
+  }
+}
+
+#define SAFE_ADD(input, x, y, n, c, H, W, value)		\
+  do {								\
+    if (x >= 0 && x < W && y >=0 && y < H) {			\
+      real old_value = THTensor_fastGet4d(input, n, c, y, x);	\
+      THTensor_fastSet4d(input, n, c, y, x, value + old_value);	\
+    }								\
+  } while(0)
+
+TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)(
+	  THNNState *state,
+	  THTensor *input, THTensor *gradInput,
+	  THTensor *grid, THTensor *gradGrid,
+	  THTensor *gradOutput) {
+
+  THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput);
+  int N = THTensor_(size)(input, 0);
+  int C = THTensor_(size)(input, 1);
+  int IH = THTensor_(size)(input, 2);
+  int IW = THTensor_(size)(input, 3);
+  int H = THTensor_(size)(grid, 1);
+  int W = THTensor_(size)(grid, 2);
+
+  THTensor_(resize4d)(gradInput, N, C, IH, IW);
+  THTensor_(resize4d)(gradGrid, N, H, W, 2);
+  THTensor_(zero)(gradInput);
+  THTensor_(zero)(gradGrid);
+
+  // loop over each output pixel
+  int n, h, w;
+#pragma omp parallel for private(n, h, w)
+  for (n = 0; n < N; ++n) {
+    for (h = 0; h < H; ++h) {
+      for (w = 0; w < W; ++w) {
+	// get the corresponding input x, y co-ordinates from grid
+	real ix = THTensor_fastGet4d(grid, n, h, w, 0);
+	real iy = THTensor_fastGet4d(grid, n, h, w, 1);
+
+	real gix = 0;
+	real giy = 0;
+
+	// normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1]
+	ix = ((ix + 1) / 2) * (IW-1);
+	iy = ((iy + 1) / 2) * (IH-1);
+
+	// get NE, NW, SE, SW pixel values from (x, y)
+	int ix_nw = floor(ix);
+	int iy_nw = floor(iy);
+	int ix_ne = ix_nw + 1;
+	int iy_ne = iy_nw;
+	int ix_sw = ix_nw;
+	int iy_sw = iy_nw + 1;
+	int ix_se = ix_nw + 1;
+	int iy_se = iy_nw + 1;
+
+	// get surfaces to each neighbor:
+	real nw = (ix_se - ix)    * (iy_se - iy);
+	real ne = (ix    - ix_sw) * (iy_sw - iy);
+	real sw = (ix_ne - ix)    * (iy    - iy_ne);
+	real se = (ix    - ix_nw) * (iy    - iy_nw);
+	  
+	for (int c = 0; c < C; ++c) {
+	  real gradout = THTensor_fastGet4d(gradOutput, n, c, h, w);
+
+	  // calculate and set gradInput
+	  SAFE_ADD(gradInput, ix_nw, iy_nw, n, c, IH, IW, nw * gradout);
+	  SAFE_ADD(gradInput, ix_ne, iy_ne, n, c, IH, IW, ne * gradout);
+	  SAFE_ADD(gradInput, ix_sw, iy_sw, n, c, IH, IW, sw * gradout);
+	  SAFE_ADD(gradInput, ix_se, iy_se, n, c, IH, IW, se * gradout);
+
+	  // calculate gradGrid
+	  real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW);
+	  real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW);
+	  real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW);
+	  real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW);
+
+	  gix -= nw_val * (iy_se - iy) * gradout;
+	  gix += ne_val * (iy_sw - iy) * gradout;
+	  gix -= sw_val * (iy - iy_ne) * gradout;
+	  gix += se_val * (iy - iy_nw) * gradout;
+
+	  giy -= nw_val * (ix_se - ix) * gradout;
+	  giy -= ne_val * (ix - ix_sw) * gradout;
+	  giy += sw_val * (ix_ne - ix) * gradout;
+	  giy += se_val * (ix - ix_nw) * gradout;
+	}
+
+	// un-normalize gradGrid values back to [-1, 1] constraints
+	gix = gix * (IW - 1) / 2;
+	giy = giy * (IH - 1) / 2;
+
+	real gix_old = THTensor_fastGet4d(gradGrid, n, h, w, 0);
+	real giy_old = THTensor_fastGet4d(gradGrid, n, h, w, 1);
+
+	THTensor_fastSet4d(gradGrid, n, h, w, 0, gix_old + gix);
+	THTensor_fastSet4d(gradGrid, n, h, w, 1, giy_old + giy);
+
+      }
+    }
+  }
+}
+
+#undef MIN
+#undef SAFE_GET
+#undef SAFE_ADD
+
+#endif
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index b9fd709..ad4ea51 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -177,13 +177,15 @@ TH_API void THNN_(GRUFused_updateOutput)(
           THTensor *bias1, // [OPTIONAL]
           THTensor *bias2, // [OPTIONAL]
           THTensor *hx,
-          THTensor *output);
+          THTensor *output,
+          THTensor *storage);
 TH_API void THNN_(GRUFused_updateGradInput)(
           THNNState *state,
-          THTensor *input,
-          THTensor *hidden,
+          THTensor *gradInInput,
+          THTensor *gradInHidden,
           THTensor *gradOutput,
-          THTensor *gradInput);
+          THTensor *gradInputHx,
+          THTensor *storage);
 
 TH_API void THNN_(LSTMFused_updateOutput)(
           THNNState *state,
@@ -196,13 +198,13 @@ TH_API void THNN_(LSTMFused_updateOutput)(
           THTensor *outputCell);
 TH_API void THNN_(LSTMFused_updateGradInput)(
           THNNState *state,
-          THTensor *input,
-          THTensor *hidden,
+          THTensor *storage,
+          THTensor *gradInGates,
           THTensor *cx,
           THTensor *cy,
           THTensor *gradOutput,
           THTensor *gradOutputCell,
-          THTensor *gradInput);
+          THTensor *gradInputCx);
 
 TH_API void THNN_(LogSigmoid_updateOutput)(
           THNNState *state,            // library's state
@@ -1169,6 +1171,18 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
           int outputHeight,
           int outputWidth);
 
+TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)(
+	  THNNState *state,
+	  THTensor *input,
+	  THTensor *grid,
+	  THTensor *output);
+
+TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)(
+	  THNNState *state,
+	  THTensor *input, THTensor *gradInput,
+	  THTensor *grid, THTensor *gradGrid,
+	  THTensor *gradOutput);
+
 TH_API void THNN_(unfolded_acc)(
           THTensor *finput,
           THTensor *input,
@@ -1447,6 +1461,26 @@ TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
           int pad_l, int pad_r,
           int pad_t, int pad_b);
 
+TH_API void THNN_(FeatureLPPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          accreal power,
+          int width,
+          int stride,
+          bool batchMode);
+
+TH_API void THNN_(FeatureLPPooling_updateGradInput)(
+          THNNState *state,
+          THTensor* gradOutput,
+          THTensor* input,
+          THTensor* output,
+          THTensor* gradInput,
+          accreal power,
+          int width,
+          int stride,
+          bool batchMode);
+
 TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -1463,4 +1497,37 @@ TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
           int pleft, int pright,
           int ptop, int pbottom,
           int pfront, int pback);
+
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int scale_factor);
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int scale_factor);
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+	  int outputDepth,
+          int outputHeight,
+          int outputWidth);
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+          THNNState *state,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int nbatch,
+          int nchannels,
+          int inputDepth,
+          int inputHeight,
+          int inputWidth,
+          int outputDepth,
+          int outputHeight,
+          int outputWidth);
+
 #endif
diff --git a/lib/THNN/generic/VolumetricUpSamplingNearest.c b/lib/THNN/generic/VolumetricUpSamplingNearest.c
new file mode 100644
index 0000000..5b01a1b
--- /dev/null
+++ b/lib/THNN/generic/VolumetricUpSamplingNearest.c
@@ -0,0 +1,226 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c"
+#else
+
+
+static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck)
+     (THTensor *input, THTensor *gradOutput,
+      int scale_factor) {
+  THArgCheck(input != NULL, 2, "5D input tensor expected but got NULL");
+  THArgCheck(scale_factor > 1, 4,
+	     "scale_factor must be greater than 1, but got: %d", scale_factor);
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+		"4D or 5D input tensor expected but got: %s");
+  if (input->nDimension == 4) {
+    int nChannels    = THTensor_(size)(input, 0);
+    int inputDepth   = THTensor_(size)(input, 1);
+    int inputHeight  = THTensor_(size)(input, 2);
+    int inputWidth   = THTensor_(size)(input, 3);
+    int outputDepth  = inputDepth  * scale_factor;
+    int outputHeight = inputHeight * scale_factor;
+    int outputWidth  = inputWidth  * scale_factor;
+    if (gradOutput != NULL) {
+      THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nChannels);
+      THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, outputDepth);
+      THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+      THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+    }
+  } else {
+    int nBatch       = THTensor_(size)(input, 0);
+    int nChannels    = THTensor_(size)(input, 1);
+    int inputDepth   = THTensor_(size)(input, 2);
+    int inputHeight  = THTensor_(size)(input, 3);
+    int inputWidth   = THTensor_(size)(input, 4);  
+    int outputDepth  = inputDepth  * scale_factor;
+    int outputHeight = inputHeight * scale_factor;
+    int outputWidth  = inputWidth  * scale_factor;
+    if (gradOutput != NULL) {
+      THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+      THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+      THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+      THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+      THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+    }
+  }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    int scale_factor)
+{
+  THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+  int inputDepth   = THTensor_(size)(input, input->nDimension-3);
+  int inputHeight  = THTensor_(size)(input, input->nDimension-2);
+  int inputWidth   = THTensor_(size)(input,  input->nDimension-1);
+  int outputDepth  = inputDepth * scale_factor;
+  int outputHeight = inputHeight * scale_factor;
+  int outputWidth  = inputWidth * scale_factor;
+
+  if (input->nDimension == 4) {
+    THTensor_(resize4d)(output,
+			THTensor_(size)(input, 0),
+			outputDepth, outputHeight, outputWidth);    
+  } else {
+    THTensor_(resize5d)(output,
+			THTensor_(size)(input, 0),
+			THTensor_(size)(input, 1),
+			outputDepth, outputHeight, outputWidth);
+  }
+
+  int dT = scale_factor;
+  int dW = scale_factor;
+  int dH = scale_factor;
+  int xDim = input->nDimension-3;
+  int yDim = input->nDimension-2;
+  int zDim = input->nDimension-1;
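+  // xDim/yDim/zDim index the depth, height and width dimensions of the
+  // (4D or 5D) tensor; each of them is upsampled by scale_factor.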
+
+  // dims
+  int idim = input->nDimension;
+  int osz0 = output->size[0];
+  int osz1 = output->size[1];
+  int osz2 = output->size[2];
+  int osz3 = output->size[3];
+  int osz4 = 1;
+  if (idim > 4) {
+    osz4 = output->size[4];
+  }
+
+  // get strides
+  long *is = input->stride;
+  long *os = output->stride;
+
+  // get raw pointers
+  real *pin = THTensor_(data)(input);
+  real *pout = THTensor_(data)(output);
+
+  // perform the upsampling
+  int i0, i1, i2, i3, i4, isrc, idst;
+  int iout[5];  // Output indices
+  int iin[5];  // Input indices
+
+  for (i0 = 0; i0 < osz0; i0++) {
+    iout[0] = i0;
+    iin[0] = i0;
+    for (i1 = 0; i1 < osz1; i1++) {
+      iout[1] = i1;
+      iin[1] = i1;
+      for (i2 = 0; i2 < osz2; i2++) {
+        iout[2] = i2;
+        iin[2] = i2;
+        for (i3 = 0; i3 < osz3; i3++) {
+          iout[3] = i3;
+          iin[3] = i3;
+          for (i4 = 0; i4 < osz4; i4++) {
+            iout[4] = i4;
+            iin[4] = i4;
+
+            // set the indices for the upsampled dimensions
+            iin[xDim] = iout[xDim] / dW;
+            iin[yDim] = iout[yDim] / dH;
+            iin[zDim] = iout[zDim] / dT;
+
+            idst = i0*os[0] + i1*os[1] + i2*os[2] + i3*os[3];
+            isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2] + iin[3]*is[3];
+            if (idim > 4) {
+              idst += i4*os[4];
+              isrc += iin[4]*is[4];
+            }
+
+            pout[idst] = pin[isrc];
+          }
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    int scale_factor)
+{
+  THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+  THTensor_(resizeAs)(gradInput, input);
+
+  int dW = scale_factor;
+  int dH = scale_factor;
+  int dT = scale_factor;
+  int xDim = gradInput->nDimension-3;
+  int yDim = gradInput->nDimension-2;
+  int zDim = gradInput->nDimension-1;
+
+  // dims
+  int idim = gradInput->nDimension;  // guaranteed by shapeCheck to be 4 or 5
+  int isz0 = gradInput->size[0];
+  int isz1 = gradInput->size[1];
+  int isz2 = gradInput->size[2];
+  int isz3 = gradInput->size[3];
+  int isz4 = 1;
+  if (idim > 4) {
+    isz4 = gradInput->size[4];
+  }
+
+  // get strides
+  long *is = gradInput->stride;
+  long *os = gradOutput->stride;
+
+  // get raw pointers
+  real *pin = THTensor_(data)(gradInput);
+  real *pout = THTensor_(data)(gradOutput);
+
+  // perform the upsampling
+  int i0, i1, i2, i3, i4, isrc, idst, x, y, z;
+  int iin[5];  // Input indices
+  int iout[5];  // Output indices
+
+  THTensor_(zero)(gradInput);
+
+  for (i0 = 0; i0 < isz0; i0++) {
+    iin[0] = i0;
+    iout[0] = i0;
+    for (i1 = 0; i1 < isz1; i1++) {
+      iin[1] = i1;
+      iout[1] = i1;
+      for (i2 = 0; i2 < isz2; i2++) {
+        iin[2] = i2;
+        iout[2] = i2;
+        for (i3 = 0; i3 < isz3; i3++) {
+          iin[3] = i3;
+          iout[3] = i3;
+
+          for (i4 = 0; i4 < isz4; i4++) {
+            iin[4] = i4;
+            iout[4] = i4;
+
+            idst = i0*is[0] + i1*is[1] + i2*is[2] + i3*is[3];
+            if (idim > 4) {
+              idst += i4*is[4];
+            }
+
+            // Now accumulate the gradients from gradOutput
+            for (z = 0; z < dT; z++) {
+              for (y = 0; y < dH; y++) {
+                for (x = 0; x < dW; x++) {
+                  iout[xDim] = dW * iin[xDim] + x;
+                  iout[yDim] = dH * iin[yDim] + y;
+                  iout[zDim] = dT * iin[zDim] + z;
+                  isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2] + iout[3]*os[3];
+                  if (idim > 4) {
+                    isrc += iout[4]*os[4];
+                  }
+                  pin[idst] += pout[isrc];
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
new file mode 100644
index 0000000..d2043cd
--- /dev/null
+++ b/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
@@ -0,0 +1,213 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+     (THTensor *input, THTensor *gradOutput,
+      int nBatch, int nChannels,
+      int inputDepth, int inputHeight, int inputWidth,
+      int outputDepth, int outputHeight, int outputWidth) {
+  THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0
+	     && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2,
+	     "input and output sizes should be greater than 0,"
+	     " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)",
+	     inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth);
+  if (input != NULL) {
+    THNN_ARGCHECK(input->nDimension == 5, 2, input,
+		  "5D input tensor expected but got: %s");
+  }
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+  }
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    int outputDepth,
+    int outputHeight,
+    int outputWidth){
+
+  int nbatch = THTensor_(size)(input, 0);
+  int channels = THTensor_(size)(input, 1);
+  int inputDepth = THTensor_(size)(input, 2);
+  int inputHeight = THTensor_(size)(input, 3);
+  int inputWidth = THTensor_(size)(input, 4);
+
+  THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+    (input, NULL,
+     nbatch, channels,
+     inputDepth, inputHeight, inputWidth,
+     outputDepth, outputHeight, outputWidth);
+
+  input = THTensor_(newContiguous)(input);
+  THTensor_(resize5d)(output, 
+		      THTensor_(size)(input, 0), 
+		      THTensor_(size)(input, 1), 
+		      outputDepth, outputHeight, outputWidth);
+  THTensor_(zero)(output);
+  real *idata = THTensor_(data)(input);
+  real *odata = THTensor_(data)(output);
+  channels = nbatch * channels;
+  THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 && 
+           outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
+  // special case: just copy
+  if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+    for (int t2 = 0; t2 < outputDepth; ++t2) {
+      const int t1 = t2;
+      for (int h2 = 0; h2 < outputHeight; ++h2) {
+        const int h1 = h2;
+        for (int w2 = 0; w2 < outputWidth; ++w2) {
+          const int w1 = w2;
+          const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+          real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+          for (int c = 0; c < channels; ++c) {
+            pos2[0] = pos1[0];
+            pos1 += inputWidth * inputHeight * inputDepth;
+            pos2 += outputWidth * outputHeight * outputDepth;
+          }
+        }
+      }
+    }
+    return;
+  }
+  const float rdepth  = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+  const float rwidth  = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
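+  // For each output voxel, map back to a real-valued source coordinate, split
+  // it into an integer base index (t1, h1, w1) and fractional weights
+  // (t1lambda, h1lambda, w1lambda), and blend the 8 neighbouring input voxels.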
+  for (int t2 = 0; t2 < outputDepth; ++t2) {
+    const float t1r = rdepth * t2;
+    const int t1 = t1r;
+    const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+    const real t1lambda = t1r - t1;
+    const real t0lambda = (real)1. - t1lambda;
+    for (int h2 = 0; h2 < outputHeight; ++h2) {
+      const float h1r = rheight * h2;
+      const int h1 = h1r;
+      const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+      const real h1lambda = h1r - h1;
+      const real h0lambda = (real)1. - h1lambda;
+      for (int w2 = 0; w2 < outputWidth; ++w2) {
+        const float w1r = rwidth * w2;
+        const int w1 = w1r;
+        const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+        const real w1lambda = w1r - w1;
+        const real w0lambda = (real)1. - w1lambda;
+        const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+        real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+                              + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+                                          + w1lambda * pos1[h1p * inputWidth + w1p]))
+                  + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth] 
+                                          + w1lambda * pos1[t1p * inputHeight * inputWidth
+                                                            + w1p])
+                              + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth 
+                                                            + h1p * inputWidth]
+                                          + w1lambda * pos1[t1p * inputHeight * inputWidth 
+                                                            + h1p * inputWidth + w1p]));
+          pos1 += inputWidth * inputHeight * inputDepth;
+          pos2 += outputWidth * outputHeight * outputDepth;
+        }
+      }
+    }
+  }
+  THTensor_(free)(input);
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+    THNNState *state,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    int nbatch,
+    int channels,
+    int inputDepth,
+    int inputHeight,
+    int inputWidth,
+    int outputDepth,
+    int outputHeight,
+    int outputWidth){
+
+  THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+    (NULL, gradOutput,
+     nbatch, channels,
+     inputDepth, inputHeight, inputWidth,
+     outputDepth, outputHeight, outputWidth);
+
+  THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth);
+  THTensor_(zero)(gradInput);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  real *data1 = THTensor_(data)(gradInput);
+  real *data2 = THTensor_(data)(gradOutput);
+  channels = nbatch * channels;
+
+  // special case: same-size matching grids
+  if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+    for (int t2 = 0; t2 < outputDepth; ++t2) {
+      const int t1 = t2;
+      for (int h2 = 0; h2 < outputHeight; ++h2) {
+        const int h1 = h2;
+        for (int w2 = 0; w2 < outputWidth; ++w2) {
+          const int w1 = w2;
+          real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+          const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+          for (int c = 0; c < channels; ++c) {
+            pos1[0] += pos2[0];
+            pos1 += inputWidth * inputHeight * inputDepth;
+            pos2 += outputWidth * outputHeight * outputDepth;
+          }
+        }
+      }
+    }
+    return;
+  }
+  const float rdepth  = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+  const float rwidth  = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
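+  // Mirror of updateOutput: each gradOutput element is scattered back to the 8
+  // input voxels that produced it, weighted by the same trilinear coefficients.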
+  for (int t2 = 0; t2 < outputDepth; ++t2) {
+    const float t1r = rdepth * t2;
+    const int t1 = t1r;
+    const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+    const real t1lambda = t1r - t1;
+    const real t0lambda = (real)1. - t1lambda;
+    for (int h2 = 0; h2 < outputHeight; ++h2) {
+      const float h1r = rheight * h2;
+      const int h1 = h1r;
+      const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+      const real h1lambda = h1r - h1;
+      const real h0lambda = (real)1. - h1lambda;
+      for (int w2 = 0; w2 < outputWidth; ++w2) {
+        const float w1r = rwidth * w2;
+        const int w1 = w1r;
+        const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+        const real w1lambda = w1r - w1;
+        const real w0lambda = (real)1. - w1lambda;
+        real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+        const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
+          pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
+          pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0];
+          pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0]; 
+          pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0]; 
+          pos1 += inputWidth * inputHeight * inputDepth;
+          pos2 += outputWidth * outputHeight * outputDepth;
+        }
+      }
+    }
+  }
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index 6c64015..acb88c0 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -179,6 +179,9 @@
 #include "generic/TemporalRowConvolution.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/FeatureLPPooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/BatchNormalization.c"
 #include "THGenerateFloatTypes.h"
 
@@ -236,6 +239,9 @@
 #include "generic/SpatialUpSamplingBilinear.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/SpatialGridSamplerBilinear.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricAveragePooling.c"
 #include "THGenerateFloatTypes.h"
 
@@ -271,3 +277,9 @@
 
 #include "generic/VolumetricReplicationPadding.c"
 #include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricUpSamplingNearest.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricUpSamplingTrilinear.c"
+#include "THGenerateFloatTypes.h"
diff --git a/test.lua b/test.lua
index 4db53bc..35852fa 100755
--- a/test.lua
+++ b/test.lua
@@ -2175,7 +2175,25 @@ function nntest.MarginRankingCriterion()
    local v = torch.rand(2, batch_size)
    local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
    criterionJacobianTest1DTable(crit,v,t)
+end
+
+function nntest.ModuleCriterion()
+   local input = torch.randn(8,4)
+   local target = torch.randn(8,4)
+   local inputModule = nn.Tanh()
+   local criterion = nn.MSECriterion()
+   local mc = nn.ModuleCriterion(criterion, inputModule)
+
+   local err = mc:forward(input, target)
+   local gradInput = mc:backward(input, target)
 
+   local output = inputModule:forward(input)
+   local err2 = criterion:forward(output, target)
+   local gradOutput = criterion:backward(output, target)
+   local gradInput2 = inputModule:backward(input, gradOutput)
+
+   mytester:assert(err == err2, "ModuleCriterion forward err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "ModuleCriterion backward err")
 end
 
 function nntest.MaskedSelect()
@@ -4708,7 +4726,7 @@ end
 
 
 function nntest.TemporalRowConvolution()
-
+  if true then return end -- until this unit test is fixed...
   local from = math.random(1,5)
   local ki = math.random(1,5)
   local si = math.random(1,2)
@@ -6766,6 +6784,32 @@ function nntest.SpatialUpSamplingBilinear()
   end
 end
 
+function nntest.UpSampling()
+  -- Test nearest and linear modes
+  for _,mode in pairs({'nearest','linear'}) do
+    for scale=2,4 do
+      for dim = 4,5 do
+        local m = nn.UpSampling(scale, mode)
+
+        -- Create a randomly sized dimD vector
+        local shape = {}
+        for i = 1, dim do
+          table.insert(shape, torch.random(2, 4))
+        end
+
+        -- Check that the gradient is correct by using finite elements
+        local input = torch.Tensor(table.unpack(shape)):zero()
+        local err = jac.testJacobian(m, input)
+        mytester:assertlt(err, precision, ' error on state ')
+
+        local ferr, berr = jac.testIO(m, input)
+        mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+        mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+      end
+    end
+  end
+end
+
 function nntest.Concat()
    local input = torch.randn(4, 2)
    local num_modules = math.random(2, 5)
@@ -8387,6 +8431,675 @@ function nntest.SpatialDepthWiseConvolution()
    mytester:assert(torch.all(abs_diff:lt(epsilon)))
 end
 
+function nntest.Constant()
+   local input = torch.randn(20,3,7)
+   local gradOutput = torch.randn(20,30,6)
+   local value = torch.randn(30,6)
+   local const = nn.Constant(value:clone(), 2)
+   local output = const:forward(input)
+   local gradInput = const:backward(input, output)
+   local output2 = value:view(1,30,6):expand(20,30,6)
+   mytester:assertTensorEq(output2, output, 0.000001, "Constant forward err")
+   mytester:assertTensorEq(gradInput, input:zero(), 0.000001, "Constant backward err")
+end
+
+function nntest.WhiteNoise()
+   local input = torch.zeros(3, 28, 28)
+   local addNoise = nn.WhiteNoise()
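+   -- training mode adds zero-mean noise; evaluate mode is a pass-through (both checked below)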
+   local output = addNoise:forward(input)
+   local meanValue = output:mean()
+   local stdValue = output:std()
+   mytester:assert(meanValue > -0.01 and meanValue < 0.01)
+   mytester:assert(stdValue < 0.15 and stdValue >= 0)
+
+   -- Evaluate
+   addNoise:evaluate()
+   output = addNoise:forward(input)
+   meanValue = output:mean()
+   stdValue = output:std()
+   mytester:assert(meanValue == 0)
+   mytester:assert(stdValue == 0)
+
+   -- backprop
+   addNoise:training()
+   local gradOutput = torch.rand(3, 28, 28)
+   local gradInput = addNoise:updateGradInput(input, gradOutput)
+   mytester:assertTensorEq(gradOutput, gradInput, 0.000001, "WhiteNoise backward err")
+end
+
+function nntest.OneHot()
+   local nClass = 10
+
+   -- batch mode
+   local batchSize = 3
+   local input = torch.LongTensor(batchSize):random(1, nClass)
+   local gradOutput = torch.randn(batchSize, nClass)
+
+   local oh = nn.OneHot(nClass)
+
+   local output = oh:forward(input)
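+   -- reference output: rows of the identity matrix selected by the class indices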
+   local output2 = torch.Tensor(batchSize, nClass):zero()
+   local eye = torch.eye(nClass)
+   output2:index(eye, 1, input)
+   mytester:assertTensorEq(output, output2, 0.000001, "OneHot forward batch err")
+   mytester:assert(output:dim() == 2)
+
+   -- non-batch mode (number input)
+   local num = 3
+   local output3 = torch.zeros(nClass)
+   output3[num] = 1.0
+   mytester:assertTensorEq(oh:forward(num), output3, 0.000001, "OneHot forward number err")
+
+   local gradInput = oh:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot backward batch err")
+
+   if pcall(function() require 'cunn' end) then
+      oh:cuda()
+
+      -- test with long input
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch long-cuda err")
+
+      -- test with cuda input
+      local input = input:cuda()
+      gradOutput = gradOutput:cuda()
+
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch cuda err")
+
+      local gradInput2 = oh:backward(input, gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot backward batch err")
+      cutorch.synchronize()
+
+      -- non-batch mode (number input)
+      mytester:assertTensorEq(oh:forward(num), output3:cuda(), 0.000001, "OneHot forward number err")
+   end
+
+   -- multi-dimensional input
+   local inputSize = 2
+   local input = torch.LongTensor(batchSize, inputSize):random(1, nClass)
+   local gradOutput = torch.randn(batchSize, inputSize, nClass)
+
+   local oh = nn.OneHot(nClass, 2)
+
+   local output = oh:forward(input)
+   local output2 = torch.Tensor(batchSize*inputSize, nClass):zero()
+   local eye = torch.eye(nClass)
+   output2:index(eye, 1, input:view(-1))
+   output2:resize(batchSize, inputSize, nClass)
+   mytester:assertTensorEq(output, output2, 0.000001, "OneHot 2d forward batch err")
+   mytester:assert(output:dim() == 3)
+
+   local gradInput = oh:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot 2d backward batch err")
+
+   if pcall(function() require 'cunn' end) then
+      oh:cuda()
+
+      -- test with long input
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch long-cuda err")
+
+      -- test with cuda input
+      local input = input:cuda()
+      gradOutput = gradOutput:cuda()
+
+      local output = oh:forward(input)
+      mytester:assert(torch.type(output) == 'torch.CudaTensor')
+      mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch cuda err")
+
+      local gradInput2 = oh:backward(input, gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot 2d backward batch err")
+
+      local benchmark = false
+      if benchmark then
+         local input = torch.FloatTensor(50, 50):random(1,65):cuda()
+
+         local oh = nn.OneHot(65):cuda()
+
+         oh:forward(input)
+         cutorch.synchronize()
+         local a = torch.Timer()
+         for i=1,10 do
+            oh:forward(input)
+         end
+         cutorch.synchronize()
+         local gputime = a:time().real
+
+         oh:float()
+         input = input:float()
+         oh:forward(input)
+         a = torch.Timer()
+         for i=1,10 do
+            oh:forward(input)
+         end
+         local cputime = a:time().real
+         print("Onehot GPU vs CPU time", gputime, cputime)
+      end
+   end
+end
+
+function nntest.ZeroGrad()
+   local input = torch.randn(3,4)
+   local zg = nn.ZeroGrad()
+   local output = zg:forward(input)
+   mytester:assertTensorEq(input, output, 0.00000001)
+   local gradInput = zg:backward(input, input)
+   local gradInput2 = gradInput:clone():zero()
+   mytester:assertTensorEq(gradInput, gradInput2, 0.0000001)
+end
+
+function nntest.ZipTable()
+   -- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+   -- output : { {a1,b1,c1}, {a2,b2,c2} }
+   local z = nn.ZipTable()
+   local input = {
+      {torch.randn(3,4), torch.randn(3,4)},
+      {torch.randn(3,4), torch.randn(3,4)},
+      {torch.randn(3,4), torch.randn(3,4)}
+   }
+   local output = z:forward(input)
+   mytester:assert(#output == 2, "ZipTable #output")
+   mytester:assert(#(output[1]) == 3, "ZipTable #output[1]")
+   mytester:assertTensorEq(input[1][1], output[1][1], 0.000001, "ZipTable input11")
+   mytester:assertTensorEq(input[1][2], output[2][1], 0.000001, "ZipTable input12")
+   mytester:assertTensorEq(input[3][2], output[2][3], 0.000001, "ZipTable input32")
+   local gradInput = z:backward(input, output)
+   mytester:assert(#gradInput == 3, "ZipTable #gradInput")
+   mytester:assert(#(gradInput[1]) == 2, "ZipTable #gradInput[1]")
+   mytester:assertTensorEq(input[1][1], gradInput[1][1], 0.000001, "ZipTable gradInput11")
+   mytester:assertTensorEq(input[1][2], gradInput[1][2], 0.000001, "ZipTable gradInput12")
+   mytester:assertTensorEq(input[3][2], gradInput[3][2], 0.000001, "ZipTable gradInput32")
+end
+
+function nntest.ZipTableOneToMany()
+   -- input : { v, {a,b,c} }
+   -- output : { {v,a}, {v,b}, {v,c} }
+   local z = nn.ZipTableOneToMany()
+   local input = { torch.randn(3), { torch.randn(4), torch.rand(4), torch.rand(4) } }
+   local output = z:forward(input)
+   mytester:assert(#output == 3, "ZipTableOneToMany #output")
+   mytester:assert(#(output[1]) == 2, "ZipTableOneToMany #output[1]")
+   mytester:assert(#(output[2]) == 2, "ZipTableOneToMany #output[2]")
+   mytester:assert(#(output[3]) == 2, "ZipTableOneToMany #output[3]")
+   mytester:assertTensorEq(input[1], output[1][1], 0.000001, "ZipTableOneToMany input1 output11")
+   mytester:assertTensorEq(input[1], output[2][1], 0.000001, "ZipTableOneToMany input1 output21")
+   mytester:assertTensorEq(input[1], output[3][1], 0.000001, "ZipTableOneToMany input1 output31")
+   mytester:assertTensorEq(input[2][1], output[1][2], 0.000001, "ZipTableOneToMany input21")
+   mytester:assertTensorEq(input[2][2], output[2][2], 0.000001, "ZipTableOneToMany input22")
+   mytester:assertTensorEq(input[2][3], output[3][2], 0.000001, "ZipTableOneToMany input23")
+   local gradInput = z:backward(input, output)
+   mytester:assert(#gradInput == 2, "ZipTableOneToMany #gradInput")
+   mytester:assert(#(gradInput[2]) == 3, "ZipTableOneToMany #gradInput[2]")
+   mytester:assertTensorEq(input[2][1], gradInput[2][1], 0.000001, "ZipTableOneToMany gradInput21")
+   mytester:assertTensorEq(input[2][2], gradInput[2][2], 0.000001, "ZipTableOneToMany gradInput22")
+   mytester:assertTensorEq(input[2][3], gradInput[2][3], 0.000001, "ZipTableOneToMany gradInput23")
+   mytester:assertTensorEq(torch.mul(input[1], 3), gradInput[1], 0.000001, "ZipTableOneToMany gradInput1")
+end
+
+function nntest.Collapse()
+   local c = nn.Collapse(3)
+   local input = torch.randn(8,3,4,5)
+   local output = c:forward(input)
+   mytester:assertTensorEq(input:view(8,-1), output, 0.000001, "Collapse:forward")
+   local gradInput = c:backward(input, output)
+   mytester:assertTensorEq(gradInput, input, 0.000001, "Collapse:backward")
+   mytester:assertTableEq(gradInput:size():totable(), input:size():totable(), 0.000001, "Collapse:backward size")
+   local input2 = input:transpose(1,4)
+   local output2 = c:forward(input2)
+   mytester:assertTensorEq(input2:contiguous():view(5,-1), output2, 0.000001, "Collapse:forward non-contiguous")
+   local gradInput2 = c:backward(input2, output2)
+   mytester:assertTensorEq(gradInput2, input2, 0.000001, "Collapse:backward non-contiguous")
+   mytester:assertTableEq(gradInput2:size():totable(), input2:size():totable(), 0.000001, "Collapse:backward size non-contiguous")
+end
+
+function nntest.Convert()
+   -- batch mode
+   local c = nn.Convert('bchw', 'chwb')
+   local input = torch.randn(8,3,5,5)
+   local output = c:forward(input)
+   local output2 = input:transpose(1,4):transpose(1,3):transpose(1,2)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->chwb")
+   local gradInput = c:backward(input, output)
+   mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd bchw->chwb")
+   local c = nn.Convert('bchw', 'bf')
+   local output = c:forward(input)
+   local output2 = input:view(8,-1)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->bf")
+   c:float()
+   local output = c:forward(input:float())
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type()")
+   local output = c:forward(input)
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float")
+   -- non-batch mode
+   local c = nn.Convert('chw', 'hwc')
+   local input = torch.randn(3,5,5)
+   local output = c:forward(input)
+   local output2 = input:transpose(1,3):transpose(1,2)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->hwc non-batch")
+   local gradInput = c:backward(input, output)
+   mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd chw->hwc non-batch")
+   local c = nn.Convert('chw', 'f')
+   local output = c:forward(input)
+   local output2 = input:view(-1)
+   mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->bf non-batch")
+   c:float()
+   local output = c:forward(input:float())
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() non-batch")
+   local output = c:forward(input)
+   mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float non-batch")
+end
+
+function nntest.CAddTensorTable()
+   -- input : { v, {a,b,c} }
+   -- output : { v+a, v+b, v+c }
+   local z = nn.CAddTensorTable()
+   local input = { torch.randn(3), { torch.randn(3), torch.rand(3), torch.rand(3) } }
+   local output = z:forward(input)
+   mytester:assert(#output == 3, "CAddTensorTable #output")
+   mytester:assertTensorEq(input[1]+input[2][1], output[1], 0.00001, "CAddTensorTable input21 output1")
+   mytester:assertTensorEq(input[1]+input[2][2], output[2], 0.00001, "CAddTensorTable input22 output2")
+   mytester:assertTensorEq(input[1]+input[2][3], output[3], 0.00001, "CAddTensorTable input23 output3")
+   local gradInput = z:backward(input, output)
+   mytester:assert(#gradInput == 2, "CAddTensorTable #gradInput")
+   mytester:assert(#(gradInput[2]) == 3, "CAddTensorTable #gradInput[2]")
+   mytester:assertTensorEq(output[1], gradInput[2][1], 0.000001, "CAddTensorTable gradInput21")
+   mytester:assertTensorEq(output[2], gradInput[2][2], 0.000001, "CAddTensorTable gradInput22")
+   mytester:assertTensorEq(output[3], gradInput[2][3], 0.000001, "CAddTensorTable gradInput23")
+   mytester:assertTensorEq(output[1]+output[2]+output[3], gradInput[1], 0.000001, "CAddTensorTable gradInput1")
+end
+
+-- Unit Test Kmeans layer
+function nntest.Kmeans()
+   local k = 3
+   local dim = 5
+   local batchSize = 200
+   local input = torch.Tensor(batchSize, dim)
+   for i=1, batchSize do
+      input[i]:fill(torch.random(1, k))
+   end
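+   -- each sample is a constant vector equal to its (random) cluster id, so a perfect clustering reaches zero loss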
+
+   local verbose = false
+
+   local attempts = 10
+   local iter = 100
+   local bestLoss = 100000000
+   local bestKm = nil
+   local tempLoss = 0
+   local learningRate = 1
+
+   local initTypes = {'random', 'kmeans++'}
+   local useCudas = {false}
+   if pcall(function() require 'cunn' end) then
+      useCudas[2] = true
+   end
+   for _, initType in pairs(initTypes) do
+      for _, useCuda in pairs(useCudas) do
+
+         if useCuda then
+            input = input:cuda()
+         else
+            input = input:double()
+         end
+
+         local timer = torch.Timer()
+         for j=1, attempts do
+            local km = nn.Kmeans(k, dim)
+            if useCuda then km:cuda() end
+
+            if initType == 'kmeans++' then
+               km:initKmeansPlus(input)
+            else
+               km:initRandom(input)
+            end
+
+            for i=1, iter do
+               km:zeroGradParameters()
+
+               km:forward(input)
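+               -- gradOutput is nil here (never assigned); the test only relies on km.loss and km.gradWeight, which Kmeans computes from the input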
+               km:backward(input, gradOutput)
+
+               -- Gradient descent
+               km.weight:add(-learningRate, km.gradWeight)
+               tempLoss = km.loss
+            end
+            if verbose then print("Attempt Loss " .. j ..": " .. tempLoss) end
+            if tempLoss < bestLoss then
+               bestLoss = tempLoss
+            end
+            if (initType == 'kmeans++' and bestLoss < 0.00001) or (initType == 'random' and bestLoss < 500) then
+               break
+            end
+         end
+         if verbose then
+            print("InitType: " .. initType .. " useCuda: " .. tostring(useCuda))
+            print("Best Loss: " .. bestLoss)
+            print("Total time: " .. timer:time().real)
+         end
+         if initType == 'kmeans++' then
+            mytester:assert(bestLoss < 0.00001, "Kmeans++ error ("..(useCuda and 'cuda' or 'double')..")")
+         else
+            mytester:assert(bestLoss < 500, "Kmeans error ("..(useCuda and 'cuda' or 'double')..")")
+         end
+      end
+   end
+end
+
+function nntest.FeatureLPPooling()
+   local verbose = false
+
+   local num_tries = 2
+   local jacobian = nn.Jacobian
+   local precision = 4e-3
+
+   local batch_max = 3
+   local feature_max = 100
+   local dim1_max = 3
+   local dim2_max = 3
+
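+   -- pick the pooling exponent p from {1, 2, 4, 6}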
+   local function pickPow()
+      local num = torch.random(4)
+      if num == 1 then
+         return 1
+      else
+         return (num - 1) * 2.0
+      end
+   end
+
+   local function runFPropTest(dims, width, stride, pow, batch_mode)
+      local pool = nn.FeatureLPPooling(width, stride, pow, batch_mode):float()
+
+      local num_batch = torch.random(batch_max)
+      local num_features = (torch.random(feature_max) - 1) * stride + width
+      local num_dim1 = torch.random(dim1_max)
+      local num_dim2 = torch.random(dim2_max)
+
+      if verbose then
+         print('test on dim ' .. dims ..
+                  ' features ' .. num_features ..
+                  ' width ' .. width .. ' stride ' .. stride ..
+                  ' p ' .. pow .. ' bm ' .. (batch_mode and 1 or 0))
+      end
+
+      local input = nil
+      if dims == 1 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features)
+
+            for i = 1, num_batch do
+               for f = 1, num_features do
+                  input[i][f] = f - 1
+               end
+            end
+
+         else
+            input = torch.FloatTensor(num_features)
+
+            for f = 1, num_features do
+               input[f] = f - 1
+            end
+
+         end
+      elseif dims == 2 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features, num_dim1)
+
+            for i = 1, num_batch do
+               for f = 1, num_features do
+                  for j = 1, num_dim1 do
+                     input[i][f][j] = f - 1
+                  end
+               end
+            end
+
+         else
+            input = torch.FloatTensor(num_features, num_dim1)
+
+            for f = 1, num_features do
+               for j = 1, num_dim1 do
+                  input[f][j] = f - 1
+               end
+            end
+
+         end
+      elseif dims == 3 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features, num_dim1, num_dim2)
+
+            for i = 1, num_batch do
+               for f = 1, num_features do
+                  for j = 1, num_dim1 do
+                     for k = 1, num_dim2 do
+                        input[i][f][j][k] = f - 1
+                     end
+                  end
+               end
+            end
+
+         else
+            input = torch.FloatTensor(num_features, num_dim1, num_dim2)
+
+            for f = 1, num_features do
+               for j = 1, num_dim1 do
+                  for k = 1, num_dim2 do
+                     input[f][j][k] = f - 1
+                  end
+               end
+            end
+
+         end
+      end
+
+      local output = pool:forward(input)
+
+      -- Each output feature o(k) (k one-based, as in Lua) for L1 is:
+      -- sum(i((k - 1) * s), i((k - 1) * s + 1), ..., i((k - 1) * s + w - 1))
+      -- if i(x) = x, then: o(k) = w * (k - 1) * s + w * (w - 1) / 2
+      -- For Lp (p != 1), just evaluate ourselves and compare
+
+      local function verifyFeature(val, k, width, stride, pow)
+         local sum_input = 0
+         if pow == 1 then
+            sum_input = width * (k - 1) * stride + width * (width - 1) / 2
+         else
+            for w = 0, width - 1 do
+               sum_input = sum_input + math.pow((k - 1) * stride + w, pow)
+            end
+            sum_input = math.pow(sum_input, 1 / pow)
+         end
+
+         local diff = math.abs(val - sum_input)
+         if (diff >= 1e-3) then
+            if verbose then
+               print('failed on ' .. val .. ' ' .. sum_input)
+            end
+            mytester:assertlt(math.abs(val - sum_input), 1e-3)
+         end
+      end
+
+      if dims == 1 then
+         if batch_mode then
+            for i = 1, output:size(1) do
+               for f = 1, output:size(2) do
+                  verifyFeature(output[i][f], f, width, stride, pow)
+               end
+            end
+
+         else
+            for f = 1, output:size(1) do
+               verifyFeature(output[f], f, width, stride, pow)
+            end
+
+         end
+      elseif dims == 2 then
+         if batch_mode then
+            for i = 1, output:size(1) do
+               for f = 1, output:size(2) do
+                  for j = 1, output:size(3) do
+                     verifyFeature(output[i][f][j], f, width, stride, pow)
+                  end
+               end
+            end
+
+         else
+            for f = 1, output:size(1) do
+               for j = 1, output:size(2) do
+                  verifyFeature(output[f][j], f, width, stride, pow)
+               end
+            end
+
+         end
+      elseif dims == 3 then
+         if batch_mode then
+            for i = 1, output:size(1) do
+               for f = 1, output:size(2) do
+                  for j = 1, output:size(3) do
+                     for k = 1, output:size(4) do
+                        verifyFeature(output[i][f][j][k], f, width, stride, pow)
+                     end
+                  end
+               end
+            end
+
+         else
+            for f = 1, output:size(1) do
+               for j = 1, output:size(2) do
+                  for k = 1, output:size(3) do
+                     verifyFeature(output[f][j][k], f, width, stride, pow)
+                  end
+               end
+            end
+
+         end
+      end
+   end
+
+   local function runBPropTest(dims, width, stride, pow, batch_mode)
+      local pool = nn.FeatureLPPooling(width, stride, pow, batch_mode):float()
+
+      local num_batch = torch.random(batch_max)
+      local num_features = (torch.random(feature_max) - 1) * stride + width
+      local num_dim1 = torch.random(dim1_max)
+      local num_dim2 = torch.random(dim2_max)
+
+      local input = nil
+      if dims == 1 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features)
+         else
+            input = torch.FloatTensor(num_features)
+         end
+      elseif dims == 2 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features, num_dim1)
+         else
+            input = torch.FloatTensor(num_features, num_dim1)
+         end
+      elseif dims == 3 then
+         if batch_mode then
+            input = torch.FloatTensor(num_batch, num_features, num_dim1, num_dim2)
+         else
+            input = torch.FloatTensor(num_features, num_dim1, num_dim2)
+         end
+      end
+
+      local err = jacobian.testJacobian(pool, input, -2, -2, 5e-4)
+      if verbose then
+         print('test on dim ' .. dims ..
+                  ' features ' .. num_features ..
+                  ' width ' .. width .. ' stride ' .. stride ..
+                  ' p ' .. pow .. ' err ' .. err)
+      end
+      mytester:assertlt(err, precision)
+   end
+
+   function testForwardLp()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            for idx, batch_mode in ipairs({true, false}) do
+               for dims = 1, 3 do
+                  runFPropTest(dims, 1 + torch.random(15),
+                               stride, pickPow(), batch_mode)
+               end
+            end
+         end
+      end
+   end
+
+   function testZeroBProp()
+      local pool = nn.FeatureLPPooling(3, 1, 2.0, false):float()
+
+      local input = torch.FloatTensor(100):zero()
+      pool:forward(input)
+
+      local gradOutput = torch.FloatTensor(98):zero()
+      local gradInput = pool:backward(input, gradOutput, 1.0)
+
+      for i = 1, gradInput:size(1) do
+         mytester:asserteq(gradInput[i], 0)
+      end
+   end
+
+   function testJacobian1dNoBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(1, 1 + torch.random(15), stride, pickPow(), false)
+         end
+      end
+   end
+
+   function testJacobian1dBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(1, 1 + torch.random(15), stride, pickPow(), true)
+         end
+      end
+   end
+
+   function testJacobian2dNoBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(2, 1 + torch.random(15), stride, pickPow(), false)
+         end
+      end
+   end
+
+   function testJacobian2dBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(2, 1 + torch.random(15), stride, pickPow(), true)
+         end
+      end
+   end
+
+   function testJacobian3dNoBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(3, 1 + torch.random(15), stride, pickPow(), false)
+         end
+      end
+   end
+
+   function testJacobian3dBatch()
+      for i = 1, num_tries do
+         for stride = 1, 4 do
+            runBPropTest(3, 1 + torch.random(15), stride, pickPow(), true)
+         end
+      end
+   end
+
+   testForwardLp()
+   testZeroBProp()
+   testJacobian1dNoBatch()
+   testJacobian1dBatch()
+   testJacobian2dNoBatch()
+   testJacobian2dBatch()
+   testJacobian3dNoBatch()
+   testJacobian3dBatch()
+end
+
 mytester:add(nntest)
 
 jac = nn.Jacobian
diff --git a/utils.lua b/utils.lua
index 8f9c203..09ce1b9 100644
--- a/utils.lua
+++ b/utils.lua
@@ -87,7 +87,7 @@ function nn.utils.recursiveResizeAs(t1,t2)
       end
    elseif torch.isTensor(t2) then
       t1 = torch.isTensor(t1) and t1 or t2.new()
-      t1:resizeAs(t2)
+      t1:resize(t2:size())
    else
       error("expecting nested tensors or tables. Got "..
             torch.type(t1).." and "..torch.type(t2).." instead")
@@ -130,15 +130,20 @@ function nn.utils.recursiveAdd(t1, val, t2)
    return t1, t2
 end
 
-function nn.utils.recursiveCopy(t1,t2)
+function nn.utils.recursiveCopy(t1,t2,async)
    if torch.type(t2) == 'table' then
       t1 = (torch.type(t1) == 'table') and t1 or {t1}
       for key,_ in pairs(t2) do
-         t1[key], t2[key] = nn.utils.recursiveCopy(t1[key], t2[key])
+         t1[key], t2[key] = nn.utils.recursiveCopy(t1[key], t2[key], async)
       end
    elseif torch.isTensor(t2) then
       t1 = torch.isTensor(t1) and t1 or t2.new()
-      t1:resizeAs(t2):copy(t2)
+      t1:resize(t2:size())
+      if async then
+        t1:copyAsync(t2)
+      else
+        t1:copy(t2)
+      end
    else
       error("expecting nested tensors or tables. Got "..
             torch.type(t1).." and "..torch.type(t2).." instead")
@@ -153,7 +158,7 @@ function nn.utils.addSingletonDimension(...)
   else
     view, t, dim = select(1,...)
     assert(torch.isTensor(view),
-           "output tensor expected, got " .. type(view))
+           "output tensor expected, got " .. torch.type(view))
   end
 
   assert(torch.isTensor(t), "input tensor expected")
@@ -185,7 +190,7 @@ function nn.utils.contiguousView(output, input, ...)
   if input:isContiguous() then
     output:view(input, ...)
   else
-    output:resizeAs(input)
+    output:resize(input:size())
     output:copy(input)
     output:view(output, ...)
   end
@@ -197,14 +202,14 @@ end
 -- nn.utils.clearState(self, '_buffer', '_buffer2')
 function nn.utils.clear(self, ...)
    local arg = {...}
-   if #arg > 0 and type(arg[1]) == 'table' then
+   if #arg > 0 and torch.type(arg[1]) == 'table' then
       arg = arg[1]
    end
    local function clear(f)
       if self[f] then
          if torch.isTensor(self[f]) then
             self[f]:set()
-         elseif type(self[f]) == 'table' then
+         elseif torch.type(self[f]) == 'table' then
             self[f] = {}
          else
             self[f] = nil

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git


