[lua-torch-nn] 01/07: New upstream version 0~20170104-ge77b592+dfsg

Zhou Mo cdluminate-guest at moszumanska.debian.org
Wed Jan 11 06:51:17 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 6cb8ac819f739f7459da8ae7bc75b656a5b369eb
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Tue Jan 10 14:27:54 2017 +0000

    New upstream version 0~20170104-ge77b592+dfsg
---
 BatchNormalization.lua                          |   1 -
 Bilinear.lua                                    |   6 +-
 CReLU.lua                                       |  57 +++++
 ClassNLLCriterion.lua                           |   2 +
 Container.lua                                   |   1 +
 DistanceRatioCriterion.lua                      | 142 ++++++++++++
 GPU.lua                                         |  32 +--
 GatedLinearUnit.lua                             |  42 ++++
 Linear.lua                                      |   6 +-
 LookupTable.lua                                 |  14 +-
 Module.lua                                      |  12 +-
 Narrow.lua                                      |  14 +-
 PartialLinear.lua                               |   7 +-
 SelectTable.lua                                 |  11 +-
 SpatialAutoCropMSECriterion.lua                 |  74 +++++++
 SpatialConvolution.lua                          |  19 --
 SpatialConvolutionLocal.lua                     |  19 --
 SpatialConvolutionMM.lua                        |  19 --
 SpatialDilatedConvolution.lua                   |  19 --
 SpatialFullConvolution.lua                      |  19 --
 SpatialUpSamplingBilinear.lua                   |   1 -
 SpatialUpSamplingNearest.lua                    |   7 -
 TemporalConvolution.lua                         |   6 +-
 VolumetricConvolution.lua                       |   3 +
 VolumetricDilatedConvolution.lua                |  19 --
 VolumetricDilatedMaxPooling.lua                 |   4 +-
 VolumetricFullConvolution.lua                   |  19 --
 VolumetricMaxPooling.lua                        |   4 +-
 doc/containers.md                               | 105 +++++----
 doc/convolution.md                              |  52 ++---
 doc/criterion.md                                |  78 ++++++-
 doc/simple.md                                   |   5 +-
 doc/table.md                                    |  30 ++-
 doc/transfer.md                                 |  46 ++++
 init.lua                                        |   4 +
 lib/THNN/CMakeLists.txt                         |  16 +-
 lib/THNN/generic/LogSoftMax.c                   |   1 -
 lib/THNN/generic/MultiLabelMarginCriterion.c    |   8 +-
 lib/THNN/generic/SpatialAveragePooling.c        |  97 +++++++--
 lib/THNN/generic/SpatialConvolutionLocal.c      |   2 +
 lib/THNN/generic/SpatialConvolutionMM.c         |  13 ++
 lib/THNN/generic/SpatialDilatedConvolution.c    |  13 ++
 lib/THNN/generic/SpatialDilatedMaxPooling.c     |  15 +-
 lib/THNN/generic/SpatialFullConvolution.c       |  16 ++
 lib/THNN/generic/SpatialMaxUnpooling.c          |  50 ++---
 lib/THNN/generic/THNN.h                         |  11 +-
 lib/THNN/generic/TemporalConvolution.c          |  51 ++++-
 lib/THNN/generic/TemporalMaxPooling.c           |  50 ++++-
 lib/THNN/generic/TemporalSubSampling.c          |  44 +++-
 lib/THNN/generic/VolumetricAveragePooling.c     |  81 ++++++-
 lib/THNN/generic/VolumetricConvolutionMM.c      | 115 ++++++++--
 lib/THNN/generic/VolumetricDilatedConvolution.c | 127 +++++++----
 lib/THNN/generic/VolumetricDilatedMaxPooling.c  | 131 ++++++++++--
 lib/THNN/generic/VolumetricFullConvolution.c    | 105 +++++++--
 lib/THNN/generic/VolumetricMaxPooling.c         |   9 +-
 lib/THNN/generic/VolumetricMaxUnpooling.c       | 141 +++++++-----
 lib/THNN/generic/VolumetricReplicationPadding.c |  84 ++++++--
 test.lua                                        | 273 +++++++++++++++++++++++-
 test/LinearTHNN.lua                             |   6 +-
 59 files changed, 1834 insertions(+), 524 deletions(-)

diff --git a/BatchNormalization.lua b/BatchNormalization.lua
index 1cd30aa..8dfc576 100644
--- a/BatchNormalization.lua
+++ b/BatchNormalization.lua
@@ -116,7 +116,6 @@ function BN:updateOutput(input)
    input = makeContiguous(self, input)
    input = makeBatch(self, input)
 
-   self.output:resizeAs(input)
    self.save_mean = self.save_mean or input.new()
    self.save_mean:resizeAs(self.running_mean)
    self.save_std = self.save_std or input.new()
diff --git a/Bilinear.lua b/Bilinear.lua
index 7aa9d99..9350b03 100644
--- a/Bilinear.lua
+++ b/Bilinear.lua
@@ -142,8 +142,10 @@ function Bilinear:accGradParameters(input, gradOutput, scale)
    if self.bias then self.gradBias:add(scale, gradOutput:sum(1)) end
 end
 
--- we do not need to accumulate parameters when sharing:
-Bilinear.sharedAccUpdateGradParameters = Bilinear.accUpdateGradParameters
+function Bilinear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
 
 function Bilinear:__tostring__()
   return torch.type(self) ..
diff --git a/CReLU.lua b/CReLU.lua
new file mode 100644
index 0000000..8da6e79
--- /dev/null
+++ b/CReLU.lua
@@ -0,0 +1,57 @@
+local CReLU, parent = torch.class('nn.CReLU', 'nn.Sequential')
+
+-- Implements the CReLU activation function as described by
+-- W. Shang et al. in "Understanding and Improving Convolutional Neural Networks
+-- via Concatenated Rectified Linear Units"
+function CReLU:__init(nInputDims, inplace)
+   parent.__init(self)
+   self.nInputDims = nInputDims
+   self.inplace = inplace or false
+
+   local concatTable = nn.ConcatTable()
+   concatTable:add(nn.Identity())
+   concatTable:add(nn.MulConstant(-1))
+   self:add(concatTable)
+   self:add(nn.JoinTable(2))
+   self:add(nn.ReLU(self.inplace))
+end
+
+function CReLU:updateOutput(input)
+   local input_
+   local batched = input:dim() == (self.nInputDims + 1)
+   if not batched then
+      input_ = input:view(1, -1)
+  else
+      input_ = input:view(input:size(1), -1)
+  end
+   parent.updateOutput(self, input_)
+   local osize = input:size()
+   if not batched then
+      osize[1] = osize[1] * 2
+   else
+      osize[2] = osize[2] * 2
+   end
+   self.output:resize(osize)
+   return self.output
+end
+
+function CReLU:backward(input, gradOutput)
+   return self:updateGradInput(input, gradOutput)
+end
+
+function CReLU:updateGradInput(input, gradOutput)
+   local batched = input:dim() == (self.nInputDims + 1)
+   if not batched then
+      parent.updateGradInput(self, input:view(1, -1), gradOutput:view(1, -1))
+   else
+      parent.updateGradInput(self, input:view(input:size(1), -1),
+                                   gradOutput:view(input:size(1), -1))
+   end
+
+   self.gradInput:resizeAs(input)
+   return self.gradInput
+end
+
+function CReLU:__tostring__()
+   return "CReLU()"
+end
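
Usage sketch (illustrative, not part of the upstream diff): with nInputDims = 3 a 4D
input is treated as a batch; CReLU concatenates x and -x along the feature dimension
and applies ReLU, so the channel count doubles.

    require 'nn'

    local crelu = nn.CReLU(3)              -- non-batched inputs have 3 dims (C x H x W)
    local x = torch.randn(4, 3, 8, 8)      -- batch of 4 images with 3 channels
    local y = crelu:forward(x)             -- 4 x 6 x 8 x 8: channels doubled
    print(y:size())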
diff --git a/ClassNLLCriterion.lua b/ClassNLLCriterion.lua
index 1d3f3b7..d89f439 100644
--- a/ClassNLLCriterion.lua
+++ b/ClassNLLCriterion.lua
@@ -33,6 +33,7 @@ function ClassNLLCriterion:updateOutput(input, target)
       else
           self.target = self.target:long()
       end
+      self.target:resize(1)
       self.target[1] = target
    elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
       self.target = torch.CudaLongTensor and target:cudaLong() or target
@@ -59,6 +60,7 @@ function ClassNLLCriterion:updateGradInput(input, target)
       else
           self.target = self.target:long()
       end
+      self.target:resize(1)
       self.target[1] = target
    elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
       self.target = torch.CudaLongTensor and target:cudaLong() or target
diff --git a/Container.lua b/Container.lua
index 469a370..7e264ba 100644
--- a/Container.lua
+++ b/Container.lua
@@ -96,6 +96,7 @@ function Container:share(mlp, ...)
     for i=1,#self.modules do
         self.modules[i]:share(mlp.modules[i], ...);
     end
+    return self
 end
 
 function Container:reset(stdv)
diff --git a/DistanceRatioCriterion.lua b/DistanceRatioCriterion.lua
new file mode 100644
index 0000000..271d374
--- /dev/null
+++ b/DistanceRatioCriterion.lua
@@ -0,0 +1,142 @@
+--[[
+   Probabilistic Criterion for Triplet Siamese Model for learning embedding.
+   Ref: https://arxiv.org/pdf/1610.00243.pdf
+
+   loss = -log( exp(-X) / ( exp(-X) + exp(-Y) ) )
+   where
+   X : Distance between similar samples
+   Y : Distance between dissimilar samples
+
+   The loss can be broken down into the following log expansion
+
+   loss = -log( exp(-X) ) - (-log( exp(-X) + exp(-Y) ))
+        = -log( exp(-X) ) + log( exp(-X) + exp(-Y) )
+        = -(-X) + log( exp(-X) + exp(-Y) )
+        = X + log( exp(-X) + exp(-Y) )
+
+   Gradients:
+      dLoss/dX = 1 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-X)
+               = 1 - exp(-X) / (exp(-X) + exp(-Y))
+
+      dLoss/dY = 0 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-Y)
+               = -exp(-Y) / (exp(-X) + exp(-Y))
+
+--]]
+
+local DistanceRatioCriterion, parent = torch.class('nn.DistanceRatioCriterion',
+                                                   'nn.Criterion')
+
+function DistanceRatioCriterion:__init(sizeAverage)
+   parent.__init(self)
+   if sizeAverage ~= nil then
+     self.sizeAverage = sizeAverage
+   else
+     self.sizeAverage = true
+   end
+end
+
+-- Forward
+--[[
+-- X : Distance between similar samples
+-- Y : Distance between dissimilar samples
+   loss = -log( exp(-X) ) - (-log( exp(-X) + exp(-Y) ))
+        = -log( exp(-X) ) + log( exp(-X) + exp(-Y) )
+        = -(-X) + log( exp(-X) + exp(-Y) )
+        = X + log( exp(-X) + exp(-Y) )
+--]]
+function DistanceRatioCriterion:updateOutput(input)
+   assert(#input == 2, "Invalid number of inputs")
+   
+   local X = input[1]
+   local Y = input[2]
+
+   assert(X:nElement() == Y:nElement(), "Number of distances don't match.")
+   assert(X:size(1) == Y:size(1), "Invalid distances' size.")
+
+   -- Compute exp(-X) and exp(-Y)
+   self._expMinusX = self._expMinusX or X.new()
+   self._expMinusY = self._expMinusY or Y.new()
+
+   -- Compute ( exp(-X) + exp(-Y) )
+   self._expMinusX:resizeAs(X):copy(X):mul(-1):exp()
+   self._expMinusY:resizeAs(Y):copy(Y):mul(-1):exp()
+
+   self._sumExpMinusXY = self.sumExpMinusExp or X.new()
+   self._sumExpMinusXY:resizeAs(self._expMinusX):copy(self._expMinusX)
+                     :add(self._expMinusY)
+
+   -- Compute log( exp(-X) + exp(-Y) )
+   self._logSumExpMinusXY = self._logSumExpMinusXY or self._sumExpMinusXY.new()
+   self._logSumExpMinusXY:resizeAs(self._sumExpMinusXY)
+                         :copy(self._sumExpMinusXY):log()
+
+   -- Compute loss = X + log( exp(-X) + exp(-Y) )
+   self.loss = self.loss or self._logSumExpMinusXY.new()
+   self.loss:resizeAs(X):copy(X):add(self._logSumExpMinusXY)
+
+   if self.sizeAverage then
+      return self.loss:sum()/X:size(1)
+   else
+      return self.loss:sum()
+   end
+end
+
+-- Backward
+--[[
+-- X : Distance between similar samples
+-- Y : Distance between dissimilar samples
+
+   Gradients:
+      dLoss/dX = 1 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-X)
+               = 1 - exp(-X) / (exp(-X) + exp(-Y))
+
+      dLoss/dY = 0 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-Y)
+               = -exp(-Y) / (exp(-X) + exp(-Y))
+
+--]]
+function DistanceRatioCriterion:updateGradInput(input)
+   assert(#input == 2, "Invalid number of inputs")
+   local X = input[1]
+   local Y = input[2]
+   assert(X:nElement() == Y:nElement(), "Number of distances don't match.")
+   assert(X:size(1) == Y:size(1), "Invalid distances' size.")
+
+   -- dLoss/dX
+   -- -exp(-X)
+   self.dX = self.dX or X.new()
+   self.dX:resizeAs(self._expMinusX):copy(self._expMinusX):mul(-1)
+
+   -- -exp(-X) / (exp(-X) + exp(-Y))
+   self.dX:cdiv(self._sumExpMinusXY)
+               
+   -- 1 - exp(-X) / (exp(-X) + exp(-Y))
+   self.dX:add(1)
+
+   -- dLoss/dY
+   -- -exp(-Y)
+   self.dY = self.dY or Y.new()
+   self.dY:resizeAs(self._expMinusY):copy(self._expMinusY):mul(-1)
+
+   -- -exp(-Y) / (exp(-X) + exp(-Y))
+   self.dY:cdiv(self._sumExpMinusXY)
+
+   if self.sizeAverage then
+      self.dX:div(X:size(1))
+      self.dY:div(X:size(1))
+   end
+
+   return {self.dX, self.dY}
+end
+
+function DistanceRatioCriterion:type(type, tensorCache)
+   if type then
+      self._expMinusX = nil 
+      self._expMinusY = nil
+      self._sumExpMinusXY = nil
+      self._logSumExpMinusXY = nil
+      self.loss = nil
+      self.dX = nil
+      self.dY = nil
+   end
+   return parent.type(self, type, tensorCache)
+end
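
Usage sketch (illustrative, not part of the upstream diff): the criterion takes a
table {X, Y} of per-sample distances, where X holds distances between similar pairs
and Y distances between dissimilar pairs; minimizing the loss pushes X below Y.

    require 'nn'

    local crit = nn.DistanceRatioCriterion(true)   -- sizeAverage = true
    local N = 5
    local X = torch.rand(N)        -- distances between similar samples
    local Y = torch.rand(N) + 1    -- distances between dissimilar samples
    local loss  = crit:forward({X, Y})
    local grads = crit:backward({X, Y})            -- {dLoss/dX, dLoss/dY}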
diff --git a/GPU.lua b/GPU.lua
index 3150236..2a7be0e 100644
--- a/GPU.lua
+++ b/GPU.lua
@@ -20,8 +20,8 @@ function GPU:__init(module, device, outdevice)
    assert(torch.isTypeOf(module, 'nn.Module'))
    self.modules[1] = module
    
-   if module:type() == 'torch.CudaTensor' then
-      self:cuda()
+   if module:type():find('torch%.Cuda.*Tensor') then
+      self:type(module:type())
    end
 end
 
@@ -86,7 +86,7 @@ function GPU.recursiveSetDevice(dst, src, proto)
 end
 
 function GPU:updateOutput(input)
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       self._input = self.recursiveSetDevice(self._input, input, self.device)
       
       local output = cutorch.withDevice(self.device, function()
@@ -106,7 +106,7 @@ function GPU:updateOutput(input)
 end
 
 function GPU:updateGradInput(input, gradOutput)
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       self._gradOutput = self.recursiveSetDevice(self._gradOutput, gradOutput, self.device)
       
       local gradInput = cutorch.withDevice(self.device, function()
@@ -122,7 +122,7 @@ function GPU:updateGradInput(input, gradOutput)
 end
 
 function GPU:accGradParameters(input, gradOutput, scale) 
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function()
          self.modules[1]:accGradParameters(self._input, self._gradOutput, scale)
       end)
@@ -132,7 +132,7 @@ function GPU:accGradParameters(input, gradOutput, scale)
 end
 
 function GPU:apply(callback)
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.apply(self, callback) end)
    else
       parent.apply(self, callback)
@@ -140,7 +140,7 @@ function GPU:apply(callback)
 end
 
 function GPU:type(type, typecache)
-   if type and type == 'torch.CudaTensor' then
+   if type and type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end)
       self:setDevice()
    else
@@ -157,7 +157,7 @@ function GPU:clearState()
    nn.utils.clear(self, 'output', 'gradInput')
    self._input = nil
    self._gradOutput = nil
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.clearState(self) end)
    else
       parent.clearState(self)
@@ -165,7 +165,7 @@ function GPU:clearState()
 end
 
 function GPU:zeroGradParameters()
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.zeroGradParameters(self) end)
    else
       parent.zeroGradParameters(self)
@@ -173,7 +173,7 @@ function GPU:zeroGradParameters()
 end
 
 function GPU:updateParameters(lr)
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.updateParameters(self, lr) end)
    else
       parent.updateParameters(self, lr)
@@ -181,7 +181,7 @@ function GPU:updateParameters(lr)
 end
 
 function GPU:training()
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.training(self) end)
    else
       parent.training(self)
@@ -189,7 +189,7 @@ function GPU:training()
 end
 
 function GPU:evaluate()
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.evaluate(self) end)
    else
       parent.evaluate(self)
@@ -198,7 +198,7 @@ end
 
 function GPU:share(mlp, ...)
    local args = {...}
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.share(self, mlp, unpack(args)) end)
    else
       parent.share(self, mlp, unpack(args))
@@ -208,7 +208,7 @@ end
 
 function GPU:reset(...)
    local args = {...}
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       cutorch.withDevice(self.device, function() parent.reset(self, unpack(args)) end)
    else
       parent.reset(self, unpack(args))
@@ -218,7 +218,7 @@ end
 
 function GPU:clone(...)
    local args = {...}
-   if self._type == 'torch.CudaTensor' then
+   if self._type:find('torch%.Cuda.*Tensor') then
       return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end)
    else
       return parent.clone(self, unpack(args))
@@ -239,7 +239,7 @@ end
 function GPU:read(file)
    local header = file:readObject()
    local object
-   if header[1] == 'torch.CudaTensor' then
+   if header[1] and header[1]:find('torch%.Cuda.*Tensor') then
       local device = header[2] 
       if device > cutorch.getDeviceCount() then
          print"Warning : model was saved with more devices than available on current host."
diff --git a/GatedLinearUnit.lua b/GatedLinearUnit.lua
new file mode 100644
index 0000000..5f215ca
--- /dev/null
+++ b/GatedLinearUnit.lua
@@ -0,0 +1,42 @@
+local GatedLinearUnit, parent = torch.class('nn.GatedLinearUnit', 'nn.Module')
+
+function GatedLinearUnit:__init(dim)
+   parent.__init(self)
+   self.sigmoid = nn.Sigmoid()
+   self.dim = dim
+end
+
+function GatedLinearUnit:updateOutput(input)
+    local dim = self.dim or input:dim()
+    local inputSize = input:size(dim)
+
+    assert(inputSize % 2 == 0, "halving dimension needs to be even")
+
+    self.fHalf = input:narrow(dim, 1, inputSize/2)
+    self.sHalf = input:narrow(dim, inputSize/2 + 1, inputSize/2)
+
+    self.sHalfOut = self.sigmoid:forward(self.sHalf)
+    self.output:resizeAs(self.fHalf):copy(self.fHalf):cmul(self.sHalfOut)
+
+    return self.output
+end
+
+function GatedLinearUnit:updateGradInput(input, gradOutput)
+    local dim = self.dim or input:dim()
+    local inputSize = input:size(dim)
+
+    assert(inputSize % 2 == 0, "halving dimension needs to be even")
+
+    local fGradInput = self.sHalfOut
+    local sGradInput = self.sigmoid:backward(self.sHalf, gradOutput)
+                                   :cmul(self.fHalf)
+
+    self.gradInput:resizeAs(input)
+    self.gradInput:narrow(dim, 1, inputSize/2)
+                    :copy(fGradInput)
+                    :cmul(gradOutput)
+    self.gradInput:narrow(dim, inputSize/2+1, inputSize/2)
+                    :copy(sGradInput)
+
+    return self.gradInput
+end
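
Usage sketch (illustrative, not part of the upstream diff): the gated linear unit
splits the chosen dimension in half and gates the first half with a sigmoid of the
second half, so that dimension shrinks by a factor of two.

    require 'nn'

    local glu = nn.GatedLinearUnit(2)   -- gate along dimension 2
    local x = torch.randn(7, 10)        -- 10 features per example
    local y = glu:forward(x)            -- 7 x 5: fHalf .* sigmoid(sHalf)
    print(y:size())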
diff --git a/Linear.lua b/Linear.lua
index c26ba37..3221227 100644
--- a/Linear.lua
+++ b/Linear.lua
@@ -105,8 +105,10 @@ function Linear:accGradParameters(input, gradOutput, scale)
    end
 end
 
--- we do not need to accumulate parameters when sharing
-Linear.sharedAccUpdateGradParameters = Linear.accUpdateGradParameters
+function Linear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
 
 function Linear:clearState()
    if self.addBuffer then self.addBuffer:set() end
diff --git a/LookupTable.lua b/LookupTable.lua
index 8ca7ddb..6cffc6c 100644
--- a/LookupTable.lua
+++ b/LookupTable.lua
@@ -102,12 +102,6 @@ function LookupTable:accGradParameters(input, gradOutput, scale)
       error("input must be a vector or matrix")
    end
 
-   if not gradOutput:isContiguous() then
-      self._gradOutput = self._gradOutput or gradOutput.new()
-      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-      gradOutput = self._gradOutput
-   end
-
    self.gradWeight.THNN.LookupTable_accGradParameters(
       input:cdata(),
       gradOutput:cdata(),
@@ -162,9 +156,11 @@ function LookupTable:type(type, tensorCache)
 end
 
 function LookupTable:clearState()
-   nn.utils.clear(self, '_count', '_input', '_gradOutput')
+   nn.utils.clear(self, '_count', '_input')
    return parent.clearState(self)
 end
 
--- we do not need to accumulate parameters when sharing
-LookupTable.sharedAccUpdateGradParameters = LookupTable.accUpdateGradParameters
+function LookupTable:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
diff --git a/Module.lua b/Module.lua
index c1a0328..3debc57 100644
--- a/Module.lua
+++ b/Module.lua
@@ -47,6 +47,14 @@ function Module:accGradParameters(input, gradOutput, scale)
 end
 
 function Module:accUpdateGradParameters(input, gradOutput, lr)
+   if self.shared then
+      self:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   else
+      self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+   end
+end
+
+function Module:defaultAccUpdateGradParameters(input, gradOutput, lr)
    local gradWeight = self.gradWeight
    local gradBias = self.gradBias
    self.gradWeight = self.weight
@@ -95,8 +103,8 @@ function Module:share(mlp, ...)
    for i,v in ipairs(arg) do
       if self[v] ~= nil then
          self[v]:set(mlp[v])
-         self.accUpdateGradParameters = self.sharedAccUpdateGradParameters
-         mlp.accUpdateGradParameters = mlp.sharedAccUpdateGradParameters
+         self.shared = true
+         mlp.shared = true
       end
    end
    return self
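
In effect (sketch, not part of the diff), sharing now only sets a boolean flag and
the choice between the default and the shared update happens at call time, instead
of rebinding the accUpdateGradParameters method on each instance:

    require 'nn'

    local a = nn.Linear(10, 5)
    local b = nn.Linear(10, 5):share(a, 'weight', 'bias')
    print(a.shared, b.shared)   -- true  true
    -- b:accUpdateGradParameters(input, gradOutput, lr) now dispatches to
    -- b:sharedAccUpdateGradParameters(...) because b.shared is true.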
diff --git a/Narrow.lua b/Narrow.lua
index 0754d45..a6ebaa3 100644
--- a/Narrow.lua
+++ b/Narrow.lua
@@ -16,7 +16,12 @@ function Narrow:updateOutput(input)
    if length < 0 then
       length = input:size(dim) - self.index + self.length + 2
    end
-   local output=input:narrow(dim,self.index,length)
+   local index = self.index
+   if self.index < 0 then
+      index = 1
+      length = input:size(dim) - length
+   end
+   local output=input:narrow(dim, index, length)
    self.output = self.output:typeAs(output)
    self.output:resizeAs(output):copy(output)
    return self.output
@@ -28,8 +33,13 @@ function Narrow:updateGradInput(input, gradOutput)
    if length < 0 then
       length = input:size(dim) - self.index + self.length + 2
    end
+   local index = self.index
+   if self.index < 0 then
+      index = 1
+      length = input:size(dim) - length
+   end
    self.gradInput = self.gradInput:typeAs(input)
    self.gradInput:resizeAs(input):zero()
-   self.gradInput:narrow(dim,self.index,length):copy(gradOutput)
+   self.gradInput:narrow(dim,index,length):copy(gradOutput)
    return self.gradInput
 end
diff --git a/PartialLinear.lua b/PartialLinear.lua
index d208f52..6e92cfc 100644
--- a/PartialLinear.lua
+++ b/PartialLinear.lua
@@ -102,9 +102,10 @@ function PartialLinear:updateParameters(learningRate)
    self.bias:add(-learningRate, self.gradBias)
 end
 
--- we do not need to accumulate parameters when sharing
-PartialLinear.sharedAccUpdateGradParameters =
-   PartialLinear.accUpdateGradParameters
+function PartialLinear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
 
 function PartialLinear:__tostring__()
    return torch.type(self) ..
diff --git a/SelectTable.lua b/SelectTable.lua
index 8eba85e..f383a10 100644
--- a/SelectTable.lua
+++ b/SelectTable.lua
@@ -7,8 +7,12 @@ function SelectTable:__init(index)
 end
 
 function SelectTable:updateOutput(input)
+
    -- handle negative indices
-   local index = self.index < 0 and #input + self.index + 1 or self.index
+   local index = self.index
+   if type(index) == "number" then
+      index = index < 0 and #input + index + 1 or index
+   end
 
    assert(input[index], "index does not exist in the input table")
    self.output = input[index]
@@ -41,7 +45,10 @@ function SelectTable:updateGradInput(input, gradOutput)
    -- make gradInput a zeroed copy of input
    zeroTableCopy(self.gradInput, input)
    -- handle negative indices
-   local index = self.index < 0 and #input + self.index + 1 or self.index
+   local index = self.index
+   if type(index) == "number" then
+      index = index < 0 and #input + index + 1 or index
+   end
    -- copy into gradInput[index] (necessary for variable sized inputs)
    assert(self.gradInput[index])
    nn.utils.recursiveCopy(self.gradInput[index], gradOutput)
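
Sketch of what the new guard enables (not part of the diff): non-numeric keys are
now passed through untouched, while negative numeric indices still count from the
end of the input table.

    require 'nn'

    local input = {torch.randn(2), torch.randn(3), torch.randn(4)}
    print(nn.SelectTable(-1):forward(input):size(1))     -- 4 (last entry)

    local keyed = {foo = torch.randn(5)}
    print(nn.SelectTable('foo'):forward(keyed):size(1))  -- 5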
diff --git a/SpatialAutoCropMSECriterion.lua b/SpatialAutoCropMSECriterion.lua
new file mode 100644
index 0000000..97206a0
--- /dev/null
+++ b/SpatialAutoCropMSECriterion.lua
@@ -0,0 +1,74 @@
+--[[
+   SpatialAutoCropMSECriterion.
+   Implements the MSECriterion when the spatial resolution of the input is less than
+   or equal to the spatial resolution of the target. It achieves this by center-cropping
+   the target to the same spatial resolution as the input; the MSE is then
+   calculated between the input and the cropped target
+]]
+local SpatialAutoCropMSECriterion, parent = torch.class('nn.SpatialAutoCropMSECriterion', 'nn.MSECriterion')
+
+function SpatialAutoCropMSECriterion:__init(sizeAverage)
+    parent.__init(self, sizeAverage)
+end
+
+local function centerCrop(input, cropSize)
+   assert(input:dim() == 3 or input:dim() == 4, "input should be a 3D or  4D tensor")
+   assert(#cropSize == 2, "cropSize should have two elements only")
+   local _input = input
+   if input:dim() == 3 then
+      _input = input:view(1, input:size(1), input:size(2), input:size(3))
+   end
+   assert(cropSize[1] > 0 and cropSize[1] <= _input:size(3),
+         "0 < cropSize[1] <= input:size(3) not satisfied")
+   assert(cropSize[2] > 0 and cropSize[2] <= _input:size(4),
+        "0 < cropSize[1] <= input:size(3) not satisfied")
+
+   local inputHeight = _input:size(3)
+   local inputWidth = _input:size(4)
+
+   local rowStart = 1 + math.floor((inputHeight - cropSize[1])/2.0)
+   local rowEnd = rowStart + cropSize[1] - 1
+   local colStart = 1 +  math.floor((inputWidth - cropSize[2])/2.0)
+   local colEnd = colStart + cropSize[2] - 1
+   if input:dim() == 3 then
+      return input[{{}, {rowStart, rowEnd}, {colStart, colEnd}}]
+   else
+      return input[{{}, {}, {rowStart, rowEnd}, {colStart, colEnd}}]
+   end
+end
+
+local function getTensorHeightAndWidth(tensor)
+   local heightIdx = 2
+   local widthIdx = 3
+   if tensor:dim() == 4 then
+      heightIdx = 3
+      widthIdx = 4
+   end
+   return tensor:size(heightIdx), tensor:size(widthIdx)
+end
+
+local function inputResolutionIsSmallerThanTargetResolution(input, target)
+   local inputHeight, inputWidth = getTensorHeightAndWidth(input)
+   local targetHeight, targetWidth = getTensorHeightAndWidth(target)
+   return inputHeight <= targetHeight and inputWidth <= targetWidth
+end
+
+function SpatialAutoCropMSECriterion:updateOutput(input, target)
+   assert(input:dim() == target:dim(), "input and target should have the same number of dimensions")
+   assert(input:dim() == 4 or input:dim() == 3, "input and target must have 3 or 4 dimensions")
+   assert(inputResolutionIsSmallerThanTargetResolution(input, target),
+   "Spatial resolution of input should be less than or equal to the spatial resolution of the target")
+
+   local inputHeight, inputWidth = getTensorHeightAndWidth(input)
+   local targetCropped = centerCrop(target, {inputHeight, inputWidth})
+   return parent.updateOutput(self, input, targetCropped)
+end
+
+
+function SpatialAutoCropMSECriterion:updateGradInput(input, gradOutput)
+   assert(input:dim() == gradOutput:dim(), "input and gradOutput should have the same number of dimensions")
+   assert(input:dim() == 4 or input:dim() == 3, "input and gradOutput must have 3 or 4 dimensions")
+   assert(input:isSameSizeAs(gradOutput), "gradOutput and input must have the same size")
+
+   return parent.updateGradInput(self, input, gradOutput)
+end
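
Usage sketch under assumed sizes (not part of the upstream diff): the target is
center-cropped to the input's spatial resolution before the standard MSE is
computed. Note that updateGradInput, per the asserts above, expects a gradOutput
of the same size as the input.

    require 'nn'

    local crit   = nn.SpatialAutoCropMSECriterion()
    local input  = torch.randn(4, 3, 10, 10)   -- batch x channels x 10 x 10
    local target = torch.randn(4, 3, 14, 14)   -- larger target, cropped to 10 x 10
    local loss   = crit:forward(input, target)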
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 01a08cd..15a2b4b 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -73,26 +73,9 @@ local function backCompatibility(self)
    end
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-	 self._gradOutput = self._gradOutput or gradOutput.new()
-	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-	 gradOutput = self._gradOutput
-      end
-   end
-   return input, gradOutput
-end
-
 function SpatialConvolution:updateOutput(input)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    backCompatibility(self)
-   input = makeContiguous(self, input)
    input.THNN.SpatialConvolutionMM_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -111,7 +94,6 @@ function SpatialConvolution:updateGradInput(input, gradOutput)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    if self.gradInput then
       backCompatibility(self)
-      input, gradOutput = makeContiguous(self, input, gradOutput)
       input.THNN.SpatialConvolutionMM_updateGradInput(
          input:cdata(),
          gradOutput:cdata(),
@@ -131,7 +113,6 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    scale = scale or 1
    backCompatibility(self)
-   input, gradOutput = makeContiguous(self, input, gradOutput)
    input.THNN.SpatialConvolutionMM_accGradParameters(
       input:cdata(),
       gradOutput:cdata(),
diff --git a/SpatialConvolutionLocal.lua b/SpatialConvolutionLocal.lua
index 3abc46b..9494c2f 100644
--- a/SpatialConvolutionLocal.lua
+++ b/SpatialConvolutionLocal.lua
@@ -48,22 +48,6 @@ function SpatialConvolutionLocal:reset(stdv)
    end
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-	 self._gradOutput = self._gradOutput or gradOutput.new()
-	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-	 gradOutput = self._gradOutput
-      end
-   end
-   return input, gradOutput
-end
-
 local function viewWeight(self)
    self.weight = self.weight:view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
    if self.gradWeight and self.gradWeight:dim() > 0 then
@@ -118,7 +102,6 @@ function SpatialConvolutionLocal:updateOutput(input)
    self.fgradInput = self.fgradInput or input.new()
    checkInputSize(self, input)
    viewWeight(self)
-   input = makeContiguous(self, input)
    input.THNN.SpatialConvolutionLocal_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -141,7 +124,6 @@ function SpatialConvolutionLocal:updateGradInput(input, gradOutput)
    checkOutputSize(self, input, gradOutput)
    if self.gradInput then
       viewWeight(self)
-      input, gradOutput = makeContiguous(self, input, gradOutput)
       input.THNN.SpatialConvolutionLocal_updateGradInput(
          input:cdata(),
          gradOutput:cdata(),
@@ -164,7 +146,6 @@ function SpatialConvolutionLocal:accGradParameters(input, gradOutput, scale)
    scale = scale or 1
    checkInputSize(self, input)
    checkOutputSize(self, input, gradOutput)
-   input, gradOutput = makeContiguous(self, input, gradOutput)
    viewWeight(self)
    input.THNN.SpatialConvolutionLocal_accGradParameters(
       input:cdata(),
diff --git a/SpatialConvolutionMM.lua b/SpatialConvolutionMM.lua
index f3e5293..f20734f 100644
--- a/SpatialConvolutionMM.lua
+++ b/SpatialConvolutionMM.lua
@@ -50,22 +50,6 @@ function SpatialConvolutionMM:reset(stdv)
    end
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-	 self._gradOutput = self._gradOutput or gradOutput.new()
-	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-	 gradOutput = self._gradOutput
-      end
-   end
-   return input, gradOutput
-end
-
 function SpatialConvolutionMM:updateOutput(input)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    self.finput = self.finput or input.new()
@@ -76,7 +60,6 @@ function SpatialConvolutionMM:updateOutput(input)
       self.padH = self.padding
       self.padding = nil
    end
-   input = makeContiguous(self, input)
    input.THNN.SpatialConvolutionMM_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -94,7 +77,6 @@ end
 function SpatialConvolutionMM:updateGradInput(input, gradOutput)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    if self.gradInput then
-      input, gradOutput = makeContiguous(self, input, gradOutput)
       input.THNN.SpatialConvolutionMM_updateGradInput(
          input:cdata(),
          gradOutput:cdata(),
@@ -113,7 +95,6 @@ end
 function SpatialConvolutionMM:accGradParameters(input, gradOutput, scale)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    scale = scale or 1
-   input, gradOutput = makeContiguous(self, input, gradOutput)
    assert((self.bias and self.gradBias) or (self.bias == nil and self.gradBias == nil))
    input.THNN.SpatialConvolutionMM_accGradParameters(
       input:cdata(),
diff --git a/SpatialDilatedConvolution.lua b/SpatialDilatedConvolution.lua
index 0ae914e..a0590c7 100644
--- a/SpatialDilatedConvolution.lua
+++ b/SpatialDilatedConvolution.lua
@@ -8,26 +8,9 @@ function SpatialDilatedConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW,
    self.dilationH = dilationH or 1
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-	 self._gradOutput = self._gradOutput or gradOutput.new()
-	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-	 gradOutput = self._gradOutput
-      end
-   end
-   return input, gradOutput
-end
-
 function SpatialDilatedConvolution:updateOutput(input)
    self.finput = self.finput or self.weight.new()
    self.fgradInput = self.fgradInput or self.weight.new()
-   input = makeContiguous(self, input)
    input.THNN.SpatialDilatedConvolution_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -45,7 +28,6 @@ end
 
 function SpatialDilatedConvolution:updateGradInput(input, gradOutput)
    if self.gradInput then
-      input, gradOutput = makeContiguous(self, input, gradOutput)
       self.fgradInput = self.fgradInput or self.weight.new()
       input.THNN.SpatialDilatedConvolution_updateGradInput(
          input:cdata(),
@@ -64,7 +46,6 @@ end
 
 function SpatialDilatedConvolution:accGradParameters(input, gradOutput, scale)
    scale = scale or 1
-   input, gradOutput = makeContiguous(self, input, gradOutput)
    self.fgradInput = self.fgradInput or self.weight.new()
    input.THNN.SpatialDilatedConvolution_accGradParameters(
       input:cdata(),
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index a234769..e6019bc 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -55,22 +55,6 @@ function SpatialFullConvolution:reset(stdv)
    end
 end
 
-local function makeContiguous(self, input, gradOutput)
-  if not input:isContiguous() then
-    self._input = self._input or input.new()
-    self._input:resizeAs(input):copy(input)
-    input = self._input
-  end
-  if gradOutput then
-    if not gradOutput:isContiguous() then
-      self._gradOutput = self._gradOutput or gradOutput.new()
-      self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-      gradOutput = self._gradOutput
-    end
-  end
-  return input, gradOutput
-end
-
 local function calculateAdj(targetSize, ker, pad, stride)
   return (targetSize + 2 * pad - ker) % stride
 end
@@ -103,7 +87,6 @@ function SpatialFullConvolution:updateOutput(input)
     self.fgradInput = self.fgradInput or input.new()
   end
 
-  inputTensor = makeContiguous(self, inputTensor)
   inputTensor.THNN.SpatialFullConvolution_updateOutput(
     inputTensor:cdata(),
     self.output:cdata(),
@@ -144,7 +127,6 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
       end
     end
 
-    inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
     inputTensor.THNN.SpatialFullConvolution_updateGradInput(
       inputTensor:cdata(),
       gradOutput:cdata(),
@@ -190,7 +172,6 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
     adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
   end
 
-  inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
   inputTensor.THNN.SpatialFullConvolution_accGradParameters(
     inputTensor:cdata(),
     gradOutput:cdata(),
diff --git a/SpatialUpSamplingBilinear.lua b/SpatialUpSamplingBilinear.lua
index 8f19f91..e86beb3 100644
--- a/SpatialUpSamplingBilinear.lua
+++ b/SpatialUpSamplingBilinear.lua
@@ -79,7 +79,6 @@ function SpatialUpSamplingBilinear:updateOutput(input)
    local xdim = input:dim()
    local ydim = xdim - 1
    self:setSize(input)
-   self.output:resize(self.outputSize)
    input.THNN.SpatialUpSamplingBilinear_updateOutput(
       input:cdata(),
       self.output:cdata(),
diff --git a/SpatialUpSamplingNearest.lua b/SpatialUpSamplingNearest.lua
index b1b261a..362ae73 100644
--- a/SpatialUpSamplingNearest.lua
+++ b/SpatialUpSamplingNearest.lua
@@ -39,13 +39,6 @@ function SpatialUpSamplingNearest:updateOutput(input)
    end
    self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
    self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
-   -- Resize the output if needed
-   if input:dim() == 3 then
-     self.output:resize(self.outputSize[1], self.outputSize[2],
-       self.outputSize[3])
-   else
-     self.output:resize(self.outputSize)
-   end
    input.THNN.SpatialUpSamplingNearest_updateOutput(
       input:cdata(),
       self.output:cdata(),
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index cdf0217..eb010bd 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -67,5 +67,7 @@ function TemporalConvolution:accGradParameters(input, gradOutput, scale)
    )
 end
 
--- we do not need to accumulate parameters when sharing
-TemporalConvolution.sharedAccUpdateGradParameters = TemporalConvolution.accUpdateGradParameters
+function TemporalConvolution:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 89ce106..24dbfe3 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -124,6 +124,9 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
          self.gradWeight:cdata(),
          self.gradBias:cdata(),
          self.finput:cdata(),
+         self.kT, self.kW, self.kH,
+         self.dT, self.dW, self.dH,
+         self.padT, self.padW, self.padH,
          scale or 1
       )
    end
diff --git a/VolumetricDilatedConvolution.lua b/VolumetricDilatedConvolution.lua
index fc7f037..f1337eb 100644
--- a/VolumetricDilatedConvolution.lua
+++ b/VolumetricDilatedConvolution.lua
@@ -9,26 +9,9 @@ function VolumetricDilatedConvolution:__init(nInputPlane, nOutputPlane, kT, kW,
    self.dilationH = dilationH or 1
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-	 self._gradOutput = self._gradOutput or gradOutput.new()
-	 self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-	 gradOutput = self._gradOutput
-      end
-   end
-   return input, gradOutput
-end
-
 function VolumetricDilatedConvolution:updateOutput(input)
    self.finput = self.finput or self.weight.new()
    self.fgradInput = self.fgradInput or self.weight.new()
-   input = makeContiguous(self, input)
    input.THNN.VolumetricDilatedConvolution_updateOutput(
       input:cdata(),
       self.output:cdata(),
@@ -46,7 +29,6 @@ end
 
 function VolumetricDilatedConvolution:updateGradInput(input, gradOutput)
    if self.gradInput then
-      input, gradOutput = makeContiguous(self, input, gradOutput)
       self.fgradInput = self.fgradInput or self.weight.new()
       input.THNN.VolumetricDilatedConvolution_updateGradInput(
          input:cdata(),
@@ -65,7 +47,6 @@ end
 
 function VolumetricDilatedConvolution:accGradParameters(input, gradOutput, scale)
    scale = scale or 1
-   input, gradOutput = makeContiguous(self, input, gradOutput)
    self.fgradInput = self.fgradInput or self.weight.new()
    input.THNN.VolumetricDilatedConvolution_accGradParameters(
       input:cdata(),
diff --git a/VolumetricDilatedMaxPooling.lua b/VolumetricDilatedMaxPooling.lua
index f4c8d5b..249b2b5 100644
--- a/VolumetricDilatedMaxPooling.lua
+++ b/VolumetricDilatedMaxPooling.lua
@@ -41,9 +41,11 @@ function VolumetricDilatedMaxPooling:updateGradInput(input, gradOutput)
       gradOutput:cdata(),
       self.gradInput:cdata(),
       self.indices:cdata(),
+      self.kT, self.kW, self.kH,
       self.dT, self.dW, self.dH,
       self.padT, self.padW, self.padH,
-      self.dilationT, self.dilationW, self.dilationH
+      self.dilationT, self.dilationW, self.dilationH,
+      self.ceil_mode
    )
    return self.gradInput
 end
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 3c86a14..58eaa1d 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -57,22 +57,6 @@ function VolumetricFullConvolution:reset(stdv)
    self.bias:uniform(-stdv, stdv)
 end
 
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput then
-      if not gradOutput:isContiguous() then
-         self._gradOutput = self._gradOutput or gradOutput.new()
-         self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
-         gradOutput = self._gradOutput
-     end
-   end
-   return input, gradOutput
-end
-
 local function calculateAdj(targetSize, ker, pad, stride)
   return (targetSize + 2 * pad - ker) % stride
 end
@@ -113,7 +97,6 @@ function VolumetricFullConvolution:updateOutput(input)
     adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
   end
 
-   inputTensor = makeContiguous(self, inputTensor)
    inputTensor.THNN.VolumetricFullConvolution_updateOutput(
       inputTensor:cdata(),
       self.output:cdata(),
@@ -153,7 +136,6 @@ function VolumetricFullConvolution:updateGradInput(input, gradOutput)
       end
     end
 
-   inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
    inputTensor.THNN.VolumetricFullConvolution_updateGradInput(
       inputTensor:cdata(),
       gradOutput:cdata(),
@@ -199,7 +181,6 @@ function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
     adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
   end
 
-   inputTensor, gradOutput = makeContiguous(self, inputTensor, gradOutput)
    inputTensor.THNN.VolumetricFullConvolution_accGradParameters(
       inputTensor:cdata(),
       gradOutput:cdata(),
diff --git a/VolumetricMaxPooling.lua b/VolumetricMaxPooling.lua
index 20733ed..e25c5b3 100644
--- a/VolumetricMaxPooling.lua
+++ b/VolumetricMaxPooling.lua
@@ -65,8 +65,10 @@ function VolumetricMaxPooling:updateGradInput(input, gradOutput)
       gradOutput:cdata(),
       self.gradInput:cdata(),
       self.indices:cdata(),
+      self.kT, self.kW, self.kH,
       self.dT, self.dW, self.dH,
-      self.padT, self.padW, self.padH
+      self.padT, self.padW, self.padH,
+      self.ceil_mode
    )
    return self.gradInput
 end
diff --git a/doc/containers.md b/doc/containers.md
index 44060e8..98264fa 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -41,14 +41,19 @@ E.g.
 creating a one hidden-layer multi-layer perceptron is thus just as easy as:
 ```lua
 mlp = nn.Sequential()
-mlp:add( nn.Linear(10, 25) ) -- 10 input, 25 hidden units
-mlp:add( nn.Tanh() ) -- some hyperbolic tangent transfer function
-mlp:add( nn.Linear(25, 1) ) -- 1 output
+mlp:add(nn.Linear(10, 25)) -- Linear module (10 inputs, 25 hidden units)
+mlp:add(nn.Tanh())         -- apply hyperbolic tangent transfer function on each hidden unit
+mlp:add(nn.Linear(25, 1))  -- Linear module (25 inputs, 1 output)
 
-print(mlp:forward(torch.randn(10)))
-```
-which gives the output:
-```lua
+> mlp
+nn.Sequential {
+  [input -> (1) -> (2) -> (3) -> output]
+  (1): nn.Linear(10 -> 25)
+  (2): nn.Tanh
+  (3): nn.Linear(25 -> 1)
+}
+
+> print(mlp:forward(torch.randn(10)))
 -0.1815
 [torch.Tensor of dimension 1]
 ```
@@ -104,13 +109,15 @@ on dimension `inputDimension`. It concatenates the results of its contained modu
 
 Example:
 ```lua
- mlp=nn.Parallel(2,1);     -- iterate over dimension 2 of input
- mlp:add(nn.Linear(10,3)); -- apply to first slice
- mlp:add(nn.Linear(10,2))  -- apply to first second slice
- print(mlp:forward(torch.randn(10,2)))
-```
-gives the output:
-```lua
+mlp = nn.Parallel(2,1);   -- Parallel container will associate a module to each slice of dimension 2
+                           -- (column space), and concatenate the outputs over the 1st dimension.
+                           
+mlp:add(nn.Linear(10,3)); -- Linear module (input 10, output 3), applied on 1st slice of dimension 2
+mlp:add(nn.Linear(10,2))  -- Linear module (input 10, output 2), applied on 2nd slice of dimension 2
+ 
+                                  -- After going through the Linear module the outputs are
+                                  -- concatenated along the unique dimension, to form 1D Tensor
+> mlp:forward(torch.randn(10,2)) -- of size 5.
 -0.5300
 -1.1015
  0.7764
@@ -122,26 +129,40 @@ gives the output:
 A more complicated example:
 ```lua
 
-mlp=nn.Sequential();
-c=nn.Parallel(1,2)
-for i=1,10 do
+mlp = nn.Sequential();
+c = nn.Parallel(1,2)     -- Parallel container will associate a module to each slice of dimension 1
+                         -- (row space), and concatenate the outputs over the 2nd dimension.           
+                         
+for i=1,10 do            -- Add 10 Linear+Reshape modules in parallel (input = 3, output = 2x1)
  local t=nn.Sequential()
- t:add(nn.Linear(3,2))
- t:add(nn.Reshape(2,1))
+ t:add(nn.Linear(3,2))   -- Linear module (input = 3, output = 2)
+ t:add(nn.Reshape(2,1))  -- Reshape 1D Tensor of size 2 to 2D Tensor of size 2x1
  c:add(t)
 end
-mlp:add(c)
 
-pred=mlp:forward(torch.randn(10,3))
-print(pred)
+mlp:add(c)               -- Add the Parallel container in the Sequential container
+
+pred = mlp:forward(torch.randn(10,3)) -- 2D Tensor of size 10x3 goes through the Sequential container
+                                      -- which contains a Parallel container of 10 Linear+Reshape.
+                                      -- Each Linear+Reshape module receives a slice of dimension 1
+                                      -- which corresponds to a 1D Tensor of size 3.
+                                      -- Eventually all the Linear+Reshape modules' outputs of size 2x1
+                                      -- are concatenated along the 2nd dimension (column space)
+                                      -- to form pred, a 2D Tensor of size 2x10.
+
+> pred
+-0.7987 -0.4677 -0.1602 -0.8060  1.1337 -0.4781  0.1990  0.2665 -0.1364  0.8109
+-0.2135 -0.3815  0.3964 -0.4078  0.0516 -0.5029 -0.9783 -0.5826  0.4474  0.6092
+[torch.DoubleTensor of size 2x10]
+
 
-for i=1,10000 do     -- Train for a few iterations
- x=torch.randn(10,3);
- y=torch.ones(2,10);
- pred=mlp:forward(x)
+for i = 1, 10000 do     -- Train for a few iterations
+ x = torch.randn(10,3);
+ y = torch.ones(2,10);
+ pred = mlp:forward(x)
 
- criterion= nn.MSECriterion()
- local err=criterion:forward(pred,y)
+ criterion = nn.MSECriterion()
+ local err = criterion:forward(pred,y)
  local gradCriterion = criterion:backward(pred,y);
  mlp:zeroGradParameters();
  mlp:backward(x, gradCriterion); 
@@ -161,13 +182,11 @@ Concat concatenates the output of one layer of "parallel" modules along the
 provided dimension `dim`: they take the same inputs, and their output is
 concatenated.
 ```lua
-mlp=nn.Concat(1);
+mlp = nn.Concat(1);
 mlp:add(nn.Linear(5,3))
 mlp:add(nn.Linear(5,7))
-print(mlp:forward(torch.randn(5)))
-```
-which gives the output:
-```lua
+
+> print(mlp:forward(torch.randn(5)))
  0.7486
  0.1349
  0.7924
@@ -205,14 +224,13 @@ spatial dimensions and adds zero-padding around the smaller Tensors.
 inputSize = 3
 outputSize = 2
 input = torch.randn(inputSize,7,7)
+
 mlp=nn.DepthConcat(1);
 mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 1, 1))
 mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 3, 3))
 mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
-print(mlp:forward(input))
-```
-which gives the output:
-```lua
+
+> print(mlp:forward(input))
 (1,.,.) = 
  -0.2874  0.6255  1.1122  0.4768  0.9863 -0.2201 -0.1516
   0.2779  0.9295  1.1944  0.4457  1.1470  0.9693  0.1654
@@ -287,18 +305,17 @@ Bottle allows varying dimensionality input to be forwarded through any module th
 Bottle can be used to forward a 4D input of varying sizes through a 2D module `b x n`. The module `Bottle(module, 2)` will accept input of shape `p x q x r x n` and outputs with the shape `p x q x r x m`. Internally Bottle will view the input of `module` as `p*q*r x n`, and view the output as `p x q x r x m`. The numbers `p x q x r` are inferred from the input and can change for every forward/backward pass.
 
 ```lua
-input=torch.Tensor(4, 5, 3, 10)
-mlp=nn.Bottle(nn.Linear(10, 2))
-print(input:size())
-print(mlp:forward(input):size())
-```
-which gives the output:
-```lua
+input = torch.Tensor(4, 5, 3, 10)
+mlp = nn.Bottle(nn.Linear(10, 2))
+
+> print(input:size())
   4
   5
   3
  10
 [torch.LongStorage of size 4]
+
+> print(mlp:forward(input):size())
  4
  5
  3
diff --git a/doc/convolution.md b/doc/convolution.md
index dfc48b9..21cfa57 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -83,14 +83,14 @@ If the input sequence is a 3D tensor of dimension `nBatchFrame x nInputFrame x i
 `nBatchFrame x nOutputFrame x outputFrameSize`.
 
 The parameters of the convolution can be found in `self.weight` (Tensor of
-size `outputFrameSize x (inputFrameSize x kW) `) and `self.bias` (Tensor of
+size `outputFrameSize x (kW x inputFrameSize) `) and `self.bias` (Tensor of
 size `outputFrameSize`). The corresponding gradients can be found in
 `self.gradWeight` and `self.gradBias`.
 
 For a 2D input, the output value of the layer can be precisely described as:
 ```lua
 output[t][i] = bias[i]
-  + sum_j sum_{k=1}^kW weight[i][j][k]
+  + sum_j sum_{k=1}^kW weight[i][k][j]
                                 * input[dW*(t-1)+k)][j]
 ```
 
@@ -190,7 +190,7 @@ size `inputFrameSize`). The corresponding gradients can be found in
 
 The output value of the layer can be precisely described as:
 ```lua
-output[i][t] = bias[i] + weight[i] * sum_{k=1}^kW input[i][dW*(t-1)+k)]
+output[t][i] = bias[i] + weight[i] * sum_{k=1}^kW input[dW*(t-1)+k][i]
 ```
 
 <a name="nn.LookupTable"></a>
@@ -345,8 +345,8 @@ The parameters are the following:
   * `kH`: The kernel height of the convolution
   * `dW`: The step of the convolution in the width dimension. Default is `1`.
   * `dH`: The step of the convolution in the height dimension. Default is `1`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `padW`, a good number is `(kH-1)/2`.
+  * `padW`: Additional zeros added to the input plane data on both sides of width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of height axis. Default is `0`. `(kH-1)/2` is often used here.
 
 Note that depending of the size of your kernel, several (of the last)
 columns or rows of the input image might be lost. It is up to the user to
@@ -438,8 +438,8 @@ The parameters are the following:
   * `kH`: The kernel height of the convolution
   * `dW`: The step of the convolution in the width dimension. Default is `1`.
   * `dH`: The step of the convolution in the height dimension. Default is `1`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `padW`: Additional zeros added to the input plane data on both sides of width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of height axis. Default is `0`. `(kH-1)/2` is often used here.
   * `adjW`: Extra width to add to the output image. Default is `0`. Cannot be greater than dW-1.
   * `adjH`: Extra height to add to the output image. Default is `0`. Cannot be greater than dH-1.
 
@@ -470,8 +470,8 @@ The parameters are the following:
   * `kH`: The kernel height of the convolution
   * `dW`: The step of the convolution in the width dimension. Default is `1`.
   * `dH`: The step of the convolution in the height dimension. Default is `1`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `padW`: Additional zeros added to the input plane data on both sides of width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of height axis. Default is `0`. `(kH-1)/2` is often used here.
   * `dilationW`: The number of pixels to skip. Default is `1`. `1` makes it a SpatialConvolution
   * `dilationH`: The number of pixels to skip. Default is `1`. `1` makes it a SpatialConvolution
 
@@ -505,8 +505,8 @@ The parameters are the following:
   * `kH`: The kernel height.
   * `dW`: The step in the width dimension. Default is `1`.
   * `dH`: The step in the height dimension. Default is `1`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `padW`: Additional zeros added to the input plane data on both sides of width axis. Default is `0`.
+  * `padH`: Additional zeros added to the input plane data on both sides of height axis. Default is `0`.
 
 If the input image is a 3D tensor `nInputPlane x iH x iW`, the output image size
 will be `nOutputPlane x oH x oW` where
@@ -923,10 +923,9 @@ The parameters are the following:
   * `dT`: The step of the convolution in the time dimension. Default is `1`.
   * `dW`: The step of the convolution in the width dimension. Default is `1`.
   * `dH`: The step of the convolution in the height dimension. Default is `1`.
-  * `padT`: The additional zeros added per time to the input planes. Default is `0`, a good number is `(kT-1)/2`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
-
+  * `padT`: Additional zeros added to the input plane data on both sides of the time axis. Default is `0`. `(kT-1)/2` is often used here.
+  * `padW`: Additional zeros added to the input plane data on both sides of the width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of the height axis. Default is `0`. `(kH-1)/2` is often used here.
 
 Note that depending on the size of your kernel, several (of the last)
 columns or rows of the input image might be lost. It is up to the user to
@@ -949,7 +948,7 @@ size `nOutputPlane`). The corresponding gradients can be found in
 ### VolumetricFullConvolution ###
 
 ```lua
-module = nn.VolumetricFullConvolution(nInputPlane, nOutputPlane, kT, kW, kH, [dT], [dW], [dH], [padT], [padW], [padH])
+module = nn.VolumetricFullConvolution(nInputPlane, nOutputPlane, kT, kW, kH, [dT], [dW], [dH], [padT], [padW], [padH], [adjT], [adjW], [adjH])
 ```
 
 Applies a 3D full convolution over an input image composed of several input planes. The `input` tensor in
@@ -967,16 +966,19 @@ The parameters are the following:
 * `dT`: The step of the convolution in the depth dimension. Default is `1`.
 * `dW`: The step of the convolution in the width dimension. Default is `1`.
 * `dH`: The step of the convolution in the height dimension. Default is `1`.
-* `padT`: The additional zeros added per depth to the input planes. Default is `0`, a good number is `(kT-1)/2`.
-* `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-* `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+* `padT`: Additional zeros added to the input plane data on both sides of the time (depth) axis. Default is `0`. `(kT-1)/2` is often used here.
+* `padW`: Additional zeros added to the input plane data on both sides of the width axis. Default is `0`. `(kW-1)/2` is often used here.
+* `padH`: Additional zeros added to the input plane data on both sides of the height axis. Default is `0`. `(kH-1)/2` is often used here.
+* `adjT`: Extra depth to add to the output image. Default is `0`. Cannot be greater than dT-1.
+* `adjW`: Extra width to add to the output image. Default is `0`. Cannot be greater than dW-1.
+* `adjH`: Extra height to add to the output image. Default is `0`. Cannot be greater than dH-1.
 
 If the input image is a 3D tensor `nInputPlane x depth x height x width`, the output image size
 will be `nOutputPlane x odepth x oheight x owidth` where
 ```lua
-odepth  = (depth  - 1) * dT - 2*padT + kT
-owidth  = (width  - 1) * dW - 2*padW + kW
-oheight = (height - 1) * dH - 2*padH + kH
+odepth  = (depth  - 1) * dT - 2*padT + kT + adjT
+owidth  = (width  - 1) * dW - 2*padW + kW + adjW
+oheight = (height - 1) * dH - 2*padH + kH + adjH
 ```
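A minimal sketch that plugs concrete values into these formulas, using the constructor order given above:

```lua
require 'nn'

-- nn.VolumetricFullConvolution(nInputPlane, nOutputPlane, kT, kW, kH,
--                              dT, dW, dH, padT, padW, padH, adjT, adjW, adjH)
local m = nn.VolumetricFullConvolution(4, 2, 4, 4, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1)
local input = torch.randn(4, 8, 8, 8)   -- nInputPlane x depth x height x width
print(m:forward(input):size())          -- 2 x 17 x 17 x 17: (8-1)*2 - 2*1 + 4 + 1
```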
 
 <a name="nn.VolumetricDilatedConvolution"></a>
@@ -998,9 +1000,9 @@ The parameters are the following:
   * `dT`: The step of the convolution in the depth dimension. Default is `1`.
   * `dW`: The step of the convolution in the width dimension. Default is `1`.
   * `dH`: The step of the convolution in the height dimension. Default is `1`.
-  * `padT`: The additional zeros added per depth to the input planes. Default is `0`, a good number is `(kT-1)/2`.
-  * `padW`: The additional zeros added per width to the input planes. Default is `0`, a good number is `(kW-1)/2`.
-  * `padH`: The additional zeros added per height to the input planes. Default is `0`, a good number is `(kH-1)/2`.
+  * `padT`: Additional zeros added to the input plane data on both sides of the time (depth) axis. Default is `0`. `(kT-1)/2` is often used here.
+  * `padW`: Additional zeros added to the input plane data on both sides of the width axis. Default is `0`. `(kW-1)/2` is often used here.
+  * `padH`: Additional zeros added to the input plane data on both sides of the height axis. Default is `0`. `(kH-1)/2` is often used here.
   * `dilationT`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
   * `dilationW`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
   * `dilationH`: The number of pixels to skip. Default is `1`. `1` makes it a VolumetricConvolution
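A brief sketch, assuming the constructor takes its arguments in the same order as the parameters listed above:

```lua
require 'nn'

-- assumed order: nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH,
--                padT, padW, padH, dilationT, dilationW, dilationH
local m = nn.VolumetricDilatedConvolution(2, 4, 3, 3, 3, 1, 1, 1, 2, 2, 2, 2, 2, 2)
local input = torch.randn(2, 16, 16, 16)
print(m:forward(input):size())  -- 4 x 16 x 16 x 16: padding 2 offsets dilation 2
```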
diff --git a/doc/criterion.md b/doc/criterion.md
index 337d873..cb2bbd0 100644
--- a/doc/criterion.md
+++ b/doc/criterion.md
@@ -18,11 +18,13 @@ target, they compute a gradient according to a given loss function.
     * [`AbsCriterion`](#nn.AbsCriterion): measures the mean absolute value of the element-wise difference between input;
     * [`SmoothL1Criterion`](#nn.SmoothL1Criterion): a smooth version of the AbsCriterion;
     * [`MSECriterion`](#nn.MSECriterion): mean square error (a classic);
+    * [`SpatialAutoCropMSECriterion`](#nn.SpatialAutoCropMSECriterion): Spatial mean square error when the input is spatially smaller than the target, by only comparing their spatial overlap;
     * [`DistKLDivCriterion`](#nn.DistKLDivCriterion): Kullback–Leibler divergence (for fitting continuous probability distributions);
   * Embedding criterions (measuring whether two inputs are similar or dissimilar):
     * [`HingeEmbeddingCriterion`](#nn.HingeEmbeddingCriterion): takes a distance as input;
     * [`L1HingeEmbeddingCriterion`](#nn.L1HingeEmbeddingCriterion): L1 distance between two inputs;
     * [`CosineEmbeddingCriterion`](#nn.CosineEmbeddingCriterion): cosine distance between two inputs;
+    * [`DistanceRatioCriterion`](#nn.DistanceRatioCriterion): Probabilistic criterion for training a siamese model with triplets.
   * Miscellaneous criterions:
     * [`MultiCriterion`](#nn.MultiCriterion) : a weighted sum of other criterions each applied to the same input and target;
     * [`ParallelCriterion`](#nn.ParallelCriterion) : a weighted sum of other criterions each applied to a different input and target;
@@ -256,7 +258,7 @@ or in the case of the weights argument being specified:
 loss(o, t) = - 1/n sum_i weights[i] * (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
 ```
 
-This is used for measuring the error of a reconstruction in for example an auto-encoder. Note that the targets `t[i]` should be numbers between 0 and 1, for instance, the output of an [`nn.Sigmoid`](transfer.md#nn.Sigmoid) layer.
+This is used for measuring the error of a reconstruction in, for example, an auto-encoder. Note that the outputs `o[i]` should be numbers between 0 and 1, for instance the output of an [`nn.Sigmoid`](transfer.md#nn.Sigmoid) layer, and should be interpreted as the probability of predicting `t[i] = 1`. Note that `t[i]` can be either 0 or 1.
 
 By default, the losses are averaged for each minibatch over observations *as well as* over dimensions. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
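For instance, a minimal sketch pairing `Sigmoid` outputs with binary targets (the names here are illustrative):

```lua
require 'nn'

local net  = nn.Sequential():add(nn.Linear(10, 1)):add(nn.Sigmoid())
local crit = nn.BCECriterion()

local x = torch.randn(4, 10)                 -- batch of 4 examples
local t = torch.Tensor(4, 1):bernoulli(0.5)  -- targets are 0 or 1
local o = net:forward(x)                     -- probabilities in (0, 1)
local loss = crit:forward(o, t)
net:backward(x, crit:backward(o, t))
```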
 
@@ -502,6 +504,25 @@ criterion.sizeAverage = false
 By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
 
 
+<a name="nn.SpatialAutoCropMSECriterion"></a>
+## SpatialAutoCropMSECriterion ##
+
+```lua
+criterion = nn.SpatialAutoCropMSECriterion()
+```
+
+Creates a criterion that measures the mean squared error between the input and target, even if the target is spatially larger than the input. It achieves this by center-cropping the target to the same spatial resolution as the input; the mean squared error is then calculated between the input and this cropped target.
+
+If the input and cropped target tensors are `d`-dimensional `Tensor`s with a total of `n` elements, the sum operation operates over all the elements, and divides by `n`.
+
+The division by `n` can be avoided if one sets the internal variable `sizeAverage` to `false`:
+
+```lua
+criterion = nn.SpatialAutoCropMSECriterion()
+criterion.sizeAverage = false
+```
+
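A short sketch of the intended use, with an input that is spatially smaller than the target:

```lua
require 'nn'

local crit   = nn.SpatialAutoCropMSECriterion()
local input  = torch.rand(3, 6, 6)   -- prediction
local target = torch.rand(3, 8, 8)   -- larger map; its centred 6x6 crop is compared
local loss      = crit:forward(input, target)
local gradInput = crit:backward(input, target)
```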
+
 <a name="nn.MultiCriterion"></a>
 ## MultiCriterion ##
 
@@ -709,6 +730,61 @@ For batched inputs, if the internal variable `sizeAverage` is equal to `true`, t
 
 By default, the losses are averaged over observations for each minibatch. However, if the field `sizeAverage` is set to `false`, the losses are instead summed.
 
+<a name="nn.DistanceRatioCriterion"></a>
+## DistanceRatioCriterion ##
+Reference: [Unsupervised Learning through Spatial Contrasting](https://arxiv.org/pdf/1610.00243.pdf)
+
+```lua
+criterion = nn.DistanceRatioCriterion(sizeAverage)
+```
+
+This criterion is a probabilistic treatment of the margin cost. The model is trained using sample triplets `{Xs, Xa, Xd}`, where `Xa` is the anchor sample, `Xs` is a sample similar to the anchor and `Xd` is a sample dissimilar to the anchor. Let `Ds` be the distance between the embeddings of `{Xs, Xa}` and `Dd` the distance between the embeddings of `{Xa, Xd}`; the loss is then defined as follows:
+
+```lua
+   loss = -log( exp(-Ds) / ( exp(-Ds) + exp(-Dd) ) )
+```
+
+Example:
+```lua
+   torch.setdefaulttensortype("torch.FloatTensor")
+
+   require 'nn'
+
+   -- triplet : with batchSize of 32 and dimensionality 512
+   sample = {torch.rand(32, 512), torch.rand(32, 512), torch.rand(32, 512)}
+
+   embeddingModel = nn.Sequential()
+   embeddingModel:add(nn.Linear(512, 96)):add(nn.ReLU())
+
+   tripleModel = nn.ParallelTable()
+   tripleModel:add(embeddingModel)
+   tripleModel:add(embeddingModel:clone('weight', 'bias', 
+                                        'gradWeight', 'gradBias'))
+   tripleModel:add(embeddingModel:clone('weight', 'bias',
+                                        'gradWeight', 'gradBias'))
+
+   -- Similar sample distance w.r.t anchor sample
+   posDistModel = nn.Sequential()
+   posDistModel:add(nn.NarrowTable(1,2)):add(nn.PairwiseDistance())
+
+   -- Different sample distance w.r.t anchor sample
+   negDistModel = nn.Sequential()
+   negDistModel:add(nn.NarrowTable(2,2)):add(nn.PairwiseDistance())
+
+   distanceModel = nn.ConcatTable():add(posDistModel):add(negDistModel)
+
+   -- Complete Model
+   model = nn.Sequential():add(tripleModel):add(distanceModel)
+
+   -- DistanceRatioCriterion
+   criterion = nn.DistanceRatioCriterion(true)
+
+   -- Forward & Backward
+   output = model:forward(sample)
+   loss   = criterion:forward(output)
+   dLoss  = criterion:backward(output)
+   model:backward(sample, dLoss)
+```
 
 <a name="nn.MarginRankingCriterion"></a>
 ## MarginRankingCriterion ##
diff --git a/doc/simple.md b/doc/simple.md
index b7044ae..09c60ca 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -25,7 +25,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [MaskedSelect](#nn.MaskedSelect) : a [masked select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-maskedselect-index) module performs the torch.maskedSelect operation ;
     * [Index](#nn.Index) : a [index](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-indexdim-index) over a given dimension ;
     * [Squeeze](#nn.Squeeze) : [squeezes](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-squeezedim) the input;
-    * [Unsqueeze](#nn.Unsqueeze) : unsqueeze the input, i.e., insert singleton dimension;  
+    * [Unsqueeze](#nn.Unsqueeze) : unsqueeze the input, i.e., insert singleton dimension;
     * [Transpose](#nn.Transpose) : [transposes](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-transposedim1-dim2) the input ;
   * Modules that adapt mathematical Tensor methods :
     * [AddConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#addconstant) : adding a constant ;
@@ -679,8 +679,7 @@ The default is false.
 module = nn.Narrow(dimension, offset, length)
 ```
 
-Narrow is application of [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation in a module.
-The module further supports a negative `length` in order to handle inputs with an unknown size.
+Narrow is an application of the [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation in a module. The module further supports negative `length`, `dim` and `offset` values in order to handle inputs of unknown size.
 
 ```lua
 > x = torch.rand(4, 5)
diff --git a/doc/table.md b/doc/table.md
index ee61719..d5174a7 100644
--- a/doc/table.md
+++ b/doc/table.md
@@ -692,7 +692,7 @@ Forwarding a batch of 2 examples gives us something like this:
 
 `module` = `SelectTable(index)`
 
-Creates a module that takes a `table` as input and outputs the element at index `index` (positive or negative). 
+Creates a module that takes a (nested) `table` as input and outputs the element at index `index`. The `index` can be a string or an integer (positive or negative).
 This can be either a `table` or a [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor).
 
 The gradients of the non-`index` elements are zeroed `Tensor`s of the same size. This is true regardless of the
@@ -719,10 +719,36 @@ Example 1:
 0
 0
 [torch.DoubleTensor of dimension 2x1]
+```
+
+Example 2:
+```lua
+> input = { A=torch.randn(2, 3), B=torch.randn(2, 1) }
+> =nn.SelectTable("A"):forward(input)
+-0.3060  0.1398  0.2707
+ 0.0576  1.5455  0.0610
+[torch.DoubleTensor of dimension 2x3]
+
+> gradInput = nn.SelectTable("A"):backward(input, torch.randn(2, 3))
+
+> gradInput 
+{
+  A : DoubleTensor - size: 2x3
+  B : DoubleTensor - size: 2x1
+}
+
+> gradInput["A"]
+-0.4891 -0.3495 -0.3182
+-2.0999  0.7381 -0.5312
+[torch.DoubleTensor of dimension 2x3]
 
+> gradInput["B"]
+0
+0
+[torch.DoubleTensor of dimension 2x1]
 ```
 
-Example 2:
+Example 3:
 ```lua
 > input = {torch.randn(2, 3), {torch.randn(2, 1), {torch.randn(2, 2)}}}
 
diff --git a/doc/transfer.md b/doc/transfer.md
index 3d2d034..964030a 100644
--- a/doc/transfer.md
+++ b/doc/transfer.md
@@ -465,6 +465,42 @@ gnuplot.grid(true)
 
 ![](image/rrelu.png)
 
+<a name="nn.CReLU"></a>
+## CReLU ##
+```
+f = nn.CReLU(nInputDims, [inplace])
+```
+
+Applies the Concatenated Rectified Linear Unit (`CReLU`) function to the input Tensor, outputting a `Tensor` with twice as many channels. The parameter `nInputDims` is the number of non-batched dimensions; any dimensions beyond that are treated as batch dimensions.
+`CReLU` is defined as:
+
+```
+f(x) = concat(max(0, x), max(0, -x))
+```
+
+i.e. `CReLU` applies `ReLU` to the input, `x`, and the negated input, `-x`, and concatenates the output along the 1st non-batched dimension.
+
+```
+crelu = nn.CReLU(3)
+input = torch.Tensor(2, 3, 20, 20):uniform(-1, 1)
+output = crelu:forward(input)
+output:size()
+2
+6
+20
+20
+[torch.LongStorage of size 4]
+
+input = torch.Tensor(3, 20, 20):uniform(-1, 1)
+output = crelu:forward(input)
+output:size()
+6
+20
+20
+[torch.LongStorage of size 3]
+```
+
+For reference see [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units](https://arxiv.org/abs/1603.05201).
 
 <a name="nn.ELU"></a>
 ## ELU ##
@@ -522,6 +558,16 @@ Can optionally do its operation in-place without using extra state memory:
 f = nn.LeakyReLU(negval, true) -- true = in-place, false = keeping separate state.
 ```
 
+<a name="nn.GatedLinearUnit"></a>
+## GatedLinearUnit ##
+
+Applies a Gated Linear Unit activation function, which halves the size of the input along the chosen dimension, as follows:
+
+`GatedLinearUnit` is defined as `f([x1, x2])` = `x1 * sigmoid(x2)`
+
+where `x1` is the first half of the input vector and `x2` is the second half. The multiplication is component-wise, and the input vector must have an even number of elements.
+
+The GatedLinearUnit optionally takes a `dim` parameter, which is the dimension of the input tensor to operate over. It defaults to the last dimension.
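A minimal sketch (the constructor is assumed to be `nn.GatedLinearUnit([dim])`, consistent with the description above):

```lua
require 'nn'

local glu = nn.GatedLinearUnit()   -- operates over the last dimension by default
local x = torch.randn(4, 10)       -- the gated dimension must have an even size
local y = glu:forward(x)
print(y:size())                    -- 4 x 5: first half gated by sigmoid of second half
```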
 
 <a name="nn.SpatialSoftMax"></a>
 ## SpatialSoftMax ##
diff --git a/init.lua b/init.lua
index 1e3924b..cad1c3c 100644
--- a/init.lua
+++ b/init.lua
@@ -89,11 +89,13 @@ require('nn.Threshold')
 require('nn.ReLU')
 require('nn.ReLU6')
 require('nn.PReLU')
+require('nn.CReLU')
 require('nn.LeakyReLU')
 require('nn.SpatialSoftMax')
 require('nn.SpatialLogSoftMax')
 require('nn.RReLU')
 require('nn.ELU')
+require('nn.GatedLinearUnit')
 
 require('nn.LookupTable')
 require('nn.SpatialConvolution')
@@ -152,6 +154,7 @@ require('nn.MapTable')
 
 require('nn.Criterion')
 require('nn.MSECriterion')
+require('nn.SpatialAutoCropMSECriterion')
 require('nn.SmoothL1Criterion')
 require('nn.MarginCriterion')
 require('nn.SoftMarginCriterion')
@@ -174,6 +177,7 @@ require('nn.WeightedMSECriterion')
 require('nn.BCECriterion')
 require('nn.CrossEntropyCriterion')
 require('nn.ParallelCriterion')
+require('nn.DistanceRatioCriterion')
 
 require('nn.PixelShuffle')
 
diff --git a/lib/THNN/CMakeLists.txt b/lib/THNN/CMakeLists.txt
index cb704b1..33eaf56 100644
--- a/lib/THNN/CMakeLists.txt
+++ b/lib/THNN/CMakeLists.txt
@@ -5,6 +5,11 @@ IF(NOT Torch_FOUND)
   FIND_PACKAGE(Torch REQUIRED)
 ENDIF()
 
+IF(NOT TH_LIBRARIES)
+  SET(TH_LIBRARIES "TH")
+ENDIF(NOT TH_LIBRARIES)
+MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
+
 IF(NOT THNN_INSTALL_LIB_SUBDIR)
   SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
 ENDIF()
@@ -61,6 +66,15 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 SET_TARGET_PROPERTIES(THNN PROPERTIES
   PREFIX "lib"
   IMPORT_PREFIX "lib")
-TARGET_LINK_LIBRARIES(THNN TH)
+
+TARGET_LINK_LIBRARIES(THNN ${TH_LIBRARIES})
+
+# Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
+IF(THNN_SO_VERSION)
+  MESSAGE(STATUS "THNN_SO_VERSION: ${THNN_SO_VERSION}")
+  SET_TARGET_PROPERTIES(THNN PROPERTIES
+    VERSION   ${THNN_SO_VERSION}
+    SOVERSION ${THNN_SO_VERSION})
+ENDIF(THNN_SO_VERSION)
 
 INSTALL(TARGETS THNN LIBRARY DESTINATION ${THNN_INSTALL_LIB_SUBDIR})
diff --git a/lib/THNN/generic/LogSoftMax.c b/lib/THNN/generic/LogSoftMax.c
index 3ed9c3b..a728042 100644
--- a/lib/THNN/generic/LogSoftMax.c
+++ b/lib/THNN/generic/LogSoftMax.c
@@ -76,7 +76,6 @@ void THNN_(LogSoftMax_updateGradInput)(
           THTensor *output)
 {
   THNN_CHECK_SHAPE(input, gradOutput);
-  gradOutput = THTensor_(newContiguous)(gradOutput);
   real *gradInput_data, *gradOutput_data, *output_data;
   ptrdiff_t nframe = 0, dim = 0, stride = 0;
   ptrdiff_t t, d;
diff --git a/lib/THNN/generic/MultiLabelMarginCriterion.c b/lib/THNN/generic/MultiLabelMarginCriterion.c
index fe851c9..16398c1 100644
--- a/lib/THNN/generic/MultiLabelMarginCriterion.c
+++ b/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -35,8 +35,8 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(
 	       && (target->size[1] == dim), 3, "inconsistent target size");
   }
 
-  THArgCheck(THIndexTensor_(minall)(target) >= 0, 3, "target out of range");
-  THArgCheck(THIndexTensor_(maxall)(target) <= dim, 3, "target out of range");
+  THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+  THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
 
   target = THIndexTensor_(newContiguous)(target);
   input = THTensor_(newContiguous)(input);
@@ -128,8 +128,8 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(
 	       && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
   }
 
-  THArgCheck(THIndexTensor_(minall)(target) >= 0, 3, "target out of range");
-  THArgCheck(THIndexTensor_(maxall)(target) <= dim, 3, "target out of range");
+  THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+  THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
 
   THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
   THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
diff --git a/lib/THNN/generic/SpatialAveragePooling.c b/lib/THNN/generic/SpatialAveragePooling.c
index 56db162..c063502 100644
--- a/lib/THNN/generic/SpatialAveragePooling.c
+++ b/lib/THNN/generic/SpatialAveragePooling.c
@@ -2,6 +2,74 @@
 #define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
 #else
 
+static inline void THNN_(SpatialAveragePooling_shapeCheck)(
+	THTensor *input, THTensor *gradOutput,
+	int kH, int kW, int dH, int dW, int padH, int padW,
+	bool ceil_mode) {
+
+  THArgCheck(kW > 0 && kH > 0, 5,
+             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+  THArgCheck(dW > 0 && dH > 0, 8,
+             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+  int ndim = input->nDimension;
+  int dimf = 0;
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+		"3D or 4D input tensor expected but got: %s");
+
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+	     "pad should be smaller than half of kernel size, but got "
+	     "padW = %d, padH = %d, kW = %d, kH = %d",
+	     padW, padH, kW, kH);
+
+  long nInputPlane = input->size[dimh-1];
+  long inputHeight = input->size[dimh];
+  long inputWidth = input->size[dimw];
+  long outputHeight, outputWidth;
+  long nOutputPlane = nInputPlane;
+
+  if(ceil_mode)
+  {
+    outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+    outputWidth  = (long)(ceil((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+  }
+  else
+  {
+    outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+    outputWidth  = (long)(floor((float)(inputWidth  - kW + 2*padW) / dW)) + 1;
+  }
+
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((outputHeight - 1)*dH >= inputHeight + padH)
+      --outputHeight;
+    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
+      --outputWidth;
+  }
+
+  if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%dx%d). "
+	    "Calculated output size: (%dx%dx%d). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+  }
+}
+
 void THNN_(SpatialAveragePooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -31,12 +99,8 @@ void THNN_(SpatialAveragePooling_updateOutput)(
 
   long k;
 
-  THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
-		"3D or 4D (batch mode) tensor expected for input, but got: %s");
-  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
-	     "pad should be smaller than half of kernel size, but got "
-	     "padW = %d, padH = %d, kW = %d, kH = %d",
-	     padW, padH, kW, kH);
+  THNN_(SpatialAveragePooling_shapeCheck)
+    (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode);
 
   if (input->nDimension == 4) {
     nbatch = input->size[0];
@@ -69,21 +133,16 @@ void THNN_(SpatialAveragePooling_updateOutput)(
       --outputWidth;
   }
 
-  THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2,
-	     "input image smaller than (kernel size - 2 * padW). Got "
-	     "inputHeight: %d inputWidth: %d kH %d kW %d padH %d padW %d",
-	     inputHeight, inputWidth, kH, kW, padH, padW);
-
   if (input->nDimension == 3)
     THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
   else
     THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
-  
+
   input = THTensor_(newContiguous)(input);
   THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
   input_data = THTensor_(data)(input);
   output_data = THTensor_(data)(output);
-  
+
 #pragma omp parallel for private(k)
   for(k = 0; k < nInputPlane; k++)
   {
@@ -97,7 +156,7 @@ void THNN_(SpatialAveragePooling_updateOutput)(
       long i;
       for(i = 0; i < outputWidth*outputHeight; i++)
         ptr_output[i] = 0;
-      
+
       for(yy = 0; yy < outputHeight; yy++)
       {
         for(xx = 0; xx < outputWidth; xx++)
@@ -156,7 +215,7 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
   int dimc = 0;
   long nbatch = 1;
   long ndim = 3;
-  
+
   long inputWidth;
   long inputHeight;
   long outputWidth;
@@ -168,6 +227,10 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
 
   long k;
 
+  THNN_(SpatialAveragePooling_shapeCheck)
+    (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+
   if (input->nDimension == 4) {
     nbatch = input->size[0];
     dimw++;
@@ -203,11 +266,8 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
   THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
   THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
 
-  input_data = THTensor_(data)(input);
-
   THTensor_(resizeAs)(gradInput, input);
 
-  input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
   THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
 
@@ -263,7 +323,6 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
     }
   }
 
-  THTensor_(free)(input);
   THTensor_(free)(gradOutput);
 }
 
diff --git a/lib/THNN/generic/SpatialConvolutionLocal.c b/lib/THNN/generic/SpatialConvolutionLocal.c
index 4d446dd..efba30e 100644
--- a/lib/THNN/generic/SpatialConvolutionLocal.c
+++ b/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -224,6 +224,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
      inputHeight, inputWidth, outputHeight, outputWidth);
 
   input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
   long nOutputPlane = THTensor_(size)(weight,1);
 
@@ -266,6 +267,7 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
   THTensor_(transpose)(weight, weight, 1, 2);
 
   THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
   if (freeWeight)
     THTensor_(free)(weight);
 
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index d093bee..83635c1 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -124,6 +124,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
 
+  input = THTensor_(newContiguous)(input);
   int ndim = input->nDimension;
   int dimf = 0;
   int dimh = 1;
@@ -180,6 +181,7 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
     }
   }
 
+  THTensor_(free)(input);
   if (freeWeight)
     THTensor_(free)(weight);
 }
@@ -239,6 +241,9 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
 
@@ -279,6 +284,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
 
   THTensor_(transpose)(weight, weight, 0, 1);
 
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
   if (freeWeight)
     THTensor_(free)(weight);
 }
@@ -345,6 +352,9 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
   THNN_(SpatialConvolutionMM_shapeCheck)
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
   if(input->nDimension == 3)
   {
     THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
@@ -367,6 +377,9 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
       THTensor_(free)(finput_t);
     }
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
   if (freeWeight)
     THTensor_(free)(gradWeight);
 }
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
index 9dcc1b4..8b18910 100644
--- a/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -80,6 +80,7 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
   int nInputPlane = weight->size[1];
   int nOutputPlane = weight->size[0];
 
+  input = THTensor_(newContiguous)(input);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -175,6 +176,8 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
     THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
 }
 
 void THNN_(SpatialDilatedConvolution_updateGradInput)(
@@ -197,6 +200,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
   int nInputPlane = weight->size[1];
   int nOutputPlane = weight->size[0];
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -266,6 +271,9 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
     THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 
@@ -291,6 +299,8 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
   int nInputPlane = gradWeight->size[1];
   int nOutputPlane = gradWeight->size[0];
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -380,6 +390,9 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
     THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 #endif
diff --git a/lib/THNN/generic/SpatialDilatedMaxPooling.c b/lib/THNN/generic/SpatialDilatedMaxPooling.c
index 1a40b8f..5a2b764 100644
--- a/lib/THNN/generic/SpatialDilatedMaxPooling.c
+++ b/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -29,10 +29,6 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
   THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
 		"3D or 4D input tensor expected but got: %s");
 
-  THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2,
-	     "input image (H: %d, W: %d) smaller than kernel "
-	     "size - padding( kH: %d padH: %d kW: %d padW: %d",
-	     input->size[dimh], input->size[dimw], kH, padH, kW, padW);
   THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
 	     "pad should be smaller than half of kernel size, but got "
 	     "padW = %d, padH = %d, kW = %d, kH = %d",
@@ -55,6 +51,16 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
     outputWidth  = (long)(floor((float)(inputWidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
   }
 
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((outputHeight - 1)*dH >= inputHeight + padH)
+      --outputHeight;
+    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
+      --outputWidth;
+  }
+
   if (outputWidth < 1 || outputHeight < 1)
     THError("Given input size: (%dx%dx%d). "
 	    "Calculated output size: (%dx%dx%d). Output size is too small",
@@ -201,6 +207,7 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
   if (padW || padH)
   {
     // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
     if ((outputHeight - 1)*dH >= inputHeight + padH)
       --outputHeight;
     if ((outputWidth  - 1)*dW >= inputWidth  + padW)
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
index 94a7fc1..4adcca6 100644
--- a/lib/THNN/generic/SpatialFullConvolution.c
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -66,6 +66,9 @@ static inline void THNN_(SpatialFullConvolution_shapeCheck)(
 	       "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
   THArgCheck(dW > 0 && dH > 0, 11,
 	     "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+  THArgCheck(adjW < dW && adjH < dH, 15,
+        "output adjustment must be smaller than stride, but got adjH: %d adjW: %d dH: %d dW: %d",
+        adjH, adjW, dH, dW);
   THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
 		"2D or 4D weight tensor expected, but got: %s");
 
@@ -127,6 +130,7 @@ void THNN_(SpatialFullConvolution_updateOutput)(
   int nInputPlane = THTensor_(size)(weight,0);
   int nOutputPlane = THTensor_(size)(weight,1);
 
+  input = THTensor_(newContiguous)(input);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -224,6 +228,8 @@ void THNN_(SpatialFullConvolution_updateOutput)(
     THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
 }
 
 void THNN_(SpatialFullConvolution_updateGradInput)(
@@ -244,6 +250,8 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
   int nInputPlane = THTensor_(size)(weight,0);
   int nOutputPlane = THTensor_(size)(weight,1);
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -316,6 +324,9 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
     THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 
@@ -339,6 +350,8 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
   int nInputPlane = THTensor_(size)(gradWeight,0);
   int nOutputPlane = THTensor_(size)(gradWeight,1);
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 3) {
     // Force batch
@@ -431,6 +444,9 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
     THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 #endif
diff --git a/lib/THNN/generic/SpatialMaxUnpooling.c b/lib/THNN/generic/SpatialMaxUnpooling.c
index 1b7b517..3205386 100644
--- a/lib/THNN/generic/SpatialMaxUnpooling.c
+++ b/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -4,13 +4,13 @@
 
 static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
                                                       THIndex_t *ind_p,
-                                                      long nslices,
-                                                      long iwidth, long iheight,
-                                                      long owidth, long oheight)
+                                                      int nslices,
+                                                      int iwidth, int iheight,
+                                                      int owidth, int oheight)
 {
-  long k;
+  int k;
   int has_error = 0;
-  long error_index;
+  THIndex_t error_index;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
@@ -18,7 +18,8 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o
     real *input_p_k = input_p + k*iwidth*iheight;
     THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
 
-    long i, j, maxp;
+    int i, j;
+    THIndex_t maxp;
     for(i = 0; i < iheight; i++)
     {
       for(j = 0; j < iwidth; j++)
@@ -37,7 +38,7 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o
     }
   }
   if (has_error) {
-    THError("found an invalid max index %ld (output volumes are of size %ldx%ld)",
+    THError("found an invalid max index %ld (output volumes are of size %dx%d)",
         error_index, oheight, owidth);
   }
 }
@@ -98,7 +99,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
   }
   else
   {
-    long p;
+    int p;
 
     THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
     THTensor_(zero)(output);
@@ -107,14 +108,15 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
     output_data = THTensor_(data)(output);
     indices_data = THIndexTensor_(data)(indices);
 
-#pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
-      THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
-                                                indices_data+p*nslices*iwidth*iheight,
-                                                nslices,
-                                                iwidth, iheight,
-                                                owidth, oheight);
+      THNN_(SpatialMaxUnpooling_updateOutput_frame)(
+						    input_data+p*nslices*iwidth*iheight,
+						    output_data+p*nslices*owidth*oheight,
+						    indices_data+p*nslices*iwidth*iheight,
+						    nslices,
+						    iwidth, iheight,
+						    owidth, oheight);
     }
   }
 
@@ -125,11 +127,11 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
 
 static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
                                                          THIndex_t *ind_p,
-                                                         long nslices,
-                                                         long iwidth, long iheight,
-                                                         long owidth, long oheight)
+                                                         int nslices,
+                                                         int iwidth, int iheight,
+                                                         int owidth, int oheight)
 {
-  long k;
+  int k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
@@ -137,14 +139,15 @@ static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p,
     real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
     THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
 
-    long i, j, maxp;
+    int i, j;
+    THIndex_t maxp;
     for(i = 0; i < iheight; i++)
     {
       for(j = 0; j < iwidth; j++)
       {
         maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
-        if(maxp<0 || maxp>=owidth*oheight){
-            THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
+        if(maxp < 0 || maxp >= owidth * oheight) {
+            THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight);
         }
         gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
       }
@@ -193,7 +196,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
 
   if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
     THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
-	    oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
+	    oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]);
   }
 
   /* get raw pointers */
@@ -212,8 +215,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
   }
   else
   {
-    long p;
-#pragma omp parallel for private(p)
+    int p;
     for (p = 0; p < nbatch; p++)
     {
       THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 450998a..8fd50f5 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -1076,6 +1076,9 @@ TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *finput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
           real scale);
 
 TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
@@ -1167,8 +1170,10 @@ TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THIndexTensor *indices,
+          int kT, int kW, int kH,
           int dT, int dW, int dH,
-          int pT, int pW, int pH);
+          int pT, int pW, int pH,
+          bool ceilMode);
 
 TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
           THNNState *state,
@@ -1186,9 +1191,11 @@ TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THIndexTensor *indices,
+          int kT, int kW, int kH,
           int dT, int dW, int dH,
           int pT, int pW, int pH,
-          int dilationT, int dilationW, int dilationH);
+          int dilationT, int dilationW, int dilationH,
+          bool ceilMode);
 
 TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
           THNNState *state,
diff --git a/lib/THNN/generic/TemporalConvolution.c b/lib/THNN/generic/TemporalConvolution.c
index 0e8e83a..14297ad 100644
--- a/lib/THNN/generic/TemporalConvolution.c
+++ b/lib/THNN/generic/TemporalConvolution.c
@@ -2,6 +2,38 @@
 #define TH_GENERIC_FILE "generic/TemporalConvolution.c"
 #else
 
+static inline void THNN_(TemporalConvolution_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         int kW,
+                         int dW,
+                         int *inputFrameSize) {
+
+  THArgCheck(kW > 0, 9,
+             "kernel size should be greater than zero, but got kW: %d", kW);
+  THArgCheck(dW > 0, 11,
+             "stride should be greater than zero, but got dW: %d", dW);
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+                  "2D or 3D (batch mode) tensor expected for input, but got: %s");
+  if (inputFrameSize != NULL) {
+    THArgCheck(input->size[dimF] == *inputFrameSize, 2,
+               "invalid input frame size. Got: %d, Expected: %d",
+               input->size[dimF], *inputFrameSize);
+  }
+  THArgCheck(input->size[dimS] >= kW, 2,
+             "input sequence smaller than kernel size. Got: %d, Expected: %d",
+             input->size[dimS], kW);
+}
+
 void THNN_(TemporalConvolution_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -20,21 +52,14 @@ void THNN_(TemporalConvolution_updateOutput)(
   int dimS = 0; // sequence dimension
   int dimF = 1; // feature dimension
   
-  THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
-		"2D or 3D (batch mode) tensor expected for input, but got: %s");
-  
   if (input->nDimension == 3) 
   {
     dimS = 1;
     dimF = 2;
   }
-  THArgCheck(input->size[dimF] == inputFrameSize, 2,
-	     "invalid input frame size. Got: %d, Expected: %d",
-	     input->size[dimF], inputFrameSize);
-  THArgCheck(input->size[dimS] >= kW, 2,
-	     "input sequence smaller than kernel size. Got: %d, Expected: %d",
-	     input->size[dimS], kW);
 
+  THNN_(TemporalConvolution_shapeCheck)
+       (state, input, kW, dW, &inputFrameSize);
   input = THTensor_(newContiguous)(input);
   outputWindow = THTensor_(new)();
   inputWindow = THTensor_(new)();
@@ -159,7 +184,9 @@ void THNN_(TemporalConvolution_updateGradInput)(
     dimS = 1;
     dimF = 2;
   }
-  
+
+  THNN_(TemporalConvolution_shapeCheck)(
+        state, input, kW, dW, NULL);
   nInputFrame = input->size[dimS];
   nOutputFrame = gradOutput->size[dimS];
 
@@ -264,7 +291,9 @@ void THNN_(TemporalConvolution_accGradParameters)(
     dimS = 1;
     dimF = 2;
   }
-  
+
+  THNN_(TemporalConvolution_shapeCheck)(
+        state, input, kW, dW, NULL);
   nInputFrame = input->size[dimS];
   nOutputFrame = gradOutput->size[dimS];
 
diff --git a/lib/THNN/generic/TemporalMaxPooling.c b/lib/THNN/generic/TemporalMaxPooling.c
index 0a2f004..e2976ab 100644
--- a/lib/THNN/generic/TemporalMaxPooling.c
+++ b/lib/THNN/generic/TemporalMaxPooling.c
@@ -2,6 +2,52 @@
 #define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
 #else
 
+static inline void THNN_(TemporalMaxPooling_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         THIndexTensor *indices,
+                         int kW,
+                         int dW) {
+  long niframe;
+  long framesize;
+  long noframe;
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+  int ndims = input->nDimension;
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+
+  niframe = input->size[dimS];
+  framesize = input->size[dimF];
+  noframe = (niframe - kW) / dW + 1;
+
+  THArgCheck(kW > 0, 5,
+             "kernel size should be greater than zero, but got kW: %d", kW);
+  THArgCheck(dW > 0, 6,
+             "stride should be greater than zero, but got dW: %d", dW);
+
+  THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+                  "2D or 3D (batch mode) tensor expected for input, but got: %s");
+  THArgCheck(input->size[dimS] >= kW, 2,
+             "input sequence smaller than kernel size. Got: %d, Expected: %d",
+             input->size[dimS], kW);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize)
+  }
+  if (indices != NULL) {
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
+  }
+}
+
 void THNN_(TemporalMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -23,14 +69,13 @@ void THNN_(TemporalMaxPooling_updateOutput)(
   int dimS = 0; // sequence dimension
   int dimF = 1; // feature dimension
 
-  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+  THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);
 
   if (input->nDimension == 3)
   {
     dimS = 1;
     dimF = 2;
   }
-  THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
 
   /* sizes */
   niframe = input->size[dimS];
@@ -159,6 +204,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
 
   long t, y;
 
+  THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);
   /* get contiguous gradOutput */
   gradOutput = THTensor_(newContiguous)(gradOutput);
 
diff --git a/lib/THNN/generic/TemporalSubSampling.c b/lib/THNN/generic/TemporalSubSampling.c
index 7fa323d..bfc7d30 100644
--- a/lib/THNN/generic/TemporalSubSampling.c
+++ b/lib/THNN/generic/TemporalSubSampling.c
@@ -2,6 +2,42 @@
 #define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
 #else
 
+static inline void THNN_(TemporalSubSampling_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         int kW,
+                         int dW,
+                         int *inputFrameSize) {
+  int nInputFrame, nOutputFrame;
+
+  THArgCheck(kW > 0, 6,
+             "kernel size should be greater than zero, but got kW: %d", kW);
+  THArgCheck(dW > 0, 7,
+             "stride should be greater than zero, but got dW: %d", dW);
+
+  THNN_ARGCHECK(input->nDimension == 2, 2, input,
+                  "2D or 3D (batch mode) tensor expected for input, but got: %s");
+  if (inputFrameSize != NULL) {
+    THArgCheck( input->size[1] == *inputFrameSize, 2,
+                "invalid input frame size.  Got: %d, Expected: %d",
+                input->size[1], *inputFrameSize);
+  }
+  THArgCheck( input->size[0] >= kW, 2,
+              "input sequence smaller than kernel size.  Got %d, Expected: %d",
+              input->size[0], kW);
+
+  nInputFrame = input->size[0];
+  nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 0, nOutputFrame);
+    if (inputFrameSize != NULL) {
+      THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 1, *inputFrameSize);
+    }
+  }
+}
+
 void THNN_(TemporalSubSampling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -16,9 +52,7 @@ void THNN_(TemporalSubSampling_updateOutput)(
   int nInputFrame, nOutputFrame;
   long k;
   
-  THArgCheck( input->nDimension == 2, 2, "2D tensor expected");
-  THArgCheck( input->size[1] == inputFrameSize, 2, "invalid input frame size");
-  THArgCheck( input->size[0] >= kW, 2, "input sequence smaller than kernel size");
+  THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
 
   outputFrame = THTensor_(new)();
   inputWindow = THTensor_(new)();
@@ -57,6 +91,8 @@ void THNN_(TemporalSubSampling_updateGradInput)(
   THTensor *gradInputWindow, *buffer, *kwunit;
   long k;
 
+  THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+
   gradOutputFrame = THTensor_(new)();
   gradInputWindow = THTensor_(new)();
   buffer = THTensor_(new)();
@@ -94,7 +130,7 @@ void THNN_(TemporalSubSampling_accGradParameters)(
   THTensor *inputWindow, *buffer;
   long k;
 
-
+  THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
   gradOutputFrame = THTensor_(new)();
   inputWindow = THTensor_(new)();
   buffer = THTensor_(new)();
diff --git a/lib/THNN/generic/VolumetricAveragePooling.c b/lib/THNN/generic/VolumetricAveragePooling.c
index a317cbb..91c870e 100644
--- a/lib/THNN/generic/VolumetricAveragePooling.c
+++ b/lib/THNN/generic/VolumetricAveragePooling.c
@@ -2,6 +2,70 @@
 #define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
 #else
 
+static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         int kT,
+                         int kW,
+                         int kH,
+                         int dT,
+                         int dW,
+                         int dH) {
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+  int ndim = input->nDimension;
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+             kT, kH, kW);
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+             dT, dH, dW);
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+             && input->size[dimt] >= kT, 2,
+             "input image (T: %d H: %d W: %d) smaller than "
+             "kernel size (kT: %d kH: %d kW: %d)",
+             input->size[dimt], input->size[dimh], input->size[dimw],
+             kT, kH, kW);
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime   = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth  = input->size[dimw];
+  otime   = (itime   - kT) / dT + 1;
+  oheight = (iheight - kH) / dH + 1;
+  owidth  = (iwidth  - kW) / dW + 1;
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+  }
+}
+
 static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
@@ -81,8 +145,9 @@ void THNN_(VolumetricAveragePooling_updateOutput)(
   real *input_data;
   real *output_data;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+  THNN_(VolumetricAveragePooling_shapeCheck)(
+        state, input, NULL, kT, kW, kH,
+        dT, dW, dH);
 
   int dimN = 0;
   int dimt = 1;
@@ -97,13 +162,6 @@ void THNN_(VolumetricAveragePooling_updateOutput)(
     dimw++;
   }
 
-  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
-	     && input->size[dimt] >= kT, 2,
-	     "input image (T: %d H: %d W: %d) smaller than "
-	     "kernel size (kT: %d kH: %d kW: %d)",
-	     input->size[dimt], input->size[dimh], input->size[dimw],
-	     kT, kH, kW);
-
   /* sizes */
   nslices = input->size[dimN];
   itime   = input->size[dimt];
@@ -244,7 +302,10 @@ void THNN_(VolumetricAveragePooling_updateGradInput)(
   int dimh = 2;
   int dimw = 3;
 
-  // TODO: gradOutput shape check
+  THNN_(VolumetricAveragePooling_shapeCheck)(
+        state, input, gradOutput, kT, kW, kH,
+        dT, dW, dH);
+
   /* get contiguous gradOutput */
   gradOutput = THTensor_(newContiguous)(gradOutput);
 
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
index 4b00c47..4085e2b 100644
--- a/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -2,11 +2,89 @@
 #define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
 #else
 
+static void inline THNN_(VolumetricConvolutionMM_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         THTensor *weight,
+                         THTensor *bias,
+                         int kT,
+                         int kW,
+                         int kH,
+                         int dT,
+                         int dW,
+                         int dH,
+                         int pT,
+                         int pW,
+                         int pH) {
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+
+  int ndim = input->nDimension;
+  int dimf = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (ndim == 5)
+  {
+    dimf++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  long nInputPlane;
+  long inputDepth;
+  long inputHeight;
+  long inputWidth;
+  long nOutputPlane;
+  long outputDepth;
+  long outputHeight;
+  long outputWidth;
+
+  nInputPlane = input->size[dimf];
+  inputDepth = input->size[dimt];
+  inputHeight  = input->size[dimh];
+  inputWidth   = input->size[dimw];
+  nOutputPlane = weight->size[0];
+  outputDepth  = (inputDepth + 2*pT - kT) / dT + 1;
+  outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+  outputWidth  = (inputWidth + 2*pW - kW) / dW + 1;
+
+  if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1)
+  {
+    THError(
+      "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      nOutputPlane, outputDepth, outputHeight, outputWidth
+    );
+  }
+
+  THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
+             "weight tensor should be 2D or 5D - got %d", weight->nDimension);
+
+  if (bias != NULL) {
+    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+  }
+
+  THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+  }
+}
+
 static int THNN_(view_weight)(THTensor **_weight)
 {
   THTensor *weight = *_weight;
-  THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
-          "weight tensor should be 2D or 5D - got %dD", weight->nDimension);
   if (weight->nDimension == 5) {
     long s1 = weight->size[0];
     long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
@@ -268,8 +346,9 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
   long outputHeight;
   long outputWidth;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+  THNN_(VolumetricConvolutionMM_shapeCheck)(
+        state, input, NULL, weight, bias,
+        kT, kW, kH, dT, dW, dH, pT, pW, pH);
   input = THTensor_(newContiguous)(input);
 
   if (input->nDimension == 5)
@@ -289,15 +368,6 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
   outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
   outputWidth  = (inputWidth + 2*pW - kW) / dW + 1;
 
-  if (outputWidth < 1 || outputHeight < 1)
-  {
-    THError(
-      "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
-      nInputPlane, inputDepth, inputHeight, inputWidth,
-      nOutputPlane, outputDepth, outputHeight, outputWidth
-    );
-  }
-
   freeWeight = THNN_(view_weight)(&weight);
 
   if (input->nDimension == 4)
@@ -405,9 +475,9 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
 {
   int nOutputPlane = (int)weight->size[0];
 
-  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
-    "Number of output features is not equal to nOutputPlane"
-  );
+  THNN_(VolumetricConvolutionMM_shapeCheck)(
+        state, input, gradOutput, weight, NULL,
+        kT, kW, kH, dT, dW, dH, pT, pW, pH);
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
 
@@ -502,18 +572,17 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *finput,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
           real scale)
 {
   int freeWeight;
   int nOutputPlane = (int)gradWeight->size[0];
 
-  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
-    "gradBias tensor has wrong size"
-  );
-
-  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
-    "Number of output features is not equal to nOutputPlane"
-  );
+  THNN_(VolumetricConvolutionMM_shapeCheck)(
+        state, input, gradOutput, gradWeight, gradBias,
+        kT, kW, kH, dT, dW, dH, pT, pW, pH);
   input = THTensor_(newContiguous)(input);
   gradOutput = THTensor_(newContiguous)(gradOutput);
 
diff --git a/lib/THNN/generic/VolumetricDilatedConvolution.c b/lib/THNN/generic/VolumetricDilatedConvolution.c
index e889f5a..d2d5c88 100644
--- a/lib/THNN/generic/VolumetricDilatedConvolution.c
+++ b/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -2,6 +2,65 @@
 #define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
 #else
 
+static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)(
+                         THTensor *input, THTensor *gradOutput,
+                         THTensor *weight, THTensor *bias,
+                         int kT, int kH, int kW, int dT, int dH, int dW,
+                         int padT, int padH, int padW,
+                         int dilationT, int dilationH, int dilationW) {
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+                "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+                "expected for weight, but got: %s");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15,
+             "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d",
+             dilationT, dilationH, dilationW);
+  if (bias != NULL) {
+    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+  }
+
+  // Params
+  int ndim = input->nDimension;
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+  int dimf = 0;
+  int dimd = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (ndim == 5) {
+    dimf++;
+    dimd++;
+    dimh++;
+    dimw++;
+  }
+
+  long inputDepth  = input->size[dimd];
+  long inputHeight  = input->size[dimh];
+  long inputWidth   = input->size[dimw];
+  long outputDepth  = (inputDepth  + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+  THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+  }
+}
+
 void THNN_(VolumetricDilatedConvolution_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -15,28 +74,21 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
           int padT, int padW, int padH,
           int dilationT, int dilationW, int dilationH)
 {
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
-  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for weight, but got: %s");
-  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 17, "dilation should be greater than zero");
+  THNN_(VolumetricDilatedConvolution_shapeCheck)(
+        input, NULL, weight, bias,
+        kT, kH, kW, dT, dH, dW, padT, padH, padW,
+        dilationT, dilationH, dilationW);
 
   // Params:
   int nInputPlane = weight->size[1];
   int nOutputPlane = weight->size[0];
 
+  input = THTensor_(newContiguous)(input);
   int batch = 1;
   if (input->nDimension == 4) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[0]);
     // Force batch
     batch = 0;
     THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match. Expected: %d, got %d", nInputPlane, input->size[1]);
   }
 
   long inputDepth  = input->size[2];
@@ -46,10 +98,6 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
   long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
   long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
 
-  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
-    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
-            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
-
   // Batch size + input planes
   long batchSize = input->size[0];
 
@@ -136,6 +184,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
     THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
 }
 
 void THNN_(VolumetricDilatedConvolution_updateGradInput)(
@@ -150,30 +200,23 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
           int padT, int padW, int padH,
           int dilationT, int dilationW, int dilationH)
 {
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
-  THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
-		gradOutput,
-		"4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
-  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for weight, but got: %s");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THNN_(VolumetricDilatedConvolution_shapeCheck)(
+        input, gradOutput, weight, NULL,
+        kT, kH, kW, dT, dH, dW, padT, padH, padW,
+        dilationT, dilationH, dilationW);
 
   // Params
   int nInputPlane = weight->size[1];
   int nOutputPlane = weight->size[0];
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 4) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
     // Force batch
     batch = 0;
     THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
     THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
   }
 
   long inputDepth  = input->size[2];
@@ -239,6 +282,9 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
     THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 void THNN_(VolumetricDilatedConvolution_accGradParameters)(
@@ -255,31 +301,23 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)(
           int dilationT, int dilationW, int dilationH,
           real scale)
 {
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
-  THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
-		gradOutput,
-		"4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
-  THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for gradWeight, but got: %s");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+  THNN_(VolumetricDilatedConvolution_shapeCheck)(
+        input, gradOutput, gradWeight, gradBias,
+        kT, kH, kW, dT, dH, dW, padT, padH, padW,
+        dilationT, dilationH, dilationW);
 
   // Params
   int nInputPlane = gradWeight->size[1];
   int nOutputPlane = gradWeight->size[0];
 
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
   int batch = 1;
   if (input->nDimension == 4) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
     // Force batch
     batch = 0;
     THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
     THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
   }
 
   long inputDepth  = input->size[2];
@@ -365,6 +403,9 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)(
     THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 #endif
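
For the dilated case the shape check recomputes the output extents with the effective kernel span dilation*(k-1)+1, mirroring the formulas used later in updateOutput. A short sketch of that arithmetic (plain Lua; the helper name is illustrative only):

    -- output extent per dimension, as validated by VolumetricDilatedConvolution_shapeCheck:
    --   o = floor((i + 2*pad - (dilation*(k - 1) + 1)) / stride) + 1
    local function dilatedOutSize(i, k, stride, pad, dilation)
       return math.floor((i + 2*pad - (dilation*(k - 1) + 1)) / stride) + 1
    end

    print(dilatedOutSize(16, 3, 1, 0, 1))  -- 14: plain 3-tap kernel
    print(dilatedOutSize(16, 3, 1, 0, 2))  -- 12: dilation 2 widens the kernel span to 5
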
diff --git a/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
index 629c05a..14e8177 100644
--- a/lib/THNN/generic/VolumetricDilatedMaxPooling.c
+++ b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -2,6 +2,101 @@
 #define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
 #else
 
+static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         THIndexTensor *indices,
+                         int kT, int kW, int kH,
+                         int dT, int dW, int dH,
+                         int pT, int pW, int pH,
+                         int dilationT, int dilationW, int dilationH,
+                         bool ceilMode) {
+  int ndim = input->nDimension;
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+  long nslices;
+  long itime;
+  long iheight;
+  long iwidth;
+  long otime;
+  long oheight;
+  long owidth;
+
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+             kT, kH, kW);
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+             dT, dH, dW);
+  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+             "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d",
+             dilationT, dilationH, dilationW);
+
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+             "pad should be smaller than half of kernel size, but got "
+             "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d",
+             kT, kW, kH, pT, pW, pH);
+
+  nslices = input->size[dimN];
+  itime   = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth  = input->size[dimw];
+  if (ceilMode)
+  {
+    otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(ceil((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+  }
+  else
+  {
+    otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+  }
+
+  if (pT || pW || pH)
+  {
+    // ensure that the last pooling starts inside the image
+    if ((otime - 1)*dT >= itime + pT)
+      --otime;
+    if ((oheight - 1)*dH >= iheight + pH)
+      --oheight;
+    if ((owidth  - 1)*dW >= iwidth  + pW)
+      --owidth;
+  }
+
+  if (otime < 1 || owidth < 1 || oheight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+  }
+  if (indices != NULL) {
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices);
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime);
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight);
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth);
+  }
+}
+
 static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
@@ -133,8 +228,6 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
   real *output_data;
   THIndex_t *indices_data;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
 
   int dimN = 0;
   int dimt = 1;
@@ -149,19 +242,11 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
     dimw++;
   }
 
-  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
-	     && input->size[dimt] >= kT, 2,
-	     "input image (T: %d H: %d W: %d) smaller than "
-	     "kernel size (kT: %d kH: %d kW: %d)",
-	     input->size[dimt], input->size[dimh], input->size[dimw],
-	     kT, kH, kW);
-
-  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
-    "pad should be smaller than half of kernel size"
-  );
-
-  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
-      "dilation should be greater than 0");
+  THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+        state, input, NULL, NULL,
+        kT,  kW,  kH, dT,  dW,  dH,
+        pT,  pW,  pH, dilationT,  dilationW,  dilationH,
+        ceilMode);
 
   /* sizes */
   nslices = input->size[dimN];
@@ -181,10 +266,6 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
     owidth  = (int)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
   }
 
-  if (otime < 1 || owidth < 1 || oheight < 1)
-    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
-            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
-
   if (pT || pW || pH)
   {
     // ensure that the last pooling starts inside the image
@@ -319,6 +400,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THIndexTensor *indices,
+          int kT,
+          int kW,
+          int kH,
           int dT,
           int dW,
           int dH,
@@ -327,7 +411,8 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
           int pH,
           int dilationT,
           int dilationW,
-          int dilationH)
+          int dilationH,
+          bool ceilMode)
 {
   int nslices;
   int itime;
@@ -345,6 +430,12 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
   int dimh = 2;
   int dimw = 3;
 
+  THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+        state, input, gradOutput, indices,
+        kT,  kW,  kH, dT,  dW,  dH,
+        pT,  pW,  pH, dilationT,  dilationW,  dilationH,
+        ceilMode);
+
   // TODO: gradOutput shape check
   /* get contiguous gradOutput */
   gradOutput = THTensor_(newContiguous)(gradOutput);
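
Note that updateGradInput now also receives kT/kW/kH and ceilMode, so the backward pass can rerun the same shape check as the forward pass, including the ceil/floor output-size computation and the "last pooling window must start inside the image" correction. A simplified per-dimension sketch of that computation (plain Lua, illustrative helper):

    local function poolOutSize(i, k, stride, pad, dilation, ceilMode)
       local span = dilation * (k - 1) + 1
       local x = (i - span + 2*pad) / stride
       local o = (ceilMode and math.ceil(x) or math.floor(x)) + 1
       -- ensure the last pooling window starts inside the padded input
       if pad > 0 and (o - 1) * stride >= i + pad then o = o - 1 end
       return o
    end

    print(poolOutSize(8, 3, 2, 1, 1, false))  -- 4 (floor mode)
    print(poolOutSize(8, 3, 2, 1, 1, true))   -- 5 (ceil mode)
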
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
index 8df9a74..b6ef1cd 100644
--- a/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -85,6 +85,67 @@ static void THNN_(col2vol)(
   }
 }
 
+static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
+                         THTensor *input, THTensor *gradOutput,
+                         THTensor *weight, THTensor *bias,
+                         int dT, int dW, int dH, int pT, int pW, int pH,
+                         int aT, int aW, int aH) {
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+  // number of input & output planes and kernel size is indirectly defined by the weight tensor
+  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+                "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+                "expected for weight, but got: %s");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+  THArgCheck(aT < dT && aW < dW && aH < dH, 15,
+             "output adjustment must be smaller than stride, but got "
+             "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
+             aT, aH, aW, dT, dH, dW);
+
+  int ndim = input->nDimension;
+  const int nInputPlane  = (int)weight->size[0];
+  const int nOutputPlane = (int)weight->size[1];
+  const int kT           = (int)weight->size[2];
+  const int kH           = (int)weight->size[3];
+  const int kW           = (int)weight->size[4];
+
+  if (bias != NULL) {
+    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+  }
+
+  int dimf = 0;
+  int dimd = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (ndim == 5) {
+    dimf++;
+    dimd++;
+    dimh++;
+    dimw++;
+  }
+
+  const long inputWidth   = input->size[dimw];
+  const long inputHeight  = input->size[dimh];
+  const long inputDepth   = input->size[dimd];
+  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
+  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;
+
+  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+  THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+  }
+}
+
 void THNN_(VolumetricFullConvolution_updateOutput)(
   THNNState *state,
   THTensor *input,          // 4D or 5D (batch) tensor
@@ -100,10 +161,9 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
   THTensor *columns = finput;
   THTensor *ones    = fgradInput;
 
-  // number of input & output planes and kernel size is indirectly defined by the weight tensor
-  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for weight, but got: %s");
+  THNN_(VolumetricFullConvolution_shapeCheck)(
+        input, NULL, weight, bias,
+        dT, dW, dH, pT, pW, pH, aT, aW, aH);
 
   const int nInputPlane  = (int)weight->size[0];
   const int nOutputPlane = (int)weight->size[1];
@@ -111,21 +171,14 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
   const int kH           = (int)weight->size[3];
   const int kW           = (int)weight->size[4];
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
-
+  input = THTensor_(newContiguous)(input);
   int batch = 1;
   if (input->nDimension == 4)
   {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
     // Force batch
     batch = 0;
     THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
   }
-  else
-  {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
 
   const long inputWidth   = input->size[4];
   const long inputHeight  = input->size[3];
@@ -223,6 +276,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
     THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
 }
 
 void THNN_(VolumetricFullConvolution_updateGradInput)(
@@ -240,9 +295,9 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
   THTensor *gradColumns = finput;
 
   // number of input & output planes and kernel size is indirectly defined by the weight tensor
-  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for weight, but got: %s");
+  THNN_(VolumetricFullConvolution_shapeCheck)(
+        input, gradOutput, weight, NULL,
+        dT, dW, dH, pT, pW, pH, aT, aW, aH);
 
   const int nInputPlane  = (int)weight->size[0];
   const int nOutputPlane = (int)weight->size[1];
@@ -250,8 +305,8 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
   const int kH           = (int)weight->size[3];
   const int kW           = (int)weight->size[4];
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
 
   int batch = 1;
   if (input->nDimension == 4)
@@ -331,6 +386,9 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
     THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 void THNN_(VolumetricFullConvolution_accGradParameters)(
@@ -347,9 +405,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
   real scale)
 {
   // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor
-  THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
-		"5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
-		"expected for gradWeight, but got: %s");
+  THNN_(VolumetricFullConvolution_shapeCheck)(
+        input, gradOutput, gradWeight, gradBias,
+        dT, dW, dH, pT, pW, pH, aT, aW, aH);
 
   int nInputPlane  = (int)gradWeight->size[0];
   int nOutputPlane = (int)gradWeight->size[1];
@@ -360,8 +418,8 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
   THTensor *columns = finput;
   THTensor *ones = fgradInput;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
 
   int batch = 1;
   if (input->nDimension == 4)
@@ -461,6 +519,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
     THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
     THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
   }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
 }
 
 #endif
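
For the full (transposed) convolution the kernel sizes come from the weight tensor and the check verifies the inverse formula o = (i - 1)*stride - 2*pad + k + adj, together with the new constraint that the output adjustment is smaller than the stride. The arithmetic, as a quick sketch:

    -- output extent per dimension, as validated by VolumetricFullConvolution_shapeCheck
    local function fullConvOutSize(i, k, stride, pad, adj)
       assert(adj < stride, "output adjustment must be smaller than stride")
       return (i - 1) * stride - 2*pad + k + adj
    end

    print(fullConvOutSize(9, 2, 1, 0, 0))  -- 10
    print(fullConvOutSize(9, 4, 2, 1, 1))  -- 19
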
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricMaxPooling.c
index 47af4f0..a3601e0 100644
--- a/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/lib/THNN/generic/VolumetricMaxPooling.c
@@ -30,16 +30,21 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THIndexTensor *indices,
+          int kT,
+          int kW,
+          int kH,
           int dT,
           int dW,
           int dH,
           int pT,
           int pW,
-          int pH)
+          int pH,
+          bool ceilMode)
 {
   THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
           state, input, gradOutput, gradInput, indices,
-          dT, dW, dH, pT, pW, pH, 1, 1, 1);
+          kT, kW, kH, dT, dW, dH,
+          pT, pW, pH, 1, 1, 1, ceilMode);
 }
 
 #endif
diff --git a/lib/THNN/generic/VolumetricMaxUnpooling.c b/lib/THNN/generic/VolumetricMaxUnpooling.c
index f2f879d..d9d9e59 100644
--- a/lib/THNN/generic/VolumetricMaxUnpooling.c
+++ b/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -2,17 +2,68 @@
 #define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
 #else
 
+static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         THIndexTensor *indices,
+                         int oT,
+                         int oW,
+                         int oH,
+                         int dT,
+                         int dW,
+                         int dH,
+                         int pT,
+                         int pW,
+                         int pH)
+{
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  THNN_CHECK_SHAPE_INDICES(input, indices);
+
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
+             "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+             dT, dH, dW);
+
+  int dimw = 3;
+  int dimh = 2;
+  int dimt = 1;
+  int dimn = 0;
+
+  if (input->nDimension == 5)
+  {
+    dimt++;
+    dimw++;
+    dimh++;
+    dimn++;
+  }
+  int nslices = input->size[dimn];
+
+  if (gradOutput != NULL) {
+    if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
+    {
+      THError(
+        "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d",
+        oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
+      );
+    }
+
+    THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, dimn, nslices);
+  }
+}
+
 static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           THIndex_t *ind_p,
-          long nslices,
-          long iT,
-          long iW,
-          long iH,
-          long oT,
-          long oW,
-          long oH,
+          int nslices,
+          int iT,
+          int iW,
+          int iH,
+          int oT,
+          int oW,
+          int oH,
           int dT,
           int dW,
           int dH,
@@ -20,24 +71,23 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
           int pW,
           int pH)
 {
-  long k;
+  int k;
   int has_error = 0;
-  long error_index;
+  THIndex_t error_index;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
-    long ti, i, j, maxz, maxy, maxx;
+    int ti, i, j, maxz, maxy, maxx;
     for (ti = 0; ti < iT; ti++)
     {
       for (i = 0; i < iH; i++)
       {
         for (j = 0; j < iW; j++)
         {
-          long start_t = ti * dT - pT;
-          long start_h = i * dH - pH;
-          long start_w = j * dW - pW;
+          int start_t = ti * dT - pT;
+          int start_h = i * dH - pH;
+          int start_w = j * dW - pW;
 
-          //real *output_p_k = output_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
           real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
           THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
 
@@ -45,8 +95,9 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
           maxy = ((unsigned char*)(ind_p_k))[1];
           maxx = ((unsigned char*)(ind_p_k))[2];
 
-          size_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
-          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+          THIndex_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
+          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT
+	      || start_h+maxy>=oH || start_w+maxx>=oW)
           {
 #pragma omp critical
             {
@@ -62,7 +113,7 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
   }
   if (has_error) {
     THError(
-        "found an invalid max index %ld (output volumes are of size %ldx%ldx%ld)",
+        "found an invalid max index %ld (output volumes are of size %dx%dx%d)",
         error_index, oT, oH, oW
     );
   }
@@ -95,10 +146,9 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
   real *output_data;
   THIndex_t *indices_data;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
-
-  THNN_CHECK_SHAPE_INDICES(input, indices);
+  THNN_(VolumetricMaxUnpooling_shapeCheck)(
+        state, input, NULL, indices,
+        oT, oW, oH, dT, dW, dH, pT, pW, pH);
 
   if (input->nDimension == 5)
   {
@@ -139,7 +189,7 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
   }
   else
   {
-    long p;
+    int p;
 
     THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
     THTensor_(zero)(output);
@@ -148,7 +198,6 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
     output_data = THTensor_(data)(output);
     indices_data = THIndexTensor_(data)(indices);
 
-#pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
       THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
@@ -173,13 +222,13 @@ static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           THIndex_t *ind_p,
-          long nslices,
-          long iT,
-          long iW,
-          long iH,
-          long oT,
-          long oW,
-          long oH,
+          int nslices,
+          int iT,
+          int iW,
+          int iH,
+          int oT,
+          int oW,
+          int oH,
           int dT,
           int dW,
           int dH,
@@ -187,37 +236,38 @@ static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
           int pW,
           int pH)
 {
-  long k;
+  int k;
 #pragma omp parallel for private(k)
   for (k = 0; k < nslices; k++)
   {
-    long ti, i, j, maxz, maxy, maxx;
+    int ti, i, j, maxz, maxy, maxx;
     for (ti = 0; ti < iT; ti++)
     {
       for (i = 0; i < iH; i++)
       {
         for (j = 0; j < iW; j++)
         {
-          long start_t = ti * dT - pT;
-          long start_h = i * dH - pH;
-          long start_w = j * dW - pW;
+          int start_t = ti * dT - pT;
+          int start_h = i * dH - pH;
+          int start_w = j * dW - pW;
 
           real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
-          //real *gradOutput_p_k = gradOutput_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
           THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
 
           maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
           maxy = ((unsigned char*)(ind_p_k))[1];
           maxx = ((unsigned char*)(ind_p_k))[2];
 
-          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+          if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0
+	      || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
           {
             THError(
               "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d",
               start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
             );
           }
-          *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
+          *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz)
+					+ oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
         }
       }
     }
@@ -252,7 +302,9 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
   real *gradOutput_data;
   THIndex_t *indices_data;
 
-  THNN_CHECK_SHAPE_INDICES(input, indices);
+  THNN_(VolumetricMaxUnpooling_shapeCheck)(
+        state, input, gradOutput, indices,
+        oT, oW, oH, dT, dW, dH, pT, pW, pH);
 
   // TODO: check gradOutput shape
   /* get contiguous gradOutput */
@@ -277,14 +329,6 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
   iH = input->size[dimh];
   iW = input->size[dimw];
 
-  if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
-  {
-    THError(
-      "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%d",
-      oT, oH, oW,gradOutput->size[dimh], gradOutput->size[dimw]
-    );
-  }
-
   /* get raw pointers */
   gradInput_data = THTensor_(data)(gradInput);
   gradOutput_data = THTensor_(data)(gradOutput);
@@ -305,8 +349,7 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
   }
   else
   {
-    long p;
-#pragma omp parallel for private(p)
+    int p;
     for (p = 0; p < nbatch; p++)
     {
       THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
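
The unpooling shape check ties three things together: the indices must have the same shape as the input, and for the backward pass gradOutput must match the declared output extents oT/oH/oW and the plane count. At the Lua level the indices and output sizes come from the paired pooling module; a minimal round-trip sketch (assuming nn.VolumetricMaxUnpooling is constructed from its pooling module, as with the spatial variant):

    require 'nn'

    local pool   = nn.VolumetricMaxPooling(2, 2, 2, 2, 2, 2)  -- kT,kW,kH = 2, stride 2
    local unpool = nn.VolumetricMaxUnpooling(pool)            -- reuses pool's indices and sizes

    local x = torch.rand(3, 8, 8, 8)
    local y = pool:forward(x)            -- 3 x 4 x 4 x 4
    local z = unpool:forward(y)          -- 3 x 8 x 8 x 8, zero except at the max locations
    print(z:size())
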
diff --git a/lib/THNN/generic/VolumetricReplicationPadding.c b/lib/THNN/generic/VolumetricReplicationPadding.c
index aebddbd..4d8993e 100644
--- a/lib/THNN/generic/VolumetricReplicationPadding.c
+++ b/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -2,6 +2,66 @@
 #define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
 #else
 
+static inline void THNN_(VolumetricReplicationPadding_shapeCheck)(
+                         THNNState *state,
+                         THTensor *input,
+                         THTensor *gradOutput,
+                         int pleft, int pright,
+                         int ptop, int pbottom,
+                         int pfront, int pback) {
+  int dimw = 3;
+  int dimh = 2;
+  int dimd = 1;
+  int dimslices = 0;
+  long nslices;
+  long idepth;
+  long iheight;
+  long iwidth;
+  long odepth;
+  long oheight;
+  long owidth;
+
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  if (input->nDimension == 5)
+  {
+    dimw++;
+    dimh++;
+    dimd++;
+    dimslices++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimslices];
+  idepth = input->size[dimd];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  odepth = idepth + pfront + pback;
+  oheight = iheight + ptop + pbottom;
+  owidth  = iwidth + pleft + pright;
+
+  THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2,
+             "input (D: %d, H: %d, W: %d) is too small."
+             " Calculated output D: %d H: %d W: %d",
+             idepth, iheight, iwidth, odepth, oheight, owidth);
+
+  if (gradOutput != NULL) {
+    THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3,
+               "gradOutput plane count unexpected. Expected: %d, Got: %d",
+               nslices, THTensor_(size)(gradOutput, dimslices));
+    THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+               "gradOutput width unexpected. Expected: %d, Got: %d",
+               owidth, THTensor_(size)(gradOutput, dimw));
+    THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+               "gradOutput height unexpected. Expected: %d, Got: %d",
+               oheight, THTensor_(size)(gradOutput, dimh));
+    THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
+               "gradOutput depth unexpected. Expected: %d, Got: %d",
+               odepth, THTensor_(size)(gradOutput, dimd));
+  }
+}
+
 static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
   real *input_p, real *output_p,
   long nslices,
@@ -85,8 +145,9 @@ void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
   real *input_data;
   real *output_data;
 
-  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
-		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+  THNN_(VolumetricReplicationPadding_shapeCheck)(
+        state, input, NULL, pleft, pright,
+        ptop, pbottom, pfront, pback);
 
   if (input->nDimension == 5)
   {
@@ -106,11 +167,6 @@ void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
   oheight = iheight + ptop + pbottom;
   owidth  = iwidth + pleft + pright;
 
-  THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2,
-	     "input (D: %d H: %d, W: %d)is too small."
-	     " Calculated output D: %d H: %d W: %d",
-	     idepth, iheight, iwidth, odepth, oheight, owidth);
-
   /* get contiguous input */
   input = THTensor_(newContiguous)(input);
 
@@ -255,16 +311,10 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
   oheight = iheight + ptop + pbottom;
   owidth  = iwidth + pleft + pright;
 
-  THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
-	     "gradOutput width unexpected. Expected: %d, Got: %d",
-	     owidth, THTensor_(size)(gradOutput, dimw));
-  THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
-	     "gradOutput height unexpected. Expected: %d, Got: %d",
-	     oheight, THTensor_(size)(gradOutput, dimh));
-  THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
-	     "gradOutput depth unexpected. Expected: %d, Got: %d",
-	     odepth, THTensor_(size)(gradOutput, dimd));
-  
+
+  THNN_(VolumetricReplicationPadding_shapeCheck)(
+        state, input, gradOutput, pleft, pright,
+        ptop, pbottom, pfront, pback);
 
   /* get contiguous gradOutput */
   gradOutput = THTensor_(newContiguous)(gradOutput);
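
Here the output extents are simply the input extents plus the padding on each side, and the gradOutput size checks are now shared between forward and backward. A small sketch (assuming the constructor takes the paddings in the order left, right, top, bottom, front, back, matching the arguments of the new shape check):

    require 'nn'

    local pad = nn.VolumetricReplicationPadding(1, 1, 2, 2, 0, 3)
    local x = torch.rand(4, 5, 6, 7)     -- planes x depth x height x width
    print(pad:forward(x):size())         -- 4 x 8 x 10 x 9  (5+0+3, 6+2+2, 7+1+1)
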
diff --git a/test.lua b/test.lua
index 774fba1..b3e1d16 100644
--- a/test.lua
+++ b/test.lua
@@ -312,10 +312,10 @@ function nntest.CAdd()
    end
 
 
-   function testCAddWithLessDimsThanInput()
+   local function testCAddWithLessDimsThanInput()
       local input = torch.rand(4,5)
       local module = nn.CAdd(5)
-      params, gradParams = module:getParameters()
+      local params, gradParams = module:getParameters()
       testBackwardPass(module, input, params, gradParams)
 
       input:zero()
@@ -562,6 +562,122 @@ function nntest.ReLU6()
    end
 end
 
+function nntest.GatedLinearUnit()
+   local model = nn.GatedLinearUnit()
+   local t = torch.Tensor({{1, 1}, {2, 2}, {3, 3}})
+   local thalf = torch.Tensor():resizeAs(t):copy(t):narrow(2, 1, 1)
+   mytester:assertTensorEq(
+      thalf:cmul(torch.sigmoid(thalf)),
+      model:forward(t):resizeAs(thalf),
+      0.000001,
+      'Gated Linear output'
+   )
+   t = torch.Tensor({{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}})
+   thalf = torch.Tensor():resizeAs(t):copy(t):narrow(2, 1, 2)
+   mytester:assertTensorEq(
+      thalf:cmul(torch.sigmoid(thalf)),
+      model:forward(t),
+      0.000001,
+      'Gated Linear Unit output'
+   )
+
+   local input = torch.rand(1, 10)
+   local err = jac.testJacobian(model, input)
+   mytester:assert(err < precision, 'Gated Linear gradient')
+
+   input = torch.rand(5, 10, 6)
+   model = nn.GatedLinearUnit(2)
+   err = jac.testJacobian(model, input)
+   mytester:assert(err < precision, 'Gated Linear gradient, non-default dim')
+
+   input = torch.rand(5, 10, 6)
+   model = nn.GatedLinearUnit(3)
+   err = jac.testJacobian(model, input)
+   mytester:assert(err < precision, 'Gated Linear gradient, non-default dim')
+
+   input = torch.rand(5, 10)
+   model = nn.Sequential()
+   model:add(nn.Linear(10, 10))
+   model:add(nn.GatedLinearUnit())
+   model:add(nn.ReLU())
+   model:add(nn.LogSoftMax())
+   err = jac.testJacobian(model, input)
+   mytester:assert(err < precision, 'Gated Linear gradient with other layers')
+end
+
+function nntest.CReLU()
+   local function _verifyCReLU(featureMaps, concatenatedFeatureMaps)
+      local rectifiedFeatureMaps = nn.ReLU():forward(featureMaps)
+      local rectifiedNegFeatureMaps = nn.ReLU():forward(-featureMaps)
+
+      mytester:asserteq(concatenatedFeatureMaps:size(1), featureMaps:size(1) * 2,
+                      "CReLU should double the number of feature maps")
+
+      for i =  1, rectifiedFeatureMaps:size(1) do
+         local found = false
+         for j = 1, concatenatedFeatureMaps:size(1) do
+            found =  found or rectifiedFeatureMaps[i]:equal(concatenatedFeatureMaps[j])
+         end
+         mytester:assert(found, "Original (rectified) feature maps should be in the output of CReLU")
+      end
+
+      for i = 1, rectifiedNegFeatureMaps:size(1) do
+         local found = false
+         for j = 1, concatenatedFeatureMaps:size(1) do
+            found = found or rectifiedNegFeatureMaps[i]:equal(concatenatedFeatureMaps[j])
+         end
+         mytester:assert(found, "The negative of the original (rectified) feature maps should be in the output of CReLU")
+      end
+   end
+
+   local model = nn.Sequential()
+   model:add(nn.SpatialConvolution(1, 3, 3, 3, 1, 1, 1, 1))
+
+   for _, inplace in pairs({true, false}) do
+      --batched
+      local crelu = nn.CReLU(3, inplace)
+      local input = torch.Tensor(2, 1, 20, 20):uniform()
+      local featureMaps = model:forward(input)
+      local concatenatedFeatureMaps = crelu:forward(featureMaps)
+      for i = 1, input:size(1) do
+         _verifyCReLU(featureMaps[i], concatenatedFeatureMaps[i])
+      end
+
+      --non-batched
+      local input = torch.Tensor(1, 20, 20):uniform()
+      local featureMaps = model:forward(input)
+      local concatenatedFeatureMaps = crelu:forward(featureMaps)
+      _verifyCReLU(featureMaps, concatenatedFeatureMaps)
+   end
+
+   --test gradients w.r.t input
+   local jac = nn.Jacobian
+
+   for _, inplace in pairs({true, false}) do
+      local crelu = nn.CReLU(3, inplace)
+      --batched
+      local input = torch.Tensor(2, 3, 20, 20):uniform()
+      local err = jac.testJacobian(crelu, input)
+      mytester:assertlt(err, precision, "error computing gradients w.r.t. inputs")
+
+      --I/O
+      local fwdErr,bkwdErr = jac.testIO(crelu,input)
+      mytester:asserteq(fwdErr, 0, torch.typename(crelu) .. " - i/o forward err ")
+      mytester:asserteq(bkwdErr, 0, torch.typename(crelu) .. " - i/o backward err ")
+
+      --non-batched
+      input = torch.Tensor(3, 20, 20):uniform()
+      err = jac.testJacobian(crelu,input)
+      mytester:assertlt(err, precision, "error computing gradients w.r.t. inputs")
+
+      --I/O
+      local fwdErr,bkwdErr = jac.testIO(crelu,input)
+      mytester:asserteq(fwdErr, 0, torch.typename(crelu) .. " - i/o forward err ")
+      mytester:asserteq(bkwdErr, 0, torch.typename(crelu) .. " - i/o backward err ")
+   end
+
+end
+
 function nntest.Exp()
    local ini = math.random(3,5)
    local inj = math.random(3,5)
@@ -1577,6 +1693,114 @@ function nntest.MSECriterion()
    criterionJacobianTest(cri, input, target)
 end
 
+function nntest.SpatialAutoCropMSECriterion()
+   -- Tests the assumptions on input and target dimensions for the
+   -- nn.SpatialAutoCropMSECriterion criterion
+   local function testInputBounds()
+      for _, average in pairs({true, false}) do
+         local sMSE = nn.SpatialAutoCropMSECriterion(average)
+
+         local input = torch.Tensor(3, 3, 3)
+         local target = torch.Tensor(4, 3, 3)
+         mytester:assertError(function() sMSE:forward(input, target) end,
+                          "Target and input must have same number of channels")
+
+         input = torch.Tensor(2, 4, 3, 3)
+         target = torch.Tensor(2, 3, 3, 3)
+         mytester:assertError(function() sMSE:forward(input, target) end,
+                        "Target and input must have same number of channels")
+
+         input = torch.Tensor(2, 3, 3, 3)
+         target = torch.Tensor(1, 3, 3, 3)
+         mytester:assertError(function() sMSE:forward(input, target) end,
+                         "Target and input must have same batch size")
+
+         input = torch.Tensor(2, 5, 5)
+         target = torch.Tensor(2, 5, 4)
+         mytester:assertError(function() sMSE:forward(input, target) end,
+                         "input resolution must be smaller than or equal to the spatial resolution of the target")
+
+         input = torch.Tensor(1, 2, 5, 5)
+         target = torch.Tensor(1, 2, 4, 5)
+         mytester:assertError(function() sMSE:forward(input, target) end,
+                         "input resolution must be smaller than or equal to the spatial resolution of the target")
+      end
+   end
+
+   -- Tests that the forward pass of nn.SpatialAutoCropMSECriterion
+   -- is equivalent to the forward pass of nn.MSECriterion with a pre-cropped target
+   local function testSpatialAutoCropMSECriterionBatched()
+      for _, average in pairs({true, false}) do
+         local sMSE = nn.SpatialAutoCropMSECriterion(average)
+         local MSE = nn.MSECriterion(average)
+
+         local batchSize = math.random(1,10)
+         local channels = math.random(1,10)
+         local inputHeight = math.random(1, 50)
+         local inputWidth = math.random(1, 50)
+         local targetHeight = inputHeight + math.random(0,5)
+         local targetWidth = inputWidth + math.random(0,5)
+
+         local input = torch.Tensor(batchSize, channels, inputHeight, inputWidth):uniform()
+         local target = torch.Tensor(batchSize, channels, targetHeight, targetWidth):uniform()
+
+         local heightStartIdx = 1 + math.floor((targetHeight - inputHeight)/2.0)
+         local heightEndIdx = heightStartIdx + inputHeight - 1
+         local widthStartIdx = 1 +  math.floor((targetWidth - inputWidth)/2.0)
+         local widthEndIdx = widthStartIdx + inputWidth - 1
+
+         local croppedTarget = target[{{}, {}, {heightStartIdx, heightEndIdx}, {widthStartIdx, widthEndIdx}}]
+
+         local sMSEOut = nn.SpatialAutoCropMSECriterion(average):forward(input, target)
+         local MSEOut = MSE:forward(input, croppedTarget)
+         mytester:asserteq(sMSEOut, MSEOut)
+
+         local gradOutput = torch.Tensor():resizeAs(croppedTarget):uniform()
+         local sMSEGradInput = sMSE:backward(input, gradOutput)
+         local MSEGradInput = MSE:backward(input, gradOutput)
+         mytester:assertTensorEq(sMSEGradInput, MSEGradInput, 1e-7)
+         criterionJacobianTest(sMSE, input, gradOutput)
+      end
+   end
+
+   local function testSpatialAutoCropMSECriterionNonBatched()
+      for _, average in pairs({true, false}) do
+         local sMSE = nn.SpatialAutoCropMSECriterion(average)
+         local MSE = nn.MSECriterion(average)
+
+         local channels = math.random(1,10)
+         local inputHeight = math.random(1, 50)
+         local inputWidth = math.random(1, 50)
+         local targetHeight = inputHeight + math.random(0,5)
+         local targetWidth = inputWidth + math.random(0,5)
+
+         local input = torch.Tensor(channels, inputHeight, inputWidth):uniform()
+         local target = torch.Tensor(channels, targetHeight, targetWidth):uniform()
+
+         local heightStartIdx = 1 + math.floor((targetHeight - inputHeight)/2.0)
+         local heightEndIdx = heightStartIdx + inputHeight - 1
+         local widthStartIdx = 1 +  math.floor((targetWidth - inputWidth)/2.0)
+         local widthEndIdx = widthStartIdx + inputWidth - 1
+
+         local croppedTarget = target[{{}, {heightStartIdx, heightEndIdx}, {widthStartIdx, widthEndIdx}}]
+
+         local sMSEOut = nn.SpatialAutoCropMSECriterion(average):forward(input, target)
+         local MSEOut = MSE:forward(input, croppedTarget)
+         mytester:asserteq(sMSEOut, MSEOut)
+
+         local gradOutput = torch.Tensor():resizeAs(croppedTarget):uniform()
+         local sMSEGradInput = sMSE:backward(input, gradOutput)
+         local MSEGradInput = MSE:backward(input, gradOutput)
+         mytester:assertTensorEq(sMSEGradInput, MSEGradInput, 1e-7)
+         criterionJacobianTest(sMSE, input, gradOutput)
+      end
+   end
+
+   testInputBounds()
+   testSpatialAutoCropMSECriterionBatched()
+   testSpatialAutoCropMSECriterionNonBatched()
+end
+
 function nntest.ClassSimplexCriterion()
    local nClasses = torch.random(3,15)
    local input = torch.rand(nClasses)
@@ -5674,6 +5898,23 @@ function nntest.Narrow()
    mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #4 gradInput err")
    mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #4 negative output err")
    mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #4 negative gradInput err")
+
+   -- check narrow negative offset
+   local input = torch.rand(3, 10, 4)
+   local output = input:narrow(2, 1, 3)
+   local gradOutput = torch.rand(3, 3, 4)
+   local gradInput = torch.zeros(3, 10, 4)
+   gradInput:narrow(2, 1, 3):copy(gradOutput)
+   local module1 = nn.Narrow(2, -1, 7)
+   local output1 = module1:forward(input)
+   local gradInput1 = module1:backward(input, gradOutput)
+   local module2 = nn.Narrow(2, 1, 3)
+   local output2 = module2:forward(input)
+   local gradInput2 = module2:backward(input, gradOutput)
+   mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #5 output err")
+   mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #5 gradInput err")
+   mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #5 negative output err")
+   mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #5 negative gradInput err")
 end
 
 function nntest.NarrowTable()
@@ -6882,7 +7123,7 @@ function nntest.PixelShuffle()
    -- Sub-Pixel Convolutional Neural Network", Shi et al.
    -- This function tests for multiple batch sizes, multiple channels and multiple input dimensions (square)
    -- It also tests for normal tensors (un-batched)
-   function testPixelShuffleUpdateOutput()
+   local function testPixelShuffleUpdateOutput()
       --Test with batched input
       for h = 1, 3 do
          local batchSize = torch.round(torch.uniform(1, 3))
@@ -6930,7 +7171,7 @@ function nntest.PixelShuffle()
    -- Sub-Pixel Convolutional Neural Network", Shi et al.
    -- This function tests for multiple batch sizes, multiple channels and multiple input dimensions (square)
    -- It also tests for normal tensors (un-batched)
-   function testPixelShuffleUpdateGradInput()
+   local function testPixelShuffleUpdateGradInput()
       --Test with batched input
       for h = 1, 3 do
          local batchSize = torch.round(torch.uniform(1, 3))
@@ -6971,7 +7212,7 @@ function nntest.PixelShuffle()
      mytester:assertlt(err, precision, "error computing gradients w.r.t. inputs")
    end
 
-   function testModuleIO()
+   local function testModuleIO()
       --Test with non-batched input
       local inputDim = torch.round(torch.uniform(5, 10))
       local channels = torch.round(torch.uniform(1, 4))
@@ -7190,6 +7431,28 @@ function nntest.Cosine()
    mytester:assertTensorEq(cosine.gradWeight, cosine2.gradWeight, 0.000001, "Cosine gradWeight 2D err")
 end
 
+function nntest.DistanceRatioCriterion()
+   local sizeAverage = true
+   local crit = nn.DistanceRatioCriterion(sizeAverage)
+   local X = torch.rand(32,1):fill(1)
+   local Y = torch.rand(32,1):fill(1)
+
+   -- Unit Test updateOutput
+   local loss = crit:forward({X, Y})
+   local trueLoss = 1 + math.log(math.exp(-1) + math.exp(-1))
+   assert(math.abs(loss - trueLoss) < 0.000001,
+          "DistanceRatioCriterion forward incorrect output")
+
+   -- Unit Test updateGradInput
+   local dxdy = crit:backward({X, Y})
+   local dx = dxdy[1]
+   local dy = dxdy[2]
+   assert(math.abs(dx:sum() - 0.5) < 0.000001,
+          "DistanceRatioCriterion backward (dx) incorrect output")
+   assert(math.abs(dy:sum() + 0.5) < 0.000001,
+          "DistanceRatioCriterion backward (dy) incorrect output")
+end
+
 function nntest.ErrorHandling()
    local l = nn.Linear(1, 1)
    local p = nn.Parallel(1, 1):add(l)
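
The new tests above also document the behaviour of the added Lua modules: nn.GatedLinearUnit splits a dimension in half and multiplies the first half by the sigmoid of the second, and nn.CReLU concatenates ReLU(x) with ReLU(-x), doubling the number of feature maps. A short sketch distilled from those tests:

    require 'nn'

    -- Gated Linear Unit: out = a:cmul(torch.sigmoid(b)), where [a, b] splits the
    -- feature dimension in half (the tests also pass an explicit dimension).
    local glu = nn.GatedLinearUnit()
    local t = torch.Tensor({{1, 1}, {2, 2}, {3, 3}})
    print(glu:forward(t))                              -- 3x1: each row is v * sigmoid(v)

    -- CReLU over 3 feature maps: the output carries 6 maps, containing both
    -- ReLU(x) and ReLU(-x)
    local crelu = nn.CReLU(3)
    print(crelu:forward(torch.rand(3, 20, 20)):size()) -- 6 x 20 x 20
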
diff --git a/test/LinearTHNN.lua b/test/LinearTHNN.lua
index dc690dc..cd1529f 100644
--- a/test/LinearTHNN.lua
+++ b/test/LinearTHNN.lua
@@ -79,8 +79,10 @@ function LinearTHNN:accGradParameters(input, gradOutput, scale)
    return self.gradWeight
 end
 
--- we do not need to accumulate parameters when sharing
-LinearTHNN.sharedAccUpdateGradParameters = LinearTHNN.accUpdateGradParameters
+function LinearTHNN:sharedAccUpdateGradParameters(input, gradOutput, lr)
+   -- we do not need to accumulate parameters when sharing:
+   self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
 
 function LinearTHNN:clearState()
    if self.addBuffer then self.addBuffer:set() end

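
The LinearTHNN change replaces a method alias with a small wrapper. One plausible reading of the motivation (an assumption, not stated in the commit): assigning sharedAccUpdateGradParameters = accUpdateGradParameters captures whatever implementation exists at definition time, while the wrapper defers to defaultAccUpdateGradParameters at call time, so later overrides and the standard sharing semantics still apply. The difference in plain Lua:

    local M = {}
    function M.f() return "old" end
    M.alias = M.f                          -- snapshot: alias keeps pointing at "old"
    function M.f() return "new" end
    function M.wrapper() return M.f() end  -- resolved at call time

    print(M.f(), M.alias(), M.wrapper())   -- new   old   new
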