[lua-torch-nn] 01/08: New upstream version 0~20161116-g8e0b061
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Tue Nov 22 13:57:22 UTC 2016
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.
commit a63ea8def3ecd76c7b4356ebb906f3ad4b24fa80
Author: Zhou Mo <cdluminate at gmail.com>
Date: Thu Nov 17 09:49:52 2016 +0000
New upstream version 0~20161116-g8e0b061
---
BatchNormalization.lua | 4 +
Bilinear.lua | 5 +
CAdd.lua | 127 ++++
CMaxTable.lua | 2 +-
ClassNLLCriterion.lua | 19 +-
Concat.lua | 108 +++-
ConcatTable.lua | 5 +-
LookupTable.lua | 10 +-
Max.lua | 9 +-
Min.lua | 9 +-
Module.lua | 26 +
MultiLabelMarginCriterion.lua | 10 +
MultiMarginCriterion.lua | 14 +-
Normalize.lua | 10 +-
PairwiseDistance.lua | 2 +-
Parallel.lua | 3 +-
ParallelTable.lua | 5 +-
PixelShuffle.lua | 111 ++++
README.md | 2 +-
SpatialAdaptiveMaxPooling.lua | 9 +-
SpatialClassNLLCriterion.lua | 19 +-
SpatialCrossMapLRN.lua | 20 +-
SpatialDilatedMaxPooling.lua | 7 +-
SpatialFractionalMaxPooling.lua | 7 +-
SpatialLogSoftMax.lua | 19 +
SpatialMaxPooling.lua | 9 +-
SpatialUpSamplingBilinear.lua | 91 ++-
SpatialUpSamplingNearest.lua | 1 -
TemporalMaxPooling.lua | 7 +-
VolumetricConvolution.lua | 46 +-
VolumetricDilatedMaxPooling.lua | 11 +-
VolumetricMaxPooling.lua | 9 +-
doc/convolution.md | 9 +-
doc/criterion.md | 12 +-
doc/image/lena.jpg | Bin 0 -> 39706 bytes
doc/image/parameterflattening.png | Bin 74658 -> 0 bytes
doc/image/parameterflattening.svg | 338 -----------
doc/overview.md | 4 +-
doc/simple.md | 134 ++++-
doc/training.md | 223 +++----
doc/transfer.md | 548 +++++++++++------
init.lua | 5 +
lib/THNN/CMakeLists.txt | 1 +
lib/THNN/THNN.h | 10 +-
lib/THNN/generic/Abs.c | 1 +
lib/THNN/generic/AbsCriterion.c | 3 +-
lib/THNN/generic/BCECriterion.c | 14 +-
lib/THNN/generic/BatchNormalization.c | 9 +-
lib/THNN/generic/ClassNLLCriterion.c | 6 +-
lib/THNN/generic/DistKLDivCriterion.c | 5 +
lib/THNN/generic/ELU.c | 3 +-
lib/THNN/generic/HardShrink.c | 1 +
lib/THNN/generic/HardTanh.c | 9 +-
lib/THNN/generic/L1Cost.c | 2 +
lib/THNN/generic/LeakyReLU.c | 1 +
lib/THNN/generic/Linear.c | 110 ++++
lib/THNN/generic/LogSigmoid.c | 1 +
lib/THNN/generic/LogSoftMax.c | 70 ++-
lib/THNN/generic/LookupTable.c | 39 +-
lib/THNN/generic/MSECriterion.c | 5 +
lib/THNN/generic/MarginCriterion.c | 3 +
lib/THNN/generic/MultiLabelMarginCriterion.c | 62 +-
lib/THNN/generic/MultiMarginCriterion.c | 43 +-
lib/THNN/generic/PReLU.c | 12 +
lib/THNN/generic/RReLU.c | 1 +
lib/THNN/generic/Sigmoid.c | 1 +
lib/THNN/generic/SmoothL1Criterion.c | 4 +
lib/THNN/generic/SoftMarginCriterion.c | 4 +
lib/THNN/generic/SoftMax.c | 13 +-
lib/THNN/generic/SoftPlus.c | 1 +
lib/THNN/generic/SoftShrink.c | 1 +
lib/THNN/generic/SpatialAdaptiveMaxPooling.c | 40 +-
lib/THNN/generic/SpatialAveragePooling.c | 20 +-
lib/THNN/generic/SpatialClassNLLCriterion.c | 9 +-
lib/THNN/generic/SpatialConvolutionLocal.c | 218 +++++--
lib/THNN/generic/SpatialConvolutionMM.c | 207 ++++---
lib/THNN/generic/SpatialDilatedConvolution.c | 94 ++-
lib/THNN/generic/SpatialDilatedMaxPooling.c | 262 +++++---
lib/THNN/generic/SpatialFractionalMaxPooling.c | 36 +-
lib/THNN/generic/SpatialFullConvolution.c | 73 ++-
lib/THNN/generic/SpatialMaxPooling.c | 4 +-
lib/THNN/generic/SpatialMaxUnpooling.c | 59 +-
lib/THNN/generic/SpatialReflectionPadding.c | 15 +-
lib/THNN/generic/SpatialReplicationPadding.c | 16 +-
lib/THNN/generic/SpatialSubSampling.c | 44 +-
lib/THNN/generic/SpatialUpSamplingBilinear.c | 157 +++--
lib/THNN/generic/SpatialUpSamplingNearest.c | 60 +-
lib/THNN/generic/Sqrt.c | 1 +
lib/THNN/generic/Square.c | 1 +
lib/THNN/generic/THNN.h | 87 ++-
lib/THNN/generic/Tanh.c | 1 +
lib/THNN/generic/TemporalConvolution.c | 18 +-
lib/THNN/generic/TemporalMaxPooling.c | 30 +-
lib/THNN/generic/Threshold.c | 1 +
lib/THNN/generic/VolumetricAveragePooling.c | 15 +-
lib/THNN/generic/VolumetricConvolution.c | 23 +-
lib/THNN/generic/VolumetricConvolutionMM.c | 57 +-
lib/THNN/generic/VolumetricDilatedConvolution.c | 30 +-
lib/THNN/generic/VolumetricDilatedMaxPooling.c | 50 +-
lib/THNN/generic/VolumetricFullConvolution.c | 33 +-
lib/THNN/generic/VolumetricMaxPooling.c | 4 +-
lib/THNN/generic/VolumetricMaxUnpooling.c | 67 ++-
lib/THNN/generic/VolumetricReplicationPadding.c | 20 +-
lib/THNN/init.c | 61 ++
test.lua | 767 ++++++++++++++++++------
test/LinearTHNN.lua | 94 +++
106 files changed, 3429 insertions(+), 1689 deletions(-)
diff --git a/BatchNormalization.lua b/BatchNormalization.lua
index 578f441..1cd30aa 100644
--- a/BatchNormalization.lua
+++ b/BatchNormalization.lua
@@ -208,3 +208,7 @@ function BN:clearState()
})
return parent.clearState(self)
end
+
+function BN:__tostring__()
+ return string.format('%s (%dD) (%d)', torch.type(self), self.nDim, self.running_mean:nElement())
+end
diff --git a/Bilinear.lua b/Bilinear.lua
index 5527686..7aa9d99 100644
--- a/Bilinear.lua
+++ b/Bilinear.lua
@@ -84,6 +84,11 @@ function Bilinear:updateGradInput(input, gradOutput)
assert(self)
if self.gradInput then
self:__assertInputGradOutput(input, gradOutput)
+
+ if #self.gradInput == 0 then
+ for i = 1, 2 do self.gradInput[i] = input[1].new() end
+ end
+
-- compute d output / d input:
self.gradInput[1]:resizeAs(input[1]):fill(0)
self.gradInput[2]:resizeAs(input[2]):fill(0)
diff --git a/CAdd.lua b/CAdd.lua
new file mode 100644
index 0000000..1d7b457
--- /dev/null
+++ b/CAdd.lua
@@ -0,0 +1,127 @@
+local CAdd, parent = torch.class("nn.CAdd", "nn.Module")
+
+function CAdd:__init(...)
+ parent.__init(self)
+
+ local arg = {...}
+
+ self.size = torch.LongStorage()
+ local n = #arg
+ if n == 1 and torch.type(arg[1]) == 'torch.LongStorage' then
+ self.size:resize(#arg[1]):copy(arg[1])
+ else
+ self.size:resize(n)
+ for i=1,n do
+ self.size[i] = arg[i]
+ end
+ end
+
+ self.bias = torch.Tensor(self.size)
+ self.gradBias = torch.Tensor(self.size)
+
+ self.output:resize(self.size)
+
+ self:reset()
+end
+
+function CAdd:reset(stdv)
+ if stdv then
+ --std of uniform distribution on interval [-a,a] = a/sqrt(3)
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1.0/math.sqrt(self.bias:nElement())
+ end
+ self.bias:uniform(-stdv,stdv)
+end
+
+function CAdd:updateOutput(input)
+ self._output = self._output or input.new()
+ self._bias = self._bias or input.new()
+ self._expand = self._expand or input.new()
+ self._repeat = self._repeat or input.new()
+
+ self.output:resizeAs(input):copy(input)
+ if input:nElement() == self.bias:nElement() then
+ self.output:add(self.bias)
+ else
+ if self.bias:dim() == input:dim() then
+ self._output:set(self.output)
+ self._bias:set(self.bias)
+ else
+ local batchSize = input:size(1)
+ self._output:view(self.output, batchSize, -1)
+ self._bias:view(self.bias, 1, -1)
+ end
+
+ self._expand:expandAs(self._bias, self._output)
+
+ --expandAs uses stride 0 and self._expand is not contiguous
+ --cuda ops may assume contiguous input
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._output:add(self._repeat)
+ else
+ self._output:add(self._expand)
+ end
+ end
+
+ return self.output
+end
+
+function CAdd:updateGradInput(input, gradOutput)
+ self.gradInput = self.gradInput or input.new()
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+
+ return self.gradInput
+end
+
+function CAdd:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+
+ self._gradBias = self._gradBias or gradOutput.new()
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._repeat = self._repeat or gradOutput.new()
+
+ if self.bias:nElement() == gradOutput:nElement() then
+ self.gradBias:add(scale, gradOutput)
+ else
+ if self.bias:dim() == gradOutput:dim() then
+ self._gradBias:set(self.gradBias)
+ self._gradOutput:set(gradOutput)
+ else
+ local batchSize = input:size(1)
+ self._gradBias:view(self.gradBias, 1, -1)
+ self._gradOutput:view(gradOutput, batchSize, -1)
+ end
+
+ self._gradBias:expandAs(self._gradBias, self._gradOutput)
+
+ --expandAs uses stride 0 and self._gradBias is not contiguous
+ --cuda ops may assume contiguous input
+ if torch.type(self._gradBias) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._gradBias):copy(self._gradBias)
+ self._repeat:add(scale, self._gradOutput)
+ self._gradBias:copy(self._repeat)
+ else
+ self._gradBias:add(scale, self._gradOutput)
+ end
+ end
+end
+
+function CAdd:type(type, tensorCache)
+ if type then
+ self:clearState()
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function CAdd:clearState()
+ nn.utils.clear(self, {
+ '_gradBias',
+ '_expand',
+ '_output',
+ '_bias',
+ '_repeat'
+ })
+ return parent.clearState(self)
+end
diff --git a/CMaxTable.lua b/CMaxTable.lua
index 3907faf..62cede9 100644
--- a/CMaxTable.lua
+++ b/CMaxTable.lua
@@ -19,7 +19,7 @@ end
function CMaxTable:updateGradInput(input, gradOutput)
for i=1,#input do
- self.gradInput[i] = torch.Tensor()
+ self.gradInput[i] = input[i].new()
self.gradInput[i]:resizeAs(input[i]):fill(0.0)
local mask = torch.eq(self.maxIdx, i)
self.gradInput[i]:maskedCopy(mask, gradOutput[mask])
diff --git a/ClassNLLCriterion.lua b/ClassNLLCriterion.lua
index 8e8acbf..1d3f3b7 100644
--- a/ClassNLLCriterion.lua
+++ b/ClassNLLCriterion.lua
@@ -28,12 +28,14 @@ end
function ClassNLLCriterion:updateOutput(input, target)
if type(target) == 'number' then
- if input:type() ~= 'torch.CudaTensor' then
- self.target = self.target:long()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
end
self.target[1] = target
- elseif target:type() == 'torch.CudaTensor' then
- self.target = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
else
self.target = target:long()
end
@@ -52,9 +54,14 @@ end
function ClassNLLCriterion:updateGradInput(input, target)
if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
self.target[1] = target
- elseif target:type() == 'torch.CudaTensor' then
- self.target = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
else
self.target = target:long()
end
diff --git a/Concat.lua b/Concat.lua
index 108b216..f0cc9f1 100644
--- a/Concat.lua
+++ b/Concat.lua
@@ -30,56 +30,97 @@ function Concat:updateOutput(input)
return self.output
end
-function Concat:updateGradInput(input, gradOutput)
- self.gradInput:resizeAs(input)
+local function retable(t1, t2, f)
+ for k, v in ipairs(t2) do
+ if (torch.type(v) == "table") then
+ t1[k] = retable(t1[k] or {}, t2[k], f)
+ else
+ f(t1, k, v)
+ end
+ end
+ for i=#t2+1, #t1 do
+ t1[i] = nil
+ end
+ return t1
+end
- local offset = 1
- for i,module in ipairs(self.modules) do
- local currentOutput = module.output
- local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', input, gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)))
+local function backward(self, method, input, gradOutput, scale)
+ local isTable = torch.type(input) == 'table'
+ local wasTable = torch.type(self.gradInput) == 'table'
+ scale = scale or 1
- if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
- if i==1 then
- self.gradInput:copy(currentGradInput)
+ if isTable then
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local currentGradInput = self:rethrowErrors(module, i, method, input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
+ if torch.type(currentGradInput) ~= 'table' then
+ error"currentGradInput is not a table!"
+ end
+ if #input ~= #currentGradInput then
+ error("table size mismatch: "..#input.." ~= "..#currentGradInput)
+ end
+ if i == 1 then
+ self.gradInput = wasTable and self.gradInput or {}
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ t[k] = t[k] or v:clone()
+ t[k]:resizeAs(v)
+ t[k]:copy(v)
+ end
+ )
else
- self.gradInput:add(currentGradInput)
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ if t[k] then
+ t[k]:add(v)
+ else
+ t[k] = v:clone()
+ end
+ end
+ )
end
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ else
+ self.gradInput = (not wasTable) and self.gradInput:resizeAs(input) or input:clone()
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local currentGradInput = self:rethrowErrors(module, i, method, input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
+ if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
+ if i==1 then
+ self.gradInput:copy(currentGradInput)
+ else
+ self.gradInput:add(currentGradInput)
+ end
+ end
+ offset = offset + currentOutput:size(self.dimension)
end
- offset = offset + currentOutput:size(self.dimension)
end
return self.gradInput
end
-function Concat:accGradParameters(input, gradOutput, scale)
- scale = scale or 1
- local offset = 1
- for i,module in ipairs(self.modules) do
- local currentOutput = module.output
- self:rethrowErrors(module, i, 'accGradParameters',
- input,
- gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
- scale)
- offset = offset + currentOutput:size(self.dimension)
- end
+function Concat:updateGradInput(input, gradOutput)
+ return backward(self, 'updateGradInput', input, gradOutput)
end
function Concat:backward(input, gradOutput, scale)
- self.gradInput:resizeAs(input)
+ return backward(self, 'backward', input, gradOutput, scale)
+end
+
+function Concat:accGradParameters(input, gradOutput, scale)
scale = scale or 1
local offset = 1
for i,module in ipairs(self.modules) do
local currentOutput = module.output
- local currentGradInput = self:rethrowErrors(module, i, 'backward', input, gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
- if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
- if i==1 then
- self.gradInput:copy(currentGradInput)
- else
- self.gradInput:add(currentGradInput)
- end
- end
+ self:rethrowErrors(module, i, 'accGradParameters', input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
+ scale)
offset = offset + currentOutput:size(self.dimension)
end
- return self.gradInput
end
function Concat:accUpdateGradParameters(input, gradOutput, lr)
@@ -98,6 +139,7 @@ function Concat:__tostring__()
local tab = ' '
local line = '\n'
local next = ' |`-> '
+ local lastNext = ' `-> '
local ext = ' | '
local extlast = ' '
local last = ' ... -> '
@@ -105,7 +147,7 @@ function Concat:__tostring__()
str = str .. ' {' .. line .. tab .. 'input'
for i=1,#self.modules do
if i == #self.modules then
- str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
else
str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
end
diff --git a/ConcatTable.lua b/ConcatTable.lua
index cb08de0..b1d904f 100644
--- a/ConcatTable.lua
+++ b/ConcatTable.lua
@@ -99,14 +99,15 @@ function ConcatTable:__tostring__()
local tab = ' '
local line = '\n'
local next = ' |`-> '
+ local lastNext = ' `-> '
local ext = ' | '
local extlast = ' '
local last = ' ... -> '
local str = torch.type(self)
str = str .. ' {' .. line .. tab .. 'input'
for i=1,#self.modules do
- if i == self.modules then
- str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
else
str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
end
diff --git a/LookupTable.lua b/LookupTable.lua
index cf9c687..8ca7ddb 100644
--- a/LookupTable.lua
+++ b/LookupTable.lua
@@ -146,12 +146,12 @@ end
function LookupTable:type(type, tensorCache)
parent.type(self, type, tensorCache)
- if type == 'torch.CudaTensor' then
+ if type and type:find('torch%.Cuda.*Tensor') then
-- CUDA uses _sorted and _indices temporary tensors
- self._sorted = torch.CudaLongTensor.new()
- self._indices = torch.CudaLongTensor.new()
- self._count = torch.CudaLongTensor.new()
- self._input = torch.CudaLongTensor.new()
+ self._sorted = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._count = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._input = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
else
-- self._count and self._input should only be converted if using Cuda
self._count = torch.IntTensor()
diff --git a/Max.lua b/Max.lua
index 1392d8a..2aa67d3 100644
--- a/Max.lua
+++ b/Max.lua
@@ -20,8 +20,13 @@ end
function Max:_lazyInit()
self._output = self._output or self.output.new()
- self._indices = self._indices or
- (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaLongTensor() or torch.LongTensor())
+ if not self._indices then
+ if torch.type(self.output) == 'torch.CudaTensor' then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
end
function Max:updateOutput(input)
diff --git a/Min.lua b/Min.lua
index dc07cf9..252f52e 100644
--- a/Min.lua
+++ b/Min.lua
@@ -20,8 +20,13 @@ end
function Min:_lazyInit()
self._output = self._output or self.output.new()
- self._indices = self._indices or
- (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaLongTensor() or torch.LongTensor())
+ if not self._indices then
+ if torch.type(self.output) == 'torch.CudaTensor' then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
end
function Min:updateOutput(input)
diff --git a/Module.lua b/Module.lua
index 19e2416..c1a0328 100644
--- a/Module.lua
+++ b/Module.lua
@@ -102,12 +102,38 @@ function Module:share(mlp, ...)
return self
end
+local function sharedWrite(...)
+ local arg = {...}
+ local shared = {}
+ for i,v in ipairs(arg) do
+ shared[v] = true
+ end
+ return function(self, file)
+ local object = {}
+ for k, v in pairs(self) do
+ if shared[k] then
+ assert(torch.isTensor(v), 'Shared parameters have to be Tensors')
+ object[k] = v.new()
+ else
+ object[k] = v
+ end
+ end
+ file:writeObject(object)
+ end
+end
+
function Module:clone(...)
+ local oldWrite = nn.Module.write
+ nn.Module.write = sharedWrite(...)
+
local f = torch.MemoryFile("rw"):binary()
f:writeObject(self)
f:seek(1)
local clone = f:readObject()
f:close()
+
+ nn.Module.write = oldWrite
+
if select('#',...) > 0 then
clone:share(self,...)
end
diff --git a/MultiLabelMarginCriterion.lua b/MultiLabelMarginCriterion.lua
index a0b2a9c..908b613 100644
--- a/MultiLabelMarginCriterion.lua
+++ b/MultiLabelMarginCriterion.lua
@@ -7,6 +7,11 @@ function MultiLabelMarginCriterion:__init()
end
function MultiLabelMarginCriterion:updateOutput(input, target)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
self.output_tensor = self.output_tensor or input.new(1)
input.THNN.MultiLabelMarginCriterion_updateOutput(
input:cdata(),
@@ -20,6 +25,11 @@ function MultiLabelMarginCriterion:updateOutput(input, target)
end
function MultiLabelMarginCriterion:updateGradInput(input, target)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
input.THNN.MultiLabelMarginCriterion_updateGradInput(
input:cdata(),
target:cdata(),
diff --git a/MultiMarginCriterion.lua b/MultiMarginCriterion.lua
index 1a22bde..e312238 100644
--- a/MultiMarginCriterion.lua
+++ b/MultiMarginCriterion.lua
@@ -16,10 +16,15 @@ end
function MultiMarginCriterion:updateOutput(input, target)
-- backward compatibility
if not torch.isTensor(target) then
- self.target_tensor = self.target_tensor or input.new(1)
+ self.target_tensor = self.target_tensor or torch.LongTensor(1)
self.target_tensor[1] = target
target = self.target_tensor
end
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
self.p = self.p or 1
self.output_tensor = self.output_tensor or input.new(1)
input.THNN.MultiMarginCriterion_updateOutput(
@@ -37,10 +42,15 @@ end
function MultiMarginCriterion:updateGradInput(input, target)
if not torch.isTensor(target) then
- self.target_tensor = self.target_tensor or input.new(1)
+ self.target_tensor = self.target_tensor or torch.LongTensor(1)
self.target_tensor[1] = target
target = self.target_tensor
end
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
input.THNN.MultiMarginCriterion_updateGradInput(
input:cdata(),
target:cdata(),
diff --git a/Normalize.lua b/Normalize.lua
index 5cd4857..b6d1298 100644
--- a/Normalize.lua
+++ b/Normalize.lua
@@ -23,9 +23,13 @@ function Normalize:updateOutput(input)
if self.p == math.huge then
-- specialization for the infinity norm
- self._indices = self._indices or
- (torch.type(self.output) == 'torch.CudaTensor' and
- torch.CudaLongTensor() or torch.LongTensor())
+ if not self._indices then
+ if torch.type(self.output) == 'torch.CudaTensor' then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
self.buffer:abs(input)
torch.max(self.norm, self._indices, self.buffer, 2)
diff --git a/PairwiseDistance.lua b/PairwiseDistance.lua
index d5022a7..6bf43c5 100644
--- a/PairwiseDistance.lua
+++ b/PairwiseDistance.lua
@@ -6,7 +6,7 @@ function PairwiseDistance:__init(p)
-- state
self.gradInput = {}
self.diff = torch.Tensor()
- self.norm = p
+ self.norm = p or 2 -- Default using Euclidean distance
end
function PairwiseDistance:updateOutput(input)
diff --git a/Parallel.lua b/Parallel.lua
index 7d2b4f1..58cb974 100644
--- a/Parallel.lua
+++ b/Parallel.lua
@@ -97,6 +97,7 @@ function Parallel:__tostring__()
local tab = ' '
local line = '\n'
local next = ' |`-> '
+ local lastNext = ' `-> '
local ext = ' | '
local extlast = ' '
local last = ' ... -> '
@@ -104,7 +105,7 @@ function Parallel:__tostring__()
str = str .. ' {' .. line .. tab .. 'input'
for i=1,#self.modules do
if i == #self.modules then
- str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
else
str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
end
diff --git a/ParallelTable.lua b/ParallelTable.lua
index 6de9534..2fe0899 100644
--- a/ParallelTable.lua
+++ b/ParallelTable.lua
@@ -39,14 +39,15 @@ function ParallelTable:__tostring__()
local tab = ' '
local line = '\n'
local next = ' |`-> '
+ local lastNext = ' `-> '
local ext = ' | '
local extlast = ' '
local last = ' ... -> '
local str = torch.type(self)
str = str .. ' {' .. line .. tab .. 'input'
for i=1,#self.modules do
- if i == self.modules then
- str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
else
str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
end
diff --git a/PixelShuffle.lua b/PixelShuffle.lua
new file mode 100644
index 0000000..dd58ed9
--- /dev/null
+++ b/PixelShuffle.lua
@@ -0,0 +1,111 @@
+local PixelShuffle, parent = torch.class("nn.PixelShuffle", "nn.Module")
+
+-- Shuffles pixels after upscaling with a ESPCNN model
+-- Converts a [batch x channel*r^2 x m x p] tensor to [batch x channel x r*m x r*p]
+-- tensor, where r is the upscaling factor.
+-- @param upscaleFactor - the upscaling factor to use
+function PixelShuffle:__init(upscaleFactor)
+ parent.__init(self)
+ self.upscaleFactor = upscaleFactor
+ self.upscaleFactorSquared = self.upscaleFactor * self.upscaleFactor
+end
+
+-- Computes the forward pass of the layer i.e. Converts a
+-- [batch x channel*r^2 x m x p] tensor to [batch x channel x r*m x r*p] tensor.
+-- @param input - the input tensor to be shuffled of size [b x c*r^2 x m x p]
+-- @return output - the shuffled tensor of size [b x c x r*m x r*p]
+function PixelShuffle:updateOutput(input)
+ self._intermediateShape = self._intermediateShape or torch.LongStorage(6)
+ self._outShape = self.outShape or torch.LongStorage()
+ self._shuffleOut = self._shuffleOut or input.new()
+
+ local batched = false
+ local batchSize = 1
+ local inputStartIdx = 1
+ local outShapeIdx = 1
+ if input:nDimension() == 4 then
+ batched = true
+ batchSize = input:size(1)
+ inputStartIdx = 2
+ outShapeIdx = 2
+ self._outShape:resize(4)
+ self._outShape[1] = batchSize
+ else
+ self._outShape:resize(3)
+ end
+
+ --input is of size h/r w/r, rc output should be h, r, c
+ local channels = input:size(inputStartIdx) / self.upscaleFactorSquared
+ local inHeight = input:size(inputStartIdx + 1)
+ local inWidth = input:size(inputStartIdx + 2)
+
+ self._intermediateShape[1] = batchSize
+ self._intermediateShape[2] = channels
+ self._intermediateShape[3] = self.upscaleFactor
+ self._intermediateShape[4] = self.upscaleFactor
+ self._intermediateShape[5] = inHeight
+ self._intermediateShape[6] = inWidth
+
+ self._outShape[outShapeIdx] = channels
+ self._outShape[outShapeIdx + 1] = inHeight * self.upscaleFactor
+ self._outShape[outShapeIdx + 2] = inWidth * self.upscaleFactor
+
+ local inputView = torch.view(input, self._intermediateShape)
+
+ self._shuffleOut:resize(inputView:size(1), inputView:size(2), inputView:size(5),
+ inputView:size(3), inputView:size(6), inputView:size(4))
+ self._shuffleOut:copy(inputView:permute(1, 2, 5, 3, 6, 4))
+
+ self.output = torch.view(self._shuffleOut, self._outShape)
+
+ return self.output
+end
+
+-- Computes the backward pass of the layer, given the gradient w.r.t. the output
+-- this function computes the gradient w.r.t. the input.
+-- @param input - the input tensor of shape [b x c*r^2 x m x p]
+-- @param gradOutput - the tensor with the gradients w.r.t. output of shape [b x c x r*m x r*p]
+-- @return gradInput - a tensor of the same shape as input, representing the gradient w.r.t. input.
+function PixelShuffle:updateGradInput(input, gradOutput)
+ self._intermediateShape = self._intermediateShape or torch.LongStorage(6)
+ self._shuffleIn = self._shuffleIn or input.new()
+
+ local batchSize = 1
+ local inputStartIdx = 1
+ if input:nDimension() == 4 then
+ batchSize = input:size(1)
+ inputStartIdx = 2
+ end
+
+ local channels = input:size(inputStartIdx) / self.upscaleFactorSquared
+ local height = input:size(inputStartIdx + 1)
+ local width = input:size(inputStartIdx + 2)
+
+ self._intermediateShape[1] = batchSize
+ self._intermediateShape[2] = channels
+ self._intermediateShape[3] = height
+ self._intermediateShape[4] = self.upscaleFactor
+ self._intermediateShape[5] = width
+ self._intermediateShape[6] = self.upscaleFactor
+
+ local gradOutputView = torch.view(gradOutput, self._intermediateShape)
+
+ self._shuffleIn:resize(gradOutputView:size(1), gradOutputView:size(2), gradOutputView:size(4),
+ gradOutputView:size(6), gradOutputView:size(3), gradOutputView:size(5))
+ self._shuffleIn:copy(gradOutputView:permute(1, 2, 4, 6, 3, 5))
+
+ self.gradInput = torch.view(self._shuffleIn, input:size())
+
+ return self.gradInput
+end
+
+
+function PixelShuffle:clearState()
+ nn.utils.clear(self, {
+ "_intermediateShape",
+ "_outShape",
+ "_shuffleIn",
+ "_shuffleOut",
+ })
+ return parent.clearState(self)
+end
diff --git a/README.md b/README.md
index e848fd8..378a440 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,6 @@ This package provides an easy and modular way to build and train simple or compl
* [`ClassNLLCriterion`](doc/criterion.md#nn.ClassNLLCriterion): the Negative Log Likelihood criterion used for classification;
* Additional documentation:
* [Overview](doc/overview.md#nn.overview.dok) of the package essentials including modules, containers and training;
- * [Training](doc/training.md#nn.traningneuralnet.dok): how to train a neural network using [optim](https://github.com/torch/optim);
+ * [Training](doc/training.md#nn.traningneuralnet.dok): how to train a neural network using [`StochasticGradient`](doc/training.md#nn.StochasticGradient);
* [Testing](doc/testing.md): how to test your modules.
* [Experimental Modules](https://github.com/clementfarabet/lua---nnx/blob/master/README.md): a package containing experimental modules and criteria.
diff --git a/SpatialAdaptiveMaxPooling.lua b/SpatialAdaptiveMaxPooling.lua
index 74d4cd6..b78261c 100644
--- a/SpatialAdaptiveMaxPooling.lua
+++ b/SpatialAdaptiveMaxPooling.lua
@@ -2,13 +2,18 @@ local SpatialAdaptiveMaxPooling, parent = torch.class('nn.SpatialAdaptiveMaxPool
function SpatialAdaptiveMaxPooling:__init(W, H)
parent.__init(self)
-
+
self.W = W
self.H = H
end
function SpatialAdaptiveMaxPooling:updateOutput(input)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
input.THNN.SpatialAdaptiveMaxPooling_updateOutput(
input:cdata(),
self.output:cdata(),
diff --git a/SpatialClassNLLCriterion.lua b/SpatialClassNLLCriterion.lua
index 8652e88..fbd3674 100644
--- a/SpatialClassNLLCriterion.lua
+++ b/SpatialClassNLLCriterion.lua
@@ -28,12 +28,14 @@ end
function SpatialClassNLLCriterion:updateOutput(input, target)
if type(target) == 'number' then
- if input:type() ~= 'torch.CudaTensor' then
- self.target = self.target:long()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
end
self.target[1] = target
- elseif target:type() == 'torch.CudaTensor' then
- self.target = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
else
self.target = target:long()
end
@@ -52,9 +54,14 @@ end
function SpatialClassNLLCriterion:updateGradInput(input, target)
if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
self.target[1] = target
- elseif target:type() == 'torch.CudaTensor' then
- self.target = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
else
self.target = target:long()
end
diff --git a/SpatialCrossMapLRN.lua b/SpatialCrossMapLRN.lua
index 9758c79..088eb07 100644
--- a/SpatialCrossMapLRN.lua
+++ b/SpatialCrossMapLRN.lua
@@ -15,7 +15,7 @@ function SpatialCrossMapLRN:updateOutput(input)
self.scale = self.scale or input.new()
- if torch.type(input) == 'torch.CudaTensor' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
input.THNN.SpatialCrossMapLRN_updateOutput(
input:cdata(),
self.output:cdata(),
@@ -33,9 +33,9 @@ function SpatialCrossMapLRN:updateOutput(input)
end
local batchSize = input:size(1)
- local channels = input:size(2)
- local inputHeight = input:size(3)
- local inputWidth = input:size(4)
+ local channels = input:size(2)
+ local inputHeight = input:size(3)
+ local inputWidth = input:size(4)
self.output:resizeAs(input)
self.scale:resizeAs(input)
@@ -43,7 +43,7 @@ function SpatialCrossMapLRN:updateOutput(input)
-- use output storage as temporary buffer
local inputSquare = self.output
inputSquare:pow(input, 2)
-
+
local prePad = (self.size - 1)/2 + 1
local prePadCrop = prePad > channels and channels or prePad
@@ -86,8 +86,8 @@ end
function SpatialCrossMapLRN:updateGradInput(input, gradOutput)
assert(input:dim() == 3 or input:dim() == 4,
'Input must be 3D or 4D')
-
- if torch.type(input) == 'torch.CudaTensor' then
+
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
input.THNN.SpatialCrossMapLRN_updateGradInput(
input:cdata(),
gradOutput:cdata(),
@@ -109,9 +109,9 @@ function SpatialCrossMapLRN:updateGradInput(input, gradOutput)
end
local batchSize = input:size(1)
- local channels = input:size(2)
- local inputHeight = input:size(3)
- local inputWidth = input:size(4)
+ local channels = input:size(2)
+ local inputHeight = input:size(3)
+ local inputWidth = input:size(4)
self.paddedRatio = self.paddedRatio or input.new()
self.accumRatio = self.accumRatio or input.new()
diff --git a/SpatialDilatedMaxPooling.lua b/SpatialDilatedMaxPooling.lua
index 2f0eba0..34525a4 100644
--- a/SpatialDilatedMaxPooling.lua
+++ b/SpatialDilatedMaxPooling.lua
@@ -9,7 +9,12 @@ function SpatialDilatedMaxPooling:__init(kW, kH, dW, dH, padW, padH, dilationW,
end
function SpatialDilatedMaxPooling:updateOutput(input)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
local dims = input:dim()
self.iheight = input:size(dims-1)
diff --git a/SpatialFractionalMaxPooling.lua b/SpatialFractionalMaxPooling.lua
index f5d8076..884751d 100644
--- a/SpatialFractionalMaxPooling.lua
+++ b/SpatialFractionalMaxPooling.lua
@@ -114,7 +114,12 @@ function SpatialFractionalMaxPooling:fixPoolingRegions(val)
end
function SpatialFractionalMaxPooling:updateOutput(input)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
self:initSampleBuffer_(input)
local outW, outH = self:getOutputSizes_(input)
diff --git a/SpatialLogSoftMax.lua b/SpatialLogSoftMax.lua
new file mode 100644
index 0000000..9c81d49
--- /dev/null
+++ b/SpatialLogSoftMax.lua
@@ -0,0 +1,19 @@
+local SpatialLogSoftMax = torch.class('nn.SpatialLogSoftMax', 'nn.Module')
+
+function SpatialLogSoftMax:updateOutput(input)
+ input.THNN.LogSoftMax_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function SpatialLogSoftMax:updateGradInput(input, gradOutput)
+ input.THNN.LogSoftMax_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua
index 8475b13..5c865c6 100644
--- a/SpatialMaxPooling.lua
+++ b/SpatialMaxPooling.lua
@@ -15,7 +15,7 @@ function SpatialMaxPooling:__init(kW, kH, dW, dH, padW, padH)
self.padH = padH or 0
self.ceil_mode = false
- self.indices = torch.Tensor()
+ self.indices = torch.LongTensor()
end
function SpatialMaxPooling:ceil()
@@ -29,7 +29,12 @@ function SpatialMaxPooling:floor()
end
function SpatialMaxPooling:updateOutput(input)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
local dims = input:dim()
self.iheight = input:size(dims-1)
diff --git a/SpatialUpSamplingBilinear.lua b/SpatialUpSamplingBilinear.lua
index d911eae..8f19f91 100644
--- a/SpatialUpSamplingBilinear.lua
+++ b/SpatialUpSamplingBilinear.lua
@@ -8,21 +8,28 @@ input planes.
The Y and X dimensions are assumed to be the last 2 tensor dimensions. For
instance, if the tensor is 4D, then dim 3 is the y dimension and dim 4 is the x.
-scale_factor is assumed to be a positive integer.
+scale_factor is assumed to be a positive integer.
owidth = (width-1)*(scale_factor-1) + width
oheight = (height-1)*(scale_factor-1) + height
+
+Alternatively, owidth and oheight can be directly provided as input.
--]]
-function SpatialUpSamplingBilinear:__init(scale_factor)
+function SpatialUpSamplingBilinear:__init(params)
parent.__init(self)
- self.scale_factor = scale_factor
- if self.scale_factor < 1 then
- error('scale_factor must be greater than 1')
- end
- if math.floor(self.scale_factor) ~= self.scale_factor then
- error('scale_factor must be integer')
+ self.owidth, self.oheight, self.scale_factor = nil, nil, nil
+ if torch.type(params) == 'table' then
+ self.owidth, self.oheight = params.owidth, params.oheight
+ else
+ self.scale_factor = params
+ if self.scale_factor < 1 then
+ error('scale_factor must be greater than 1')
+ end
+ if math.floor(self.scale_factor) ~= self.scale_factor then
+ error('scale_factor must be integer')
+ end
end
self.inputSize = torch.LongStorage(4)
self.outputSize = torch.LongStorage(4)
@@ -44,32 +51,40 @@ local function makeContiguous(self, input, gradOutput)
return input, gradOutput
end
+function SpatialUpSamplingBilinear:setSize(input)
+ local xdim = input:dim()
+ local ydim = xdim - 1
+ for i = 1, input:dim() do
+ self.inputSize[i] = input:size(i)
+ self.outputSize[i] = input:size(i)
+ end
+ if self.scale_factor ~= nil then
+ self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
+ self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
+ else
+ self.outputSize[ydim] = self.oheight
+ self.outputSize[xdim] = self.owidth
+ end
+end
+
function SpatialUpSamplingBilinear:updateOutput(input)
assert(input:dim() == 4 or input:dim()==3,
- 'SpatialUpSamplingBilinear only support 3D or 4D tensors' )
+ 'SpatialUpSamplingBilinear only supports 3D or 4D tensors' )
+ input = makeContiguous(self, input)
local inputwas3D = false
if input:dim() == 3 then
input=input:view(-1, input:size(1), input:size(2), input:size(3))
inputwas3D = true
end
- input = makeContiguous(self, input)
- assert(input:dim() == 4)
- -- Copy the input size
local xdim = input:dim()
- local ydim = input:dim() - 1
- for i = 1, input:dim() do
- self.inputSize[i] = input:size(i)
- self.outputSize[i] = input:size(i)
- end
- self.outputSize[ydim] = (self.outputSize[ydim]-1) * (self.scale_factor-1)
- + self.outputSize[ydim]
- self.outputSize[xdim] = (self.outputSize[xdim]-1) * (self.scale_factor -1)
- + self.outputSize[xdim]
- -- Resize the output if needed
+ local ydim = xdim - 1
+ self:setSize(input)
self.output:resize(self.outputSize)
input.THNN.SpatialUpSamplingBilinear_updateOutput(
input:cdata(),
- self.output:cdata()
+ self.output:cdata(),
+ self.outputSize[ydim],
+ self.outputSize[xdim]
)
if inputwas3D then
input = input:squeeze(1)
@@ -82,19 +97,27 @@ function SpatialUpSamplingBilinear:updateGradInput(input, gradOutput)
assert(input:dim() == 4 or input:dim()==3,
'SpatialUpSamplingBilinear only support 3D or 4D tensors' )
assert(input:dim() == gradOutput:dim(),
- 'Input and gradOutput should be of same dimension' )
+ 'Input and gradOutput should be of same dimension' )
+ input, gradOutput = makeContiguous(self, input, gradOutput)
local inputwas3D = false
if input:dim() == 3 then
- input=input:view(-1, input:size(1), input:size(2), input:size(3))
- gradOutput=gradOutput:view(-1, gradOutput:size(1), gradOutput:size(2),
- gradOutput:size(3))
+ input = input:view(-1, input:size(1), input:size(2), input:size(3))
+ gradOutput = gradOutput:view(-1, gradOutput:size(1), gradOutput:size(2),
+ gradOutput:size(3))
inputwas3D = true
end
- assert(input:dim() == 4 and gradOutput:dim() == 4)
- self.gradInput:resizeAs(input)
+ local xdim = input:dim()
+ local ydim = xdim - 1
+ self.gradInput:resizeAs(input)
input.THNN.SpatialUpSamplingBilinear_updateGradInput(
gradOutput:cdata(),
- self.gradInput:cdata()
+ self.gradInput:cdata(),
+ input:size(1),
+ input:size(2),
+ input:size(3),
+ input:size(4),
+ self.outputSize[ydim],
+ self.outputSize[xdim]
)
if inputwas3D then
input = input:squeeze(1)
@@ -106,6 +129,12 @@ end
function SpatialUpSamplingBilinear:__tostring__()
- local s = string.format('%s(%d)', torch.type(self), self.scale_factor)
+ local s
+ if self.scale_factor ~= nil then
+ s = string.format('%s(%d)', torch.type(self), self.scale_factor)
+ else
+ s = string.format('%s(%d, %d)',
+ torch.type(self), self.oheight, self.owidth)
+ end
return s
end
diff --git a/SpatialUpSamplingNearest.lua b/SpatialUpSamplingNearest.lua
index c3b2330..b1b261a 100644
--- a/SpatialUpSamplingNearest.lua
+++ b/SpatialUpSamplingNearest.lua
@@ -24,7 +24,6 @@ function SpatialUpSamplingNearest:__init(scale)
end
self.inputSize = torch.LongStorage(4)
self.outputSize = torch.LongStorage(4)
- self.usage = nil
end
function SpatialUpSamplingNearest:updateOutput(input)
diff --git a/TemporalMaxPooling.lua b/TemporalMaxPooling.lua
index 91723e6..894f4a9 100644
--- a/TemporalMaxPooling.lua
+++ b/TemporalMaxPooling.lua
@@ -10,7 +10,12 @@ function TemporalMaxPooling:__init(kW, dW)
end
function TemporalMaxPooling:updateOutput(input)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
input.THNN.TemporalMaxPooling_updateOutput(
input:cdata(), self.output:cdata(),
self.indices:cdata(), self.kW, self.dW
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index e40c90a..89ce106 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -45,41 +45,10 @@ function VolumetricConvolution:reset(stdv)
end
end
-local function makeContiguous(self, input, gradOutput)
- if not input:isContiguous() then
- self._input = self._input or input.new()
- self._input:resizeAs(input):copy(input)
- input = self._input
- end
- if gradOutput then
- if not gradOutput:isContiguous() then
- self._gradOutput = self._gradOutput or gradOutput.new()
- self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
- gradOutput = self._gradOutput
- end
- end
- return input, gradOutput
-end
-
--- function to re-view the weight layout in a way that would make the MM ops happy
-local function viewWeight(self)
- self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW)
- if self.gradWeight and self.gradWeight:dim() > 0 then
- self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kT * self.kH * self.kW)
- end
-end
-
-local function unviewWeight(self)
- self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW)
- if self.gradWeight and self.gradWeight:dim() > 0 then
- self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW)
- end
-end
-
function VolumetricConvolution:updateOutput(input)
self.finput = self.finput or input.new()
self.fgradInput = self.fgradInput or input.new()
- if input:type() == 'torch.CudaTensor' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
input.THNN.VolumetricConvolution_updateOutput(
input:cdata(),
self.output:cdata(),
@@ -91,8 +60,6 @@ function VolumetricConvolution:updateOutput(input)
self.padT, self.padW, self.padH
)
else
- viewWeight(self)
- input = makeContiguous(self, input)
input.THNN.VolumetricConvolutionMM_updateOutput(
input:cdata(),
self.output:cdata(),
@@ -103,13 +70,12 @@ function VolumetricConvolution:updateOutput(input)
self.dT, self.dW, self.dH,
self.padT, self.padW, self.padH
)
- unviewWeight(self)
end
return self.output
end
function VolumetricConvolution:updateGradInput(input, gradOutput)
- if input:type() == 'torch.CudaTensor' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
input.THNN.VolumetricConvolution_updateGradInput(
input:cdata(),
gradOutput:cdata(),
@@ -122,8 +88,6 @@ function VolumetricConvolution:updateGradInput(input, gradOutput)
return self.gradInput
else
if self.gradInput then
- viewWeight(self)
- input, gradOutput = makeContiguous(self, input, gradOutput)
input.THNN.VolumetricConvolutionMM_updateGradInput(
input:cdata(),
gradOutput:cdata(),
@@ -135,14 +99,13 @@ function VolumetricConvolution:updateGradInput(input, gradOutput)
self.dT, self.dW, self.dH,
self.padT, self.padW, self.padH
)
- unviewWeight(self)
return self.gradInput
end
end
end
function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
- if input:type() == 'torch.CudaTensor' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
input.THNN.VolumetricConvolution_accGradParameters(
input:cdata(),
gradOutput:cdata(),
@@ -155,8 +118,6 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
scale or 1
)
else
- input, gradOutput = makeContiguous(self, input, gradOutput)
- viewWeight(self)
input.THNN.VolumetricConvolutionMM_accGradParameters(
input:cdata(),
gradOutput:cdata(),
@@ -165,7 +126,6 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
self.finput:cdata(),
scale or 1
)
- unviewWeight(self)
end
end
diff --git a/VolumetricDilatedMaxPooling.lua b/VolumetricDilatedMaxPooling.lua
index 050e2c9..f4c8d5b 100644
--- a/VolumetricDilatedMaxPooling.lua
+++ b/VolumetricDilatedMaxPooling.lua
@@ -16,7 +16,12 @@ function VolumetricDilatedMaxPooling:updateOutput(input)
self.iheight = input:size(dims-1)
self.iwidth = input:size(dims)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
input.THNN.VolumetricDilatedMaxPooling_updateOutput(
input:cdata(),
self.output:cdata(),
@@ -44,8 +49,8 @@ function VolumetricDilatedMaxPooling:updateGradInput(input, gradOutput)
end
function VolumetricDilatedMaxPooling:clearState()
- if self.indices then
- self.indices:set()
+ if self.indices then
+ self.indices:set()
end
return parent.clearState(self)
end
diff --git a/VolumetricMaxPooling.lua b/VolumetricMaxPooling.lua
index fd65231..20733ed 100644
--- a/VolumetricMaxPooling.lua
+++ b/VolumetricMaxPooling.lua
@@ -22,7 +22,7 @@ function VolumetricMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH)
self.ceil_mode = false
- self.indices = torch.Tensor()
+ self.indices = torch.LongTensor()
end
function VolumetricMaxPooling:ceil()
@@ -41,7 +41,12 @@ function VolumetricMaxPooling:updateOutput(input)
self.iheight = input:size(dims-1)
self.iwidth = input:size(dims)
- self.indices = self.indices or input.new()
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
input.THNN.VolumetricMaxPooling_updateOutput(
input:cdata(),
self.output:cdata(),
diff --git a/doc/convolution.md b/doc/convolution.md
index b1a0d4c..dfc48b9 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -478,8 +478,8 @@ The parameters are the following:
If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
will be `nOutputPlane x oheight x owidth` where
```lua
-owidth = floor(width + 2 * padW - dilationW * (kW-1) + 1) / dW + 1
-oheight = floor(height + 2 * padH - dilationH * (kH-1) + 1) / dH + 1
+owidth = floor(width + 2 * padW - dilationW * (kW-1) - 1) / dW + 1
+oheight = floor(height + 2 * padH - dilationH * (kH-1) - 1) / dH + 1
```
Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
@@ -750,6 +750,7 @@ Where `u` and `v` are index from 1 (as per lua convention). There are no learna
```lua
module = nn.SpatialUpSamplingBilinear(scale)
+module = nn.SpatialUpSamplingBilinear({oheight=H, owidth=W})
```
Applies a 2D up-sampling over an input image composed of several input planes. The `input` tensor in
@@ -757,8 +758,10 @@ Applies a 2D up-sampling over an input image composed of several input planes. T
The parameters are the following:
* `scale`: The upscale ratio. Must be a positive integer
+ * Or a table `{oheight=H, owidth=W}`: the required output height and width, which should be positive integers.
-The up-scaling method is bilinear, and given an input of height iH and width iW, output height and width will be:
+The up-scaling method is bilinear.
+If `scale` is specified, given an input of height iH and width iW, output height and width will be:
```lua
oH = (iH - 1)(scale - 1) + iH
oW = (iW - 1)(scale - 1) + iW
diff --git a/doc/criterion.md b/doc/criterion.md
index 270edb9..337d873 100644
--- a/doc/criterion.md
+++ b/doc/criterion.md
@@ -96,7 +96,7 @@ criterion.sizeAverage = false
criterion = nn.ClassNLLCriterion([weights])
```
-The negative log likelihood criterion. It is useful to train a classication problem with `n` classes.
+The negative log likelihood criterion. It is useful to train a classification problem with `n` classes.
If provided, the optional argument `weights` should be a 1D `Tensor` assigning weight to each of the classes.
This is particularly useful when you have an unbalanced training set.
@@ -112,10 +112,10 @@ loss(x, class) = -x[class]
```
or in the case of the `weights` argument it is specified as follows:
-
```lua
loss(x, class) = -weights[class] * x[class]
```
+Due to the behaviour of the backend code, it is necessary to set `sizeAverage` to `false` when calculating losses *in non-batch mode*.
The following is a code fragment showing how to make a gradient step given an input `x`, a desired output `y` (an integer `1` to `n`, in this case `n = 2` classes), a network `mlp` and a learning rate `learningRate`:
@@ -143,7 +143,7 @@ criterion = nn.CrossEntropyCriterion([weights])
This criterion combines [`LogSoftMax`](#nn.LogSoftMax) and [`ClassNLLCriterion`](#nn.ClassNLLCriterion) in one single class.
-It is useful to train a classication problem with `n` classes.
+It is useful to train a classification problem with `n` classes.
If provided, the optional argument `weights` should be a 1D `Tensor` assigning weight to each of the classes. This is particularly useful when you have an unbalanced training set.
The `input` given through a `forward()` is expected to contain scores for each class: `input` has to be a 1D `Tensor` of size `n`.
@@ -161,7 +161,11 @@ or in the case of the `weights` argument being specified:
```lua
loss(x, class) = weights[class] * (-x[class] + log(\sum_j exp(x[j])))
```
-
+Due to the behaviour of the backend code, it is necessary to set `sizeAverage` to `false` when calculating losses *in non-batch mode*.
+```lua
+crit = nn.CrossEntropyCriterion(weights)
+crit.nll.sizeAverage = false
+```
The losses are averaged across observations for each minibatch.
<a name="nn.ClassSimplexCriterion"/>
diff --git a/doc/image/lena.jpg b/doc/image/lena.jpg
new file mode 100644
index 0000000..d4a8c36
Binary files /dev/null and b/doc/image/lena.jpg differ
diff --git a/doc/image/parameterflattening.png b/doc/image/parameterflattening.png
deleted file mode 100644
index efab4de..0000000
Binary files a/doc/image/parameterflattening.png and /dev/null differ
diff --git a/doc/image/parameterflattening.svg b/doc/image/parameterflattening.svg
deleted file mode 100644
index d58d62f..0000000
--- a/doc/image/parameterflattening.svg
+++ /dev/null
@@ -1,338 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-
-<svg
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:cc="http://creativecommons.org/ns#"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns:svg="http://www.w3.org/2000/svg"
- xmlns="http://www.w3.org/2000/svg"
- xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
- xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
- width="275.54715mm"
- height="214.99242mm"
- viewBox="0 0 976.34814 761.78413"
- id="svg2"
- version="1.1"
- inkscape:version="0.91 r13725"
- sodipodi:docname="parameterflattening.svg"
- inkscape:export-filename="/home/ubuntu/git/nn/doc/image/parameterflattening.svg.png"
- inkscape:export-xdpi="90"
- inkscape:export-ydpi="90">
- <defs
- id="defs4" />
- <sodipodi:namedview
- id="base"
- pagecolor="#ffffff"
- bordercolor="#666666"
- borderopacity="1.0"
- inkscape:pageopacity="0.0"
- inkscape:pageshadow="2"
- inkscape:zoom="0.7"
- inkscape:cx="165.78568"
- inkscape:cy="360.0347"
- inkscape:document-units="px"
- inkscape:current-layer="layer1"
- showgrid="false"
- inkscape:window-width="1920"
- inkscape:window-height="1024"
- inkscape:window-x="0"
- inkscape:window-y="0"
- inkscape:window-maximized="1"
- fit-margin-top="0"
- fit-margin-left="0"
- fit-margin-right="0"
- fit-margin-bottom="0" />
- <metadata
- id="metadata7">
- <rdf:RDF>
- <cc:Work
- rdf:about="">
- <dc:format>image/svg+xml</dc:format>
- <dc:type
- rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
- <dc:title></dc:title>
- </cc:Work>
- </rdf:RDF>
- </metadata>
- <g
- inkscape:label="Layer 1"
- inkscape:groupmode="layer"
- id="layer1"
- transform="translate(-145.10191,-140.95261)">
- <rect
- id="rect3336"
- width="264.20071"
- height="127.05788"
- x="498.61389"
- y="212.40469"
- style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
- <rect
- id="rect3336-7"
- width="264.20071"
- height="127.05788"
- x="499.32819"
- y="384.54752"
- style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
- <rect
- id="rect3336-7-1"
- width="264.20071"
- height="127.05788"
- x="502.18533"
- y="554.54755"
- style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
- <rect
- id="rect3336-7-1-4"
- width="264.20071"
- height="127.05788"
- x="499.32816"
- y="705.97614"
- style="fill:none;stroke:#000000;stroke-width:1.08497822;stroke-opacity:1" />
- <rect
- style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-opacity:1"
- id="rect4183"
- width="18.571428"
- height="631.42859"
- x="170.00005"
- y="206.64792" />
- <rect
- style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-opacity:1"
- id="rect4185"
- width="18.571428"
- height="631.42859"
- x="207.14287"
- y="207.50507" />
- <rect
- style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187"
- width="84.285713"
- height="41.42857"
- x="518.57141"
- y="229.50507" />
- <rect
- style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-3"
- width="84.285713"
- height="41.42857"
- x="518.42853"
- y="283.07651" />
- <rect
- style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-8"
- width="84.285713"
- height="41.42857"
- x="519.35712"
- y="400.57651" />
- <rect
- style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-3-3"
- width="84.285713"
- height="41.42857"
- x="519.21423"
- y="454.14792" />
- <rect
- style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-8-7"
- width="84.285713"
- height="41.42857"
- x="526.5"
- y="572.00507" />
- <rect
- style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-3-3-8"
- width="84.285713"
- height="41.42857"
- x="526.35712"
- y="625.57648" />
- <rect
- style="fill:#aafff8;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-8-7-8"
- width="84.285713"
- height="41.42857"
- x="529.35718"
- y="722.00513" />
- <rect
- style="fill:#fcf2cd;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:8, 8;stroke-dashoffset:0;stroke-opacity:1"
- id="rect4187-3-3-8-3"
- width="84.285713"
- height="41.42857"
- x="529.21429"
- y="775.57648" />
- <text
- xml:space="preserve"
- style="font-size:20px;fill:none;stroke:#000000;stroke-opacity:1"
- x="1515.7142"
- y="190.93362"
- id="text4278"><tspan
- sodipodi:role="line"
- id="tspan4280"
- x="1515.7142"
- y="190.93362"></tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="635.71429"
- y="768.07654"
- id="text4290"><tspan
- sodipodi:role="line"
- id="tspan4292"
- x="635.71429"
- y="768.07654">conv1</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="627.14288"
- y="613.79077"
- id="text4294"><tspan
- sodipodi:role="line"
- id="tspan4296"
- x="627.14288"
- y="613.79077">conv2</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="632.85718"
- y="443.79074"
- id="text4298"><tspan
- sodipodi:role="line"
- id="tspan4300"
- x="632.85718"
- y="443.79074">conv3</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="631.42865"
- y="259.50507"
- id="text4302"><tspan
- sodipodi:role="line"
- id="tspan4304"
- x="631.42865"
- y="259.50507">conv4</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="528.57141"
- y="156.64792"
- id="text4306"><tspan
- sodipodi:role="line"
- id="tspan4308"
- x="528.57141"
- y="156.64792">Network layers:</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;stroke:#000000;stroke-opacity:1;fill-opacity:1;"
- x="145.14287"
- y="159.79077"
- id="text4310"><tspan
- sodipodi:role="line"
- x="145.14287"
- y="159.79077"
- id="tspan4314">flattened tensors:</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;"
- x="175.71434"
- y="898.0766"
- id="text4337"><tspan
- sodipodi:role="line"
- id="tspan4339"
- x="175.71434"
- y="898.0766">params tensor</tspan></text>
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-opacity:1;"
- x="288.57147"
- y="815.21936"
- id="text4341"><tspan
- sodipodi:role="line"
- id="tspan4343"
- x="288.57147"
- y="815.21936">gradParams</tspan><tspan
- sodipodi:role="line"
- x="288.57147"
- y="840.21936"
- id="tspan4345">tensor</tspan></text>
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 284.28571,810.93366 228.57143,793.79078"
- id="path4347"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 191.42857,872.36216 180,843.79076"
- id="path4349"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 522.85714,230.93364 185.71429,205.21935"
- id="path4351"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 517.14285,269.50506 187.14286,342.36221"
- id="path4353"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 521.42857,396.64792 187.14286,340.93364"
- id="path4355"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 521.42857,440.93364 185.71429,483.79078"
- id="path4357"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 527.14285,625.21935 225.71428,506.64792"
- id="path4359"
- inkscape:connector-curvature="0" />
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="M 522.85714,666.64792 225.71428,659.50506"
- id="path4361"
- inkscape:connector-curvature="0" />
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;"
- x="801.42853"
- y="649.50513"
- id="text4363"><tspan
- sodipodi:role="line"
- id="tspan4365"
- x="801.42853"
- y="649.50513">conv2 grad weight:</tspan><tspan
- sodipodi:role="line"
- x="801.42853"
- y="674.50513"
- id="tspan4367">view onto flattened gradParams</tspan></text>
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="m 612.85708,640.9336 180,14.2857"
- id="path4375"
- inkscape:connector-curvature="0" />
- <text
- xml:space="preserve"
- style="font-size:20px;fill:#000000;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;"
- x="791.42853"
- y="400.93353"
- id="text4377"><tspan
- sodipodi:role="line"
- id="tspan4379"
- x="791.42853"
- y="400.93353">conv3 weight:</tspan><tspan
- sodipodi:role="line"
- x="791.42853"
- y="425.93353"
- id="tspan4381">view onto flattened params</tspan><tspan
- sodipodi:role="line"
- x="791.42853"
- y="450.93353"
- id="tspan4383">tensor</tspan></text>
- <path
- style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
- d="m 782.85708,403.7907 -180,11.4286"
- id="path4387"
- inkscape:connector-curvature="0" />
- </g>
-</svg>
diff --git a/doc/overview.md b/doc/overview.md
index 25eb092..6db8008 100644
--- a/doc/overview.md
+++ b/doc/overview.md
@@ -137,7 +137,7 @@ function gradUpdate(mlp, x, y, criterion, learningRate)
mlp:updateParameters(learningRate)
end
```
-For example, if you wish to use your own criterion you can simple replace
+For example, if you wish to use your own criterion you can simply replace
`gradCriterion` with the gradient vector of your criterion of choice.
<a name="nn.overview.sharedparams"></a>
@@ -145,7 +145,7 @@ For example, if you wish to use your own criterion you can simple replace
By using `:share(...)` and the Container Modules, one can easily create very
complex architectures. In order to make sure that the network is going to
-train properly, one need to pay attention to the way the sharing is applied,
+train properly, one needs to pay attention to the way the sharing is applied,
because it might depend on the optimization procedure.
* If you are using an optimization algorithm that iterates over the modules
diff --git a/doc/simple.md b/doc/simple.md
index 302e4d8..b7044ae 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -8,6 +8,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
* [Bilinear](#nn.Bilinear) : a bilinear transformation with sparse inputs ;
* [PartialLinear](#nn.PartialLinear) : a linear transformation with sparse inputs with the option of only computing a subset ;
* [Add](#nn.Add) : adds a bias term to the incoming data ;
+ * [CAdd](#nn.CAdd) : a component-wise addition to the incoming data ;
* [Mul](#nn.Mul) : multiply a single scalar factor to the incoming data ;
* [CMul](#nn.CMul) : a component-wise multiplication to the incoming data ;
* [Euclidean](#nn.Euclidean) : the euclidean distance of the input to `k` mean centers ;
@@ -44,6 +45,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
* [MM](#nn.MM) : matrix-matrix multiplication (also supports batches of matrices) ;
* Miscellaneous Modules :
* [BatchNormalization](#nn.BatchNormalization) : mean/std normalization over the mini-batch inputs (with an optional affine transform) ;
+ * [PixelShuffle](#nn.PixelShuffle) : Rearranges elements in a tensor of shape `[C*r, H, W]` to a tensor of shape `[C, H*r, W*r]` ;
* [Identity](#nn.Identity) : forward input as-is to output (useful with [ParallelTable](table.md#nn.ParallelTable)) ;
* [Dropout](#nn.Dropout) : masks parts of the `input` using binary samples from a [bernoulli](http://en.wikipedia.org/wiki/Bernoulli_distribution) distribution ;
* [SpatialDropout](#nn.SpatialDropout) : same as Dropout but for spatial inputs where adjacent pixels are strongly correlated ;
@@ -295,6 +297,7 @@ As described in the paper "Efficient Object Localization Using Convolutional Net
```nn.VolumetricDropout``` accepts 4D or 5D inputs. If the input is 4D than a layout of (features x time x height x width) is assumed and for 5D (batch x features x time x height x width) is assumed.
+
<a name="nn.Abs"></a>
## Abs ##
@@ -322,7 +325,7 @@ gnuplot.grid(true)
module = nn.Add(inputDimension, scalar)
```
-Applies a bias term to the incoming data, i.e. `yi = x_i + b_i`, or if `scalar = true` then uses a single bias term, `yi = x_i + b`.
+Applies a bias term to the incoming data, i.e. `yi = x_i + b_i`, or if `scalar = true` then uses a single bias term, `yi = x_i + b`. So if `scalar = true` then `inputDimension` value will be disregarded.
Example:
@@ -364,6 +367,57 @@ gives the output:
i.e. the network successfully learns the input `x` has been shifted to produce the output `y`.
+<a name='nn.CAdd'></a>
+## CAdd ##
+
+```lua
+module = nn.CAdd(size)
+```
+
+Applies a component-wise addition to the incoming data, i.e. `y_i = x_i + b_i`. Argument `size` can be one or many numbers (sizes) or a `torch.LongStorage`. For example, `nn.CAdd(3,4,5)` is equivalent to `nn.CAdd(torch.LongStorage{3,4,5})`. If the size for a particular dimension is 1, the addition will be expanded along the entire axis.
+
+Example:
+
+```lua
+mlp = nn.Sequential()
+mlp:add(nn.CAdd(5, 1))
+
+y = torch.Tensor(5, 4)
+bf = torch.Tensor(5, 4)
+for i = 1, 5 do bf[i] = i; end -- scale input with this
+
+function gradUpdate(mlp, x, y, criterion, learningRate)
+ local pred = mlp:forward(x)
+ local err = criterion:forward(pred, y)
+ local gradCriterion = criterion:backward(pred, y)
+ mlp:zeroGradParameters()
+ mlp:backward(x, gradCriterion)
+ mlp:updateParameters(learningRate)
+ return err
+end
+
+for i = 1, 10000 do
+ x = torch.rand(5, 4)
+ y:copy(x)
+ y:add(bf)
+ err = gradUpdate(mlp, x, y, nn.MSECriterion(), 0.01)
+end
+
+print(mlp:get(1).bias)
+```
+
+gives the output:
+
+```lua
+ 1.0000
+ 2.0000
+ 3.0000
+ 4.0000
+ 5.0000
+[torch.Tensor of dimension 5x1]
+```
+
+i.e. the network successfully learns the input `x` has been shifted by those bias factors to produce the output `y`.
<a name="nn.Mul"></a>
## Mul ##
@@ -576,7 +630,7 @@ gives the output:
[torch.Tensor of dimension 5x2]
```
-Here is a more useful example, where one can implement a network which also computes a Criterion using this module:
+Here is a more useful example, where one can implement a network which also computes a `Criterion` using this module:
```lua
pred_mlp = nn.Sequential() -- A network that makes predictions given x.
@@ -613,8 +667,10 @@ end
module = nn.Copy(inputType, outputType, [forceCopy, dontCast])
```
-This layer copies the input to output with type casting from `inputType` to `outputType`. Unless `forceCopy` is true, when the first two arguments are the same, the input isn't copied, only transferred as the output. The default `forceCopy` is false.
-When `dontCast` is true, a call to `nn.Copy:type(type)` will not cast the module's `output` and `gradInput` Tensors to the new type. The default is false.
+This layer copies the input to output with type casting from `inputType` to `outputType`. Unless `forceCopy` is true, when the first two arguments are the same, the input isn't copied, only transferred as the output.
+The default `forceCopy` is false.
+When `dontCast` is true, a call to `nn.Copy:type(type)` will not cast the module's `output` and `gradInput` `Tensor`s to the new type.
+The default is false.
<a name="nn.Narrow"></a>
## Narrow ##
@@ -623,7 +679,8 @@ When `dontCast` is true, a call to `nn.Copy:type(type)` will not cast the module
module = nn.Narrow(dimension, offset, length)
```
-Narrow is application of [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation in a module. The module further supports a negative `length` in order to handle inputs with an unknown size.
+Narrow is application of [narrow](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-narrowdim-index-size) operation in a module.
+The module further supports a negative `length` in order to handle inputs with an unknown size.
```lua
> x = torch.rand(4, 5)
@@ -725,9 +782,12 @@ module = nn.Reshape(dimension1, dimension2, ... [, batchMode])
```
-Reshapes an `nxpxqx..` Tensor into a `dimension1xdimension2x...` Tensor, taking the elements row-wise.
+Reshapes an `nxpxqx..` `Tensor` into a `dimension1xdimension2x...` `Tensor`, taking the elements row-wise.
-The optional last argument `batchMode`, when `true` forces the first dimension of the input to be considered the batch dimension, and thus keep its size fixed. This is necessary when dealing with batch sizes of one. When `false`, it forces the entire input (including the first dimension) to be reshaped to the input size. Default `batchMode=nil`, which means that the module considers inputs with more elements than the produce of provided sizes, i.e. `dimension1xdimension2x...`, to be batches.
+The optional last argument `batchMode`, when `true` forces the first dimension of the input to be considered the batch dimension, and thus keep its size fixed.
+This is necessary when dealing with batch sizes of one.
+When `false`, it forces the entire input (including the first dimension) to be reshaped to the input size.
+Default `batchMode=nil`, which means that the module considers inputs with more elements than the product of provided sizes, i.e. `dimension1xdimension2x...`, to be batches.
Example:
@@ -813,7 +873,8 @@ module = nn.View(sizes)
```
This module creates a new view of the input tensor using the `sizes` passed to the constructor. The parameter `sizes` can either be a `LongStorage` or numbers.
-The method `setNumInputDims()` allows to specify the expected number of dimensions of the inputs of the modules. This makes it possible to use minibatch inputs when using a size `-1` for one of the dimensions.
+The method `setNumInputDims()` allows to specify the expected number of dimensions of the inputs of the modules.
+This makes it possible to use minibatch inputs when using a size `-1` for one of the dimensions.
The method `resetSize(sizes)` allows to reset the view size of the module after initialization.
Example 1:
@@ -892,9 +953,12 @@ Example 2:
<a name="nn.Contiguous"></a>
## Contiguous ##
-Is used to make `input`, `gradOutput` or both contiguous, corresponds to
-`torch.contiguous` function. Only does copy and allocation if `input` or
-`gradOutput` is not contiguous, otherwise passes the same tensor.
+```lua
+module = nn.Contiguous()
+```
+
+Is used to make `input`, `gradOutput` or both contiguous, corresponds to `torch.contiguous` function.
+Only does copy and allocation if `input` or `gradOutput` is not contiguous, otherwise passes the same `Tensor`.
<a name="nn.Select"></a>
## Select ##
@@ -903,7 +967,7 @@ Is used to make `input`, `gradOutput` or both contiguous, corresponds to
module = nn.Select(dim, index)
```
-Selects a dimension and index of a `nxpxqx..` Tensor.
+Selects a dimension and index of a `nxpxqx..` `Tensor`.
Example:
@@ -978,7 +1042,8 @@ end
module = nn.MaskedSelect()
```
-Performs a [torch.MaskedSelect](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-maskedselectmask) on a Tensor. The mask is supplied as a tabular argument with the input on the forward and backward passes.
+Performs a [torch.MaskedSelect](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-maskedselectmask) on a `Tensor`.
+The mask is supplied as a tabular argument with the input on the forward and backward passes.
Example:
@@ -1021,7 +1086,7 @@ Gives the output:
module = nn.Index(dim)
```
-Applies the Tensor [index](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-indexdim-index) operation along the given dimension. So
+Applies the `Tensor` [index](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-indexdim-index) operation along the given dimension. So
```lua
nn.Index(dim):forward{t,i}
@@ -1037,7 +1102,7 @@ t:index(dim, i)
```lua
module = nn.Squeeze([dim, numInputDims])
```
-Applies the Tensor [squeeze](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-squeezedim) operation. So
+Applies the `Tensor` [squeeze](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-squeezedim) operation. So
```lua
nn.Squeeze():forward(t)
@@ -1056,7 +1121,7 @@ module = nn.Unsqueeze(pos [, numInputDims])
```
Insert singleton dim (i.e., dimension 1) at position `pos`.
For an `input` with `dim = input:dim()`, there are `dim + 1` possible positions to insert the singleton dimension.
-For example, if `input` is `3` dimensional tensor in size `p x q x r`, then the singleton dim can be inserted at the following `4` positions
+For example, if `input` is `3` dimensional `Tensor` in size `p x q x r`, then the singleton dim can be inserted at the following `4` positions
```
pos = 1: 1 x p x q x r
pos = 2: p x 1 x q x r
@@ -1126,7 +1191,7 @@ t:transpose(dim3, dim4)
module = nn.Exp()
```
-Applies the `exp` function element-wise to the input Tensor, thus outputting a Tensor of the same dimension.
+Applies the `exp` function element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
```lua
ii = torch.linspace(-2, 2)
@@ -1148,7 +1213,7 @@ gnuplot.grid(true)
module = nn.Log()
```
-Applies the `log` function element-wise to the input Tensor, thus outputting a Tensor of the same dimension.
+Applies the `log` function element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
<a name="nn.Square"></a>
@@ -1248,7 +1313,7 @@ print(B) -- output
```lua
module = nn.Normalize(p, [eps])
```
-Normalizes the input Tensor to have unit `L_p` norm. The smoothing parameter `eps` prevents division by zero when the input contains all zero elements (default = `1e-10`).
+Normalizes the input `Tensor` to have unit `L_p` norm. The smoothing parameter `eps` prevents division by zero when the input contains all zero elements (default = `1e-10`).
Input can be 1D or 2D (in which case it's considered as in batch mode)
@@ -1333,6 +1398,37 @@ A = torch.randn(b, m)
C = model:forward(A) -- C will be of size `b x m`
```
+
+<a name="nn.PixelShuffle"></a>
+## PixelShuffle ##
+```module = nn.PixelShuffle(r)```
+
+Rearranges elements in a tensor of shape `[C*r, H, W]` to a tensor of shape `[C, H*r, W*r]`. This is useful for implementing efficient sub-pixel convolution with a stride of `1/r` (see [Shi et. al](https://arxiv.org/abs/1609.05158)). Below we show how the `PixelShuffle` module can be used to learn upscaling filters to transform a low-resolution input to a high resolution one, with a 3x upscale factor. This is useful for tasks such as super-resolution, see ["Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network", Shi et al.](https://arxiv.org/abs/1609.05158) for further details.
+
+```
+upscaleFactor = 3
+inputChannels = 1
+
+model = nn.Sequential()
+model:add(nn.SpatialConvolution(inputChannels, 64, 5, 5, 1, 1, 2, 2))
+model:add(nn.ReLU())
+
+model:add(nn.SpatialConvolution(64, 32, 3, 3, 1, 1, 1, 1))
+model:add(nn.ReLU())
+
+model:add(nn.SpatialConvolution(32, inputChannels * upscaleFactor * upscaleFactor, 3, 3, 1, 1, 1, 1))
+model:add(nn.PixelShuffle(upscaleFactor))
+
+input = torch.Tensor(1, 192, 256);
+out = model:forward(input)
+out:size()
+ 1
+ 576
+ 768
+[torch.LongStorage of size 3]
+```
+
+
<a name="nn.Padding"></a>
## Padding ##
diff --git a/doc/training.md b/doc/training.md
index d21bcc7..165b2d5 100644
--- a/doc/training.md
+++ b/doc/training.md
@@ -6,8 +6,9 @@ use the `optim` optimizer, which implements some cool functionalities, like Nest
[adagrad](https://github.com/torch/optim/blob/master/doc/index.md#x-adagradopfunc-x-config-state) and
[adam](https://github.com/torch/optim/blob/master/doc/index.md#x-adamopfunc-x-config-state).
-We will demonstrate using a for-loop first, to show the low-level view of what happens in training, and then
-we will show how to train using `optim`.
+We will demonstrate using a for-loop first, to show the low-level view of what happens in training. [StochasticGradient](#nn.StochasticGradient), a simple class
+which does the job for you, is provided as standard. Finally, [`optim`](https://github.com/torch/optim) is a powerful module,
+that provides multiple optimization algorithms.
<a name="nn.DoItYourself"></a>
## Example of manual training of a neural network ##
@@ -95,200 +96,136 @@ You should see something like:
[torch.Tensor of dimension 1]
```
-<a name="nn.DoItYourself"></a>
-## Training using optim ##
-[optim](https://github.com/torch/optim) is the standard way of training Torch7 neural networks.
+<a name="nn.StochasticGradient.dok"></a>
+## StochasticGradient ##
-`optim` is a quite general optimizer, for minimizing any function with respect to a set
-of parameters. In our case, our
-function will be the loss of our network, given an input, and a set of weights. The goal of training
-a neural net is to
-optimize the weights to give the lowest loss over our training set of input data. So, we are going to use optim
-to minimize the loss with respect to the weights, over our training set. We will feed the data to
-`optim` in minibatches. For this particular example, we will use just one minibatch, but in your own training
-you will almost certainly want to break your training set into minibatches, and feed each minibatch to `optim`,
-one by one.
+`StochasticGradient` is a high-level class for training [neural networks](#nn.Module), using a stochastic gradient
+algorithm. This class is [serializable](https://github.com/torch/torch7/blob/master/doc/serialization.md#serialization).
-We need to give `optim` a function that will output the loss and the derivative of the loss with respect to the
-weights, given the current weights, as a function parameter. The function will have access to our training minibatch, and use this
-to calculate the loss, for this minibatch. Typically, the function would be defined inside our loop over
-batches, and therefore have access to the current minibatch data.
+<a name="nn.StochasticGradient"></a>
+### StochasticGradient(module, criterion) ###
-Here's how this looks:
+Create a `StochasticGradient` class, using the given [Module](module.md#nn.Module) and [Criterion](criterion.md#nn.Criterion).
+The class contains [several parameters](#nn.StochasticGradientParameters) you might want to set after initialization.
-__Neural Network__
+<a name="nn.StochasticGradientTrain"></a>
+### train(dataset) ###
-We create a simple neural network with one hidden layer.
-```lua
-require 'nn'
+Train the module and criterion given in the
+[constructor](#nn.StochasticGradient) over `dataset`, using the
+internal [parameters](#nn.StochasticGradientParameters).
-local model = nn.Sequential(); -- make a multi-layer perceptron
-local inputs = 2; local outputs = 1; local HUs = 20; -- parameters
-model:add(nn.Linear(inputs, HUs))
-model:add(nn.Tanh())
-model:add(nn.Linear(HUs, outputs))
-```
+StochasticGradient expects as a `dataset` an object which implements the operator
+`dataset[index]` and implements the method `dataset:size()`. The `size()` method
+returns the number of examples and `dataset[i]` has to return the i-th example.
-__Criterion__
+An `example` has to be an object which implements the operator
+`example[field]`, where `field` might take the value `1` (input features)
+or `2` (corresponding label which will be given to the criterion).
+The input is usually a Tensor (except if you use special kind of gradient modules,
+like [table layers](table.md#nn.TableLayers)). The label type depends on the criterion.
+For example, the [MSECriterion](criterion.md#nn.MSECriterion) expects a Tensor, but the
+[ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion) expects an integer number (the class).
-We choose the Mean Squared Error loss criterion:
-```lua
-local criterion = nn.MSECriterion()
-```
+Such a dataset is easily constructed by using Lua tables, but it could be any `C` object,
+for example, as long as the required operators/methods are implemented.
+[See an example](#nn.DoItStochasticGradient).
-We are using an `nn.MSECriterion` because we are training on a regression task, predicting float target values.
-For a classification task, we would add an `nn.LogSoftMax()` layer to the end of our
-network, and use a `nn.ClassNLLCriterion` loss criterion.
+<a name="nn.StochasticGradientParameters"></a>
+### Parameters ###
-__Dataset__
+`StochasticGradient` has several fields which have an impact on a call to [train()](#nn.StochasticGradientTrain).
-We will just create one minibatch of 128 examples. In your own networks, you'd want to break down your
-rather larger dataset into multiple minibatches, of around 32-512 examples each.
+ * `learningRate`: This is the learning rate used during training. The update of the parameters will be `parameters = parameters - learningRate * parameters_gradient`. Default value is `0.01`.
+ * `learningRateDecay`: The learning rate decay. If non-zero, the learning rate (note: the field learningRate will not change value) will be computed after each iteration (pass over the dataset) with: `current_learning_rate = learningRate / (1 + iteration * learningRateDecay)`
+ * `maxIteration`: The maximum number of iterations (passes over the dataset). Default is `25`.
+ * `shuffleIndices`: Boolean which says if the examples will be randomly sampled or not. Default is `true`. If `false`, the examples will be taken in the order of the dataset.
+ * `hookExample`: A possible hook function which will be called (if non-nil) during training after each example forwarded and backwarded through the network. The function takes `(self, example)` as parameters. Default is `nil`.
+ * `hookIteration`: A possible hook function which will be called (if non-nil) during training after a complete pass over the dataset. The function takes `(self, iteration, currentError)` as parameters. Default is `nil`.
-```lua
-local batchSize = 128
-local batchInputs = torch.Tensor(batchSize, inputs)
-local batchLabels = torch.DoubleTensor(batchSize)
+<a name="nn.DoItStochasticGradient"></a>
+## Example of training using StochasticGradient ##
+
+We show an example here on a classical XOR problem.
+
+__Dataset__
-for i=1,batchSize do
- local input = torch.randn(2) -- normally distributed example in 2d
- local label = 1
+We first need to create a dataset, following the conventions described in
+[StochasticGradient](#nn.StochasticGradientTrain).
+```lua
+dataset={};
+function dataset:size() return 100 end -- 100 examples
+for i=1,dataset:size() do
+ local input = torch.randn(2); -- normally distributed example in 2d
+ local output = torch.Tensor(1);
if input[1]*input[2]>0 then -- calculate label for XOR function
- label = -1;
+ output[1] = -1;
+ else
+ output[1] = 1
end
- batchInputs[i]:copy(input)
- batchLabels[i] = label
+ dataset[i] = {input, output}
end
```
-__Flatten Parameters__
-
-`optim` expects the parameters that are to be optimized, and their gradients, to be one-dimensional tensors.
-But, our network model contains probably multiple modules, typically multiple convolutional layers, and each
-of these layers has their own weight and bias tensors. How to handle this?
-
-It is simple: we can call a standard method `:getParameters()`, that is defined for any network module. When
-we call this method, the following magic will happen:
-- a new tensor will be created, large enough to hold all the weights and biases of the entire network model
-- the model weight and bias tensors are replaced with views onto the new contiguous parameter tensor
-- and the exact same thing will happen for all the gradient tensors: replaced with views onto one single
-contiguous gradient tensor
+__Neural Network__
-We can call this method as follows:
+We create a simple neural network with one hidden layer.
```lua
-local params, gradParams = model:getParameters()
+require "nn"
+mlp = nn.Sequential(); -- make a multi-layer perceptron
+inputs = 2; outputs = 1; HUs = 20; -- parameters
+mlp:add(nn.Linear(inputs, HUs))
+mlp:add(nn.Tanh())
+mlp:add(nn.Linear(HUs, outputs))
```
-These flattened tensors have the following characteristics:
-- to `optim`, the parameters it needs to optimize are all contained in one single one-dimensional tensor
-- when `optim` optimizes the parameters in this large one-dimensional tensor, it is implicitly optimizing
-the weights and biases in our network model, since those are now simply views onto this large one-dimensional
-parameter tensor.
-
-It will look something like this:
-
-![Parameter Flattening](image/parameterflattening.png?raw=true "Parameter Flattening")
-
-Note that flattening the parameters redefines the weight and bias tensors for all the network modules
-in our network model. Therefore, any pre-existing references to the original model layer weight and bias tensors
-will no longer point to the model weight and bias tensors, after flattening.
-
__Training__
-Now that we have created our model, our training set, and prepared the flattened network parameters,
-we can run training, using `optim`. `optim` provides [various training algorithms](https://github.com/torch/optim/blob/master/doc/index.md). We
-will use the stochastic gradient descent algorithm [sgd](https://github.com/torch/optim/blob/master/doc/index.md#x-sgdopfunc-x-state). We
-need to provide the learning rate, via an optimization state table:
-
+We choose the Mean Squared Error criterion and train the dataset.
```lua
-local optimState = {learningRate=0.01}
+criterion = nn.MSECriterion()
+trainer = nn.StochasticGradient(mlp, criterion)
+trainer.learningRate = 0.01
+trainer:train(dataset)
```
-We define an evaluation function, inside our training loop, and use `optim.sgd` to run training:
-```lua
-require 'optim'
-
-for epoch=1,50 do
- -- local function we give to optim
- -- it takes current weights as input, and outputs the loss
- -- and the gradient of the loss with respect to the weights
- -- gradParams is calculated implicitly by calling 'backward',
- -- because the model's weight and bias gradient tensors
- -- are simply views onto gradParams
- local function feval(params)
- gradParams:zero()
-
- local outputs = model:forward(batchInputs)
- local loss = criterion:forward(outputs, batchLabels)
- local dloss_doutput = criterion:backward(outputs, batchLabels)
- model:backward(batchInputs, dloss_doutput)
-
- return loss,gradParams
- end
- optim.sgd(feval, params, optimState)
-end
-```
__Test the network__
-For the prediction task, we will also typically use minibatches, although we can run prediction sample by
-sample too. In this example, we will predict sample by sample. To run prediction on a minibatch, simply
-pass in a tensor with one additional dimension, which represents the sample index.
-
```lua
x = torch.Tensor(2)
-x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
-x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
-x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
-x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
+x[1] = 0.5; x[2] = 0.5; print(mlp:forward(x))
+x[1] = 0.5; x[2] = -0.5; print(mlp:forward(x))
+x[1] = -0.5; x[2] = 0.5; print(mlp:forward(x))
+x[1] = -0.5; x[2] = -0.5; print(mlp:forward(x))
```
You should see something like:
```lua
> x = torch.Tensor(2)
-> x[1] = 0.5; x[2] = 0.5; print(model:forward(x))
+> x[1] = 0.5; x[2] = 0.5; print(mlp:forward(x))
-0.3490
[torch.Tensor of dimension 1]
-> x[1] = 0.5; x[2] = -0.5; print(model:forward(x))
+> x[1] = 0.5; x[2] = -0.5; print(mlp:forward(x))
1.0561
[torch.Tensor of dimension 1]
-> x[1] = -0.5; x[2] = 0.5; print(model:forward(x))
+> x[1] = -0.5; x[2] = 0.5; print(mlp:forward(x))
0.8640
[torch.Tensor of dimension 1]
-> x[1] = -0.5; x[2] = -0.5; print(model:forward(x))
+> x[1] = -0.5; x[2] = -0.5; print(mlp:forward(x))
-0.2941
[torch.Tensor of dimension 1]
```
-If we were running on a GPU, we would probably want to predict using minibatches, because this will
-hide the latencies involved in transferring data from main memory to the GPU. To predict
-on a minbatch, we could do something like:
-
-```lua
-local x = torch.Tensor({
- {0.5, 0.5},
- {0.5, -0.5},
- {-0.5, 0.5},
- {-0.5, -0.5}
-})
-print(model:forward(x))
-```
-You should see something like:
-```lua
-> print(model:forward(x))
- -0.3490
- 1.0561
- 0.8640
- -0.2941
-[torch.Tensor of size 4]
-```
-That's it! For minibatched prediction, the output tensor contains one value for each of our input data samples.
+<a name="nn.optim"></a>
+## Using optim to train a network ##
+[`optim`](https://github.com/torch/optim) is a powerful module, that provides multiple optimization algorithms.
diff --git a/doc/transfer.md b/doc/transfer.md
index 358ea7e..3d2d034 100644
--- a/doc/transfer.md
+++ b/doc/transfer.md
@@ -1,405 +1,607 @@
<a name="nn.transfer.dok"></a>
# Transfer Function Layers #
-Transfer functions are normally used to introduce a non-linearity after a parameterized layer like [Linear](simple.md#nn.Linear) and [SpatialConvolution](convolution.md#nn.SpatialConvolution). Non-linearities allows for dividing the problem space into more complex regions than what a simple logistic regressor would permit.
+
+Transfer functions are normally used to introduce a non-linearity after a parameterized layer like [`Linear`](simple.md#nn.Linear) and [`SpatialConvolution`](convolution.md#nn.SpatialConvolution).
+Non-linearities allow for dividing the problem space into more complex regions than what a simple logistic regressor would permit.
+
<a name="nn.HardTanh"></a>
## HardTanh ##
-Applies the `HardTanh` function element-wise to the input Tensor,
-thus outputting a Tensor of the same dimension.
+```lua
+f = nn.HardTanh([min_value, max_value[, inplace]])
+```
+
+Applies the `HardTanh` function element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
`HardTanh` is defined as:
- * `f(x)` = `1, if x >` `1,`
- * `f(x)` = `-1, if x <` `-1,`
- * `f(x)` = `x,` `otherwise.`
-
-The range of the linear region `[-1 1]` can be adjusted by specifying arguments in declaration, for example `nn.HardTanh(min_value, max_value)`.
-Otherwise, `[min_value max_value]` is set to `[-1 1]` by default. In-place operation defined by third argument boolean.
+```lua
+ ⎧ 1, if x > 1
+f(x) = ⎨ -1, if x < -1
+ ⎩ x, otherwise
+```
+The range of the linear region `[-1 1]` can be adjusted by specifying arguments in declaration, for example `nn.HardTanh(min_value, max_value)`.
+Otherwise, `[min_value max_value]` is set to `[-1 1]` by default.
+In-place operation defined by third argument boolean.
```lua
-ii=torch.linspace(-2,2)
-m=nn.HardTanh()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-2, 2)
+m = nn.HardTanh()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/htanh.png)
<a name="nn.HardShrink"></a>
## HardShrink ##
-`module = nn.HardShrink(lambda)`
+```lua
+f = nn.HardShrink([lambda])
+```
-Applies the hard shrinkage function element-wise to the input
-[Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md). The output is the same size as the input.
+Applies the hard shrinkage function element-wise to the input `Tensor`.
+`lambda` is set to `0.5` by default.
`HardShrinkage` operator is defined as:
- * `f(x) = x, if x > lambda`
- * `f(x) = x, if x < -lambda`
- * `f(x) = 0, otherwise`
+```lua
+ ⎧ x, if x > lambda
+f(x) = ⎨ x, if x < -lambda
+ ⎩ 0, otherwise
+```
```lua
-ii=torch.linspace(-2,2)
-m=nn.HardShrink(0.85)
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-2, 2)
+m = nn.HardShrink(0.85)
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/hshrink.png)
+
<a name="nn.SoftShrink"></a>
## SoftShrink ##
-`module = nn.SoftShrink(lambda)`
+```lua
+f = nn.SoftShrink([lambda])
+```
-Applies the soft shrinkage function element-wise to the input
-[Tensor](https://github.com/torch/torch7/blob/master/doc/tensor.md). The output is the same size as the input.
+Applies the soft shrinkage function element-wise to the input `Tensor`.
+`lambda` is set to `0.5` by default.
`SoftShrinkage` operator is defined as:
- * `f(x) = x-lambda, if x > lambda`
- * `f(x) = x+lambda, if x < -lambda`
- * `f(x) = 0, otherwise`
+```lua
+ ⎧ x - lambda, if x > lambda
+f(x) = ⎨ x + lambda, if x < -lambda
+ ⎩ 0, otherwise
+```
```lua
-ii=torch.linspace(-2,2)
-m=nn.SoftShrink(0.85)
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-2, 2)
+m = nn.SoftShrink(0.85)
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/sshrink.png)
<a name="nn.SoftMax"></a>
## SoftMax ##
-Applies the `Softmax` function to an n-dimensional input Tensor,
-rescaling them so that the elements of the n-dimensional output Tensor
-lie in the range (0,1) and sum to 1.
+```lua
+f = nn.SoftMax()
+```
-`Softmax` is defined as `f_i(x)` = `exp(x_i-shift) / sum_j exp(x_j-shift)`,
-where `shift` = `max_i x_i`.
+Applies the `SoftMax` function to an n-dimensional input `Tensor`, rescaling them so that the elements of the n-dimensional output Tensor
+lie in the range `(0, 1)` and sum to `1`.
+`Softmax` is defined as:
```lua
-ii=torch.exp(torch.abs(torch.randn(10)))
-m=nn.SoftMax()
-oo=m:forward(ii)
-gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'})
+f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift)
+```
+
+where `shift = max_i(x_i)`.
+
+
+```lua
+ii = torch.exp(torch.abs(torch.randn(10)))
+m = nn.SoftMax()
+oo = m:forward(ii)
+gnuplot.plot({'Input', ii, '+-'}, {'Output', oo, '+-'})
gnuplot.grid(true)
```
+
![](image/softmax.png)
-Note that this module doesn't work directly with [ClassNLLCriterion](criterion.md#nn.ClassNLLCriterion), which expects the `nn.Log` to be computed between the `SoftMax` and itself. Use [LogSoftMax](#nn.LogSoftMax) instead (it's faster).
+Note that this module doesn't work directly with [`ClassNLLCriterion`](criterion.md#nn.ClassNLLCriterion), which expects the `nn.Log` to be computed between the `SoftMax` and itself.
+Use [`LogSoftMax`](#nn.LogSoftMax) instead (it's faster).
+
<a name="nn.SoftMin"></a>
## SoftMin ##
-Applies the `Softmin` function to an n-dimensional input Tensor,
-rescaling them so that the elements of the n-dimensional output Tensor
-lie in the range (0,1) and sum to 1.
+```lua
+f = nn.SoftMin()
+```
+
+Applies the `SoftMin` function to an n-dimensional input `Tensor`, rescaling them so that the elements of the n-dimensional output `Tensor` lie in the range `(0,1)` and sum to `1`.
-`Softmin` is defined as `f_i(x)` = `exp(-x_i-shift) / sum_j exp(-x_j-shift)`,
-where `shift` = `max_i -x_i`.
+`Softmin` is defined as:
+
+```lua
+f_i(x) = exp(-x_i - shift) / sum_j exp(-x_j - shift)
+```
+where `shift = max_i(-x_i)`.
```lua
-ii=torch.exp(torch.abs(torch.randn(10)))
-m=nn.SoftMin()
-oo=m:forward(ii)
-gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'})
+ii = torch.exp(torch.abs(torch.randn(10)))
+m = nn.SoftMin()
+oo = m:forward(ii)
+gnuplot.plot({'Input', ii, '+-'}, {'Output', oo, '+-'})
gnuplot.grid(true)
```
+
![](image/softmin.png)
+
<a name="nn.SoftPlus"></a>
### SoftPlus ###
-Applies the `SoftPlus` function to an n-dimensioanl input Tensor.
-`SoftPlus` is a smooth approximation to the [ReLU](#nn.ReLU) function and can be used to constrain the output of a machine to always be positive. For numerical stability the implementation reverts to the linear function for inputs above a certain value (20 by default).
+```lua
+f = nn.SoftPlus()
+```
+
+Applies the `SoftPlus` function to an n-dimensional input `Tensor`.
+`SoftPlus` is a smooth approximation to the [`ReLU`](#nn.ReLU) function and can be used to constrain the output of a machine to always be positive.
+For numerical stability the implementation reverts to the linear function for inputs above a certain value (20 by default).
+
+`SoftPlus` is defined as:
-`SoftPlus` is defined as `f_i(x)` = `1/beta * log(1 + exp(beta * x_i))`.
+```lua
+f_i(x) = 1/beta * log(1 + exp(beta * x_i))
+```
```lua
-ii=torch.linspace(-3,3)
-m=nn.SoftPlus()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-3, 3)
+m = nn.SoftPlus()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/softplus.png)
+
<a name="nn.SoftSign"></a>
## SoftSign ##
-Applies the `SoftSign` function to an n-dimensioanl input Tensor.
+```lua
+f = nn.SoftSign()
+```
+
+Applies the `SoftSign` function to an n-dimensional input `Tensor`.
+
+`SoftSign` is defined as:
-`SoftSign` is defined as `f_i(x) = x_i / (1+|x_i|)`
+```lua
+f_i(x) = x_i / (1+|x_i|)
+```
```lua
-ii=torch.linspace(-5,5)
-m=nn.SoftSign()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-5, 5)
+m = nn.SoftSign()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/softsign.png)
+
<a name="nn.LogSigmoid"></a>
## LogSigmoid ##
-Applies the `LogSigmoid` function to an n-dimensional input Tensor.
+```lua
+f = nn.LogSigmoid()
+```
-`LogSigmoid` is defined as `f_i(x)` = `log(1/(1+ exp(-x_i)))`.
+Applies the `LogSigmoid` function to an n-dimensional input `Tensor`.
+`LogSigmoid` is defined as:
```lua
-ii=torch.randn(10)
-m=nn.LogSigmoid()
-oo=m:forward(ii)
-go=torch.ones(10)
-gi=m:backward(ii,go)
-gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'},{'gradInput',gi,'+-'})
+f_i(x) = log(1 / (1 + exp(-x_i)))
+```
+
+```lua
+ii = torch.randn(10)
+m = nn.LogSigmoid()
+oo = m:forward(ii)
+go = torch.ones(10)
+gi = m:backward(ii, go)
+gnuplot.plot({'Input', ii, '+-'}, {'Output', oo, '+-'}, {'gradInput', gi, '+-'})
gnuplot.grid(true)
```
+
![](image/logsigmoid.png)
<a name="nn.LogSoftMax"></a>
## LogSoftMax ##
-Applies the `LogSoftMax` function to an n-dimensional input Tensor.
+```lua
+f = nn.LogSoftMax()
+```
+
+Applies the `LogSoftMax` function to an n-dimensional input `Tensor`.
-`LogSoftmax` is defined as `f_i(x)` = `log(1/a exp(x_i))`,
-where `a` = `sum_j exp(x_j)`.
+`LogSoftmax` is defined as:
```lua
-ii=torch.randn(10)
-m=nn.LogSoftMax()
-oo=m:forward(ii)
-go=torch.ones(10)
-gi=m:backward(ii,go)
-gnuplot.plot({'Input',ii,'+-'},{'Output',oo,'+-'},{'gradInput',gi,'+-'})
+f_i(x) = log(1 / a exp(x_i))
+```
+
+where `a = sum_j[exp(x_j)]`.
+
+```lua
+ii = torch.randn(10)
+m = nn.LogSoftMax()
+oo = m:forward(ii)
+go = torch.ones(10)
+gi = m:backward(ii, go)
+gnuplot.plot({'Input', ii, '+-'}, {'Output', oo, '+-'}, {'gradInput', gi, '+-'})
gnuplot.grid(true)
```
+
![](image/logsoftmax.png)
+
<a name="nn.Sigmoid"></a>
## Sigmoid ##
-Applies the `Sigmoid` function element-wise to the input Tensor,
-thus outputting a Tensor of the same dimension.
+```lua
+f = nn.Sigmoid()
+```
+
+Applies the `Sigmoid` function element-wise to the input `Tensor`, thus outputting a Tensor of the same dimension.
-`Sigmoid` is defined as `f(x)` = `1/(1+exp(-x))`.
+`Sigmoid` is defined as:
+
+```lua
+f(x) = 1 / (1 + exp(-x))
+```
```lua
-ii=torch.linspace(-5,5)
-m=nn.Sigmoid()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-5, 5)
+m = nn.Sigmoid()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/sigmoid.png)
+
<a name="nn.Tanh"></a>
## Tanh ##
-Applies the `Tanh` function element-wise to the input Tensor,
-thus outputting a Tensor of the same dimension.
+```lua
+f = nn.Tanh()
+```
-`Tanh` is defined as `f(x)` = `(exp(x)-exp(-x))/(exp(x)+exp(-x))`.
+Applies the `Tanh` function element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
+
+`Tanh` is defined as:
+
+```lua
+f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+```
```lua
-ii=torch.linspace(-3,3)
-m=nn.Tanh()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-3, 3)
+m = nn.Tanh()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/tanh.png)
+
<a name="nn.ReLU"></a>
## ReLU ##
-Applies the rectified linear unit (`ReLU`) function element-wise to the input Tensor,
-thus outputting a Tensor of the same dimension.
+```lua
+f = nn.ReLU([inplace])
+```
+
+Applies the rectified linear unit (`ReLU`) function element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
+
+`ReLU` is defined as:
-`ReLU` is defined as `f(x)` = `max(0,x)`
+```lua
+f(x) = max(0, x)
+```
Can optionally do its operation in-place without using extra state memory:
+
```lua
-m=nn.ReLU(true) -- true = in-place, false = keeping separate state.
+f = nn.ReLU(true) -- true = in-place, false = keeping separate state.
```
```lua
-ii=torch.linspace(-3,3)
-m=nn.ReLU()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-3, 3)
+m = nn.ReLU()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/relu.png)
+
<a name="nn.ReLU6"></a>
## ReLU6 ##
-Same as `ReLU` except that the rectifying function `f(x)` saturates at `x = 6`. This layer is useful for training networks that do not loose precision (due to FP saturation) when implemented as FP16.
+```lua
+f = nn.ReLU6([inplace])
+```
+
+Same as `ReLU` except that the rectifying function `f(x)` saturates at `x = 6`.
+This layer is useful for training networks that do not lose precision (due to FP saturation) when implemented as FP16.
-`ReLU6` is defined as `f(x)` = `min(max(0, x), 6)`
+`ReLU6` is defined as:
+
+```lua
+f(x) = min(max(0, x), 6)
+```
Can optionally do its operation in-place without using extra state memory:
+
```lua
-m=nn.ReLU6(true) -- true = in-place, false = keeping separate state.
+f = nn.ReLU6(true) -- true = in-place, false = keeping separate state.
```
```lua
-ii=torch.linspace(-3, 9)
-m=nn.ReLU6()
-oo=m:forward(ii)
-go=torch.ones(100)
-gi=m:backward(ii,go)
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-3, 9)
+m = nn.ReLU6()
+oo = m:forward(ii)
+go = torch.ones(100)
+gi = m:backward(ii, go)
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/relu6.png)
+
<a name="nn.PReLU"></a>
## PReLU ##
-Applies parametric ReLU, which parameter varies the slope of the negative part:
+```lua
+f = nn.PReLU()
+```
+
+Applies parametric `ReLU`, whose parameter varies the slope of the negative part:
+
+`PReLU` is defined as:
-`PReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`
+```lua
+f(x) = max(0, x) + a * min(0, x)
+```
-When called without a number on input as ```nn.PReLU()``` uses shared version, meaning
-has only one parameter. Otherwise if called ```nn.PReLU(nOutputPlane)``` has ```nOutputPlane```
-parameters, one for each input map. The output dimension is always equal to input dimension.
-Note that weight decay should not be used on it. For reference see [Delving Deep into Rectifiers](http://arxiv.org/abs/1502.01852).
+When called without a number on input as `nn.PReLU()` uses shared version, meaning has only one parameter.
+Otherwise if called `nn.PReLU(nOutputPlane)` has `nOutputPlane` parameters, one for each input map.
+The output dimension is always equal to input dimension.
+Note that weight decay should not be used on it.
+For reference see [Delving Deep into Rectifiers](http://arxiv.org/abs/1502.01852).
![](image/prelu.png)
+
<a name="nn.RReLU"></a>
## RReLU ##
-Applies the randomized leaky rectified linear unit (RReLU) element-wise to the input tensor, thus outputting a tensor of the same dimension. Informally the RReLU is also known as 'insanity' layer.
+```lua
+f = nn.RReLU([l, u[, inplace]])
+```
-`RReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`, where `a` ~ `U(l,u)`.
+Applies the randomized leaky rectified linear unit (`RReLU`) element-wise to the input `Tensor`, thus outputting a `Tensor` of the same dimension.
+Informally the `RReLU` is also known as 'insanity' layer.
-In training mode negative inputs are multiplied by a factor `a` drawn from a uniform random distribution `U(l, u)`. In evaluation mode a RReLU behaves like a LeakyReLU with a constant mean factor `a` = `(l+u)/2`.
+`RReLU` is defined as:
-Syntax:
```lua
-m=nn.ReLU(
- l, -- minimum factor for negative inputs, default: 1/8;
- u, -- maximum factor for negative inputs, default: 1/3;
- inplace -- if true the result will be written to the input tensor, default: false;
-)
+f(x) = max(0, x) + a * min(0, x)
```
-If `l == u` a RReLU effectively becomes a LeakyReLU. Regardless of operating in in-place mode a RReLU will internally allocate an input-sized `noise` tensor to store random factors for negative inputs. The backward() operation assumes that forward() has been called before.
+
+where `a ~ U(l, u)`.
+
+In training mode negative inputs are multiplied by a factor `a` drawn from a uniform random distribution `U(l, u)`.
+In evaluation mode a `RReLU` behaves like a `LeakyReLU` with a constant mean factor `a = (l + u) / 2`.
+By default, `l = 1/8` and `u = 1/3`.
+If `l == u` a `RReLU` effectively becomes a `LeakyReLU`.
+Regardless of operating in in-place mode a `RReLU` will internally allocate an input-sized `noise` tensor to store random factors for negative inputs.
+The `backward()` operation assumes that `forward()` has been called before.
For reference see [Empirical Evaluation of Rectified Activations in Convolutional Network](http://arxiv.org/abs/1505.00853).
+
```lua
-ii=torch.linspace(-3, 3)
-m=nn.RReLU()
-oo=m:forward(ii):clone()
-gi=m:backward(ii,torch.ones(100))
-gnuplot.plot({'f(x)',ii,oo,'+-'},{'df/dx',ii,gi,'+-'})
+ii = torch.linspace(-3, 3)
+m = nn.RReLU()
+oo = m:forward(ii):clone()
+gi = m:backward(ii, torch.ones(100))
+gnuplot.plot({'f(x)', ii, oo, '+-'}, {'df/dx', ii, gi, '+-'})
gnuplot.grid(true)
```
+
![](image/rrelu.png)
+
<a name="nn.ELU"></a>
## ELU ##
-Applies exponential linear unit (ELU), which parameter a varies the convergence value of the exponential function below zero:
+```lua
+f = nn.ELU([alpha[, inplace]])
+```
-`ELU` is defined as `f(x)` = `max(0,x) + min(0,a*(exp(x)-1))`
+Applies exponential linear unit (`ELU`), whose parameter `alpha` varies the convergence value of the exponential function below zero:
-It is called with the parameter a as ```nn.ELU(a)``` with the default value `a=1`. The output dimension is always equal to input dimension.
+`ELU` is defined as:
-For reference see [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](http://arxiv.org/abs/1511.07289).
```lua
-require 'nn'
-require 'gnuplot'
+f(x) = max(0, x) + min(0, alpha * (exp(x) - 1))
+```
+
+The output dimension is always equal to input dimension.
+
+For reference see [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)](http://arxiv.org/abs/1511.07289).
-xs = torch.linspace(-3,3,200)
+```lua
+xs = torch.linspace(-3, 3, 200)
go = torch.ones(xs:size(1))
function f(a) return nn.ELU(a):forward(xs) end
function df(a) local m = nn.ELU(a) m:forward(xs) return m:backward(xs, go) end
-gnuplot.plot({'fw ELU, alpha=0.1', xs, f(0.1), '-'},
- {'fw ELU, alpha=1.0', xs, f(1.0), '-'},
- {'bw ELU, alpha=0.1', xs, df(0.1), '-'},
- {'bw ELU, alpha=1.0', xs, df(1.0), '-'})
+gnuplot.plot({'fw ELU, alpha = 0.1', xs, f(0.1), '-'},
+ {'fw ELU, alpha = 1.0', xs, f(1.0), '-'},
+ {'bw ELU, alpha = 0.1', xs, df(0.1), '-'},
+ {'bw ELU, alpha = 1.0', xs, df(1.0), '-'})
gnuplot.grid(true)
```
+
![](image/elu.png)
+
<a name="nn.LeakyReLU"></a>
## LeakyReLU ##
-Applies Leaky ReLU, which parameter `a` sets the slope of the negative part:
+```lua
+f = nn.LeakyReLU([negval[, inplace]])
+```
+
+Applies `LeakyReLU`, whose parameter `negval` sets the slope of the negative part:
+
+`LeakyReLU` is defined as:
-`LeakyReLU` is defined as `f(x)` = `max(0,x) + a * min(0,x)`
+```lua
+f(x) = max(0, x) + negval * min(0, x)
+```
Can optionally do its operation in-place without using extra state memory:
```lua
-m=nn.LeakyReLU(a,true) -- true = in-place, false = keeping separate state.
+f = nn.LeakyReLU(negval, true) -- true = in-place, false = keeping separate state.
```
+
<a name="nn.SpatialSoftMax"></a>
## SpatialSoftMax ##
-Applies [SoftMax](#nn.SoftMax) over features to each spatial location (height x width of planes).
+```lua
+f = nn.SpatialSoftMax()
+```
+
+Applies [`SoftMax`](#nn.SoftMax) over features to each spatial location (height x width of planes).
+The module accepts 1D (vector), 2D (batch of vectors), 3D (vectors in space) or 4D (batch of vectors in space) `Tensor` as input.
+Functionally it is equivalent to [`SoftMax`](#nn.SoftMax) when 1D or 2D input is used.
+The output dimension is always the same as input dimension.
+
+```lua
+ii = torch.randn(4, 8, 16, 16) -- batchSize x features x height x width
+m = nn.SpatialSoftMax()
+oo = m:forward(ii)
+```
+
+<a name="nn.SpatialLogSoftMax"></a>
+## SpatialLogSoftMax ##
+
+Applies [LogSoftMax](#nn.LogSoftMax) over features to each spatial location (height x width of planes).
The module accepts 1D (vector), 2D (batch of vectors), 3D (vectors in space) or 4D (batch of vectors in space) tensor as input.
-Functionally it is equivalent to [SoftMax](#nn.SoftMax) when 1D or 2D input is used.
+Functionally it is equivalent to [LogSoftMax](#nn.LogSoftMax) when 1D or 2D input is used.
The output dimension is always the same as input dimension.
```lua
ii=torch.randn(4,8,16,16) -- batchSize x features x height x width
-m=nn.SpatialSoftMax()
+m=nn.SpatialLogSoftMax()
oo = m:forward(ii)
```
<a name="nn.AddConstant"></a>
## AddConstant ##
-Adds a (non-learnable) scalar constant. This module is sometimes useful for debugging purposes: `f(x)` = `x + k`, where `k` is a scalar.
+```lua
+f = nn.AddConstant(k[, inplace])
+```
+
+Adds a (non-learnable) scalar constant.
+This module is sometimes useful for debugging purposes.
+Its transfer function is:
+
+```lua
+f(x) = x + k
+```
+
+where `k` is a scalar.
Can optionally do its operation in-place without using extra state memory:
+
```lua
-m=nn.AddConstant(k,true) -- true = in-place, false = keeping separate state.
+f = nn.AddConstant(k, true) -- true = in-place, false = keeping separate state.
```
-In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [MulConstant](#nn.MulConstant).
+
+In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [`MulConstant`](#nn.MulConstant).
+
<a name="nn.MulConstant"></a>
## MulConstant ##
-Multiplies input tensor by a (non-learnable) scalar constant. This module is sometimes useful for debugging purposes: `f(x)` = `k * x`, where `k` is a scalar.
+```lua
+f = nn.MulConstant(k[, inplace])
+```
+
+Multiplies input `Tensor` by a (non-learnable) scalar constant.
+This module is sometimes useful for debugging purposes.
+Its transfer function is:
+
+```lua
+f(x) = k * x
+```
+
+where `k` is a scalar.
Can optionally do its operation in-place without using extra state memory:
+
```lua
-m=nn.MulConstant(k,true) -- true = in-place, false = keeping separate state.
+m = nn.MulConstant(k, true) -- true = in-place, false = keeping separate state.
```
-In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [AddConstant](#nn.AddConstant).
+
+In-place mode restores the original input value after the backward pass, allowing its use after other in-place modules, like [`AddConstant`](#nn.AddConstant).
diff --git a/init.lua b/init.lua
index 70027a1..1e3924b 100644
--- a/init.lua
+++ b/init.lua
@@ -44,6 +44,7 @@ require('nn.Mean')
require('nn.CMul')
require('nn.Mul')
require('nn.MulConstant')
+require('nn.CAdd')
require('nn.Add')
require('nn.AddConstant')
require('nn.Dropout')
@@ -90,6 +91,7 @@ require('nn.ReLU6')
require('nn.PReLU')
require('nn.LeakyReLU')
require('nn.SpatialSoftMax')
+require('nn.SpatialLogSoftMax')
require('nn.RReLU')
require('nn.ELU')
@@ -173,6 +175,8 @@ require('nn.BCECriterion')
require('nn.CrossEntropyCriterion')
require('nn.ParallelCriterion')
+require('nn.PixelShuffle')
+
require('nn.StochasticGradient')
require('nn.MM')
@@ -183,4 +187,5 @@ require('nn.SparseJacobian')
require('nn.hessian')
require('nn.test')
+
return nn
diff --git a/lib/THNN/CMakeLists.txt b/lib/THNN/CMakeLists.txt
index b221d59..cb704b1 100644
--- a/lib/THNN/CMakeLists.txt
+++ b/lib/THNN/CMakeLists.txt
@@ -14,6 +14,7 @@ ENDIF()
IF(MSVC)
# we want to respect the standard, and we are bored of those **** .
ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ ADD_DEFINITIONS(-DTH_EXPORTS)
ENDIF(MSVC)
IF (CMAKE_VERSION VERSION_LESS "3.1")
diff --git a/lib/THNN/THNN.h b/lib/THNN/THNN.h
index 9efcd46..0019b79 100644
--- a/lib/THNN/THNN.h
+++ b/lib/THNN/THNN.h
@@ -19,7 +19,15 @@ typedef long THIndex_t;
typedef int THInteger_t;
typedef void THNNState;
+#define THNN_resizeAs_indices(I1, I2) \
+ THLongStorage *size2 = THIndexTensor_(newSizeOf)(I2); \
+ if (!THTensor_(isSize)(I1, size2)) \
+ { \
+ THTensor_(resize)(I1, size2, NULL); \
+ } \
+ THLongStorage_free(size2);
+
#include "generic/THNN.h"
#include <THGenerateFloatTypes.h>
-#endif
\ No newline at end of file
+#endif
diff --git a/lib/THNN/generic/Abs.c b/lib/THNN/generic/Abs.c
index c5e36ff..28721ec 100644
--- a/lib/THNN/generic/Abs.c
+++ b/lib/THNN/generic/Abs.c
@@ -17,6 +17,7 @@ void THNN_(Abs_updateGradInput)(
THTensor *gradOutput,
THTensor *gradInput)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
real z = *input_data;
diff --git a/lib/THNN/generic/AbsCriterion.c b/lib/THNN/generic/AbsCriterion.c
index e87bb5b..9bee5de 100644
--- a/lib/THNN/generic/AbsCriterion.c
+++ b/lib/THNN/generic/AbsCriterion.c
@@ -10,7 +10,7 @@ void THNN_(AbsCriterion_updateOutput)(
bool sizeAverage)
{
real sum = 0;
-
+ THNN_CHECK_NELEMENT(input, target);
TH_TENSOR_APPLY2(real, input, real, target,
sum += fabs(*input_data - *target_data);
);
@@ -28,6 +28,7 @@ void THNN_(AbsCriterion_updateGradInput)(
THTensor *gradInput,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/BCECriterion.c b/lib/THNN/generic/BCECriterion.c
index c8d7da2..55909ba 100644
--- a/lib/THNN/generic/BCECriterion.c
+++ b/lib/THNN/generic/BCECriterion.c
@@ -4,8 +4,13 @@
#define EPS 1e-12
-void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, THTensor *weights)
+void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *output,
+ bool sizeAverage, THTensor *weights)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
real sum = 0;
if(weights)
@@ -29,8 +34,13 @@ void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input, THTenso
THTensor_(set1d)(output, 0, sum);
}
-void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, THTensor *weights)
+void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *gradInput,
+ bool sizeAverage, THTensor *weights)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/BatchNormalization.c b/lib/THNN/generic/BatchNormalization.c
index bf36d30..fb4ba90 100644
--- a/lib/THNN/generic/BatchNormalization.c
+++ b/lib/THNN/generic/BatchNormalization.c
@@ -9,8 +9,10 @@ void THNN_(BatchNormalization_updateOutput)(
THTensor *save_mean, THTensor *save_std,
bool train, double momentum, double eps)
{
+ THTensor_(resizeAs)(output, input);
long nInput = THTensor_(size)(input, 1);
- long f,n = THTensor_(nElement)(input) / nInput;
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
#pragma omp parallel for
for (f = 0; f < nInput; ++f) {
@@ -70,8 +72,10 @@ void THNN_(BatchNormalization_backward)(
THTensor *save_mean, THTensor *save_std,
bool train, double scale, double eps)
{
+ THNN_CHECK_SHAPE(input, gradOutput);
long nInput = THTensor_(size)(input, 1);
- long f,n = THTensor_(nElement)(input) / nInput;
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
#pragma omp parallel for
for (f = 0; f < nInput; ++f) {
@@ -97,6 +101,7 @@ void THNN_(BatchNormalization_backward)(
dotp += (*in_data - mean) * (*gradOut_data););
if (gradInput) {
+ THTensor_(resizeAs)(gradInput, input);
THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
if (train) {
diff --git a/lib/THNN/generic/ClassNLLCriterion.c b/lib/THNN/generic/ClassNLLCriterion.c
index aea726c..0db3a8a 100644
--- a/lib/THNN/generic/ClassNLLCriterion.c
+++ b/lib/THNN/generic/ClassNLLCriterion.c
@@ -11,6 +11,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
THTensor *weights,
THTensor *total_weight)
{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1);
int n_dims = THTensor_(nDimension)(input);
int n_classes = THTensor_(size)(input, n_dims - 1);
@@ -21,7 +23,9 @@ void THNN_(ClassNLLCriterion_updateOutput)(
THError("input tensor should be 1D or 2D");
}
if (weights && THTensor_(nElement)(weights) != n_classes) {
- THError("weight tensor should be defined either for all or no classes");
+ THDescBuff s1 = THTensor_(sizeDesc)(weights);
+ THError("weight tensor should be defined either for all %d classes or no classes"
+ " but got weight tensor of shape: %s", n_classes, s1.str);
}
input = THTensor_(newContiguous)(input);
diff --git a/lib/THNN/generic/DistKLDivCriterion.c b/lib/THNN/generic/DistKLDivCriterion.c
index 507324d..e1bd8cd 100644
--- a/lib/THNN/generic/DistKLDivCriterion.c
+++ b/lib/THNN/generic/DistKLDivCriterion.c
@@ -9,6 +9,9 @@ void THNN_(DistKLDivCriterion_updateOutput)(
THTensor *output,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
@@ -28,6 +31,8 @@ void THNN_(DistKLDivCriterion_updateGradInput)(
THTensor *gradInput,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/ELU.c b/lib/THNN/generic/ELU.c
index 8303de0..784a203 100644
--- a/lib/THNN/generic/ELU.c
+++ b/lib/THNN/generic/ELU.c
@@ -8,7 +8,7 @@ void THNN_(ELU_updateOutput)(
THTensor *output,
real alpha,
bool inplace)
-{
+{
if(inplace) {
TH_TENSOR_APPLY(real, input,
if(*input_data <= 0) {
@@ -33,6 +33,7 @@ void THNN_(ELU_updateGradInput)(
real alpha,
bool inplace)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
if(inplace) {
TH_TENSOR_APPLY2(real, gradOutput, real, output,
if(*output_data <= 0) {
diff --git a/lib/THNN/generic/HardShrink.c b/lib/THNN/generic/HardShrink.c
index 689f565..50d272c 100644
--- a/lib/THNN/generic/HardShrink.c
+++ b/lib/THNN/generic/HardShrink.c
@@ -27,6 +27,7 @@ void THNN_(HardShrink_updateGradInput)(
THTensor *gradInput,
real lambda)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if (*input_data > lambda || *input_data < -lambda)
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
index f360068..57ef1be 100644
--- a/lib/THNN/generic/HardTanh.c
+++ b/lib/THNN/generic/HardTanh.c
@@ -37,8 +37,8 @@ void THNN_(HardTanh_updateOutput)(
{
real* ptr_input = THTensor_(data)(input);
real* ptr_output = THTensor_(data)(output);
- long i;
- long n = THTensor_(nElement)(input);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
if (inplace)
#pragma omp parallel for private(i)
@@ -72,6 +72,7 @@ void THNN_(HardTanh_updateGradInput)(
real max_val,
bool inplace)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
if (inplace)
THTensor_(set)(gradInput, gradOutput);
else
@@ -102,8 +103,8 @@ void THNN_(HardTanh_updateGradInput)(
real* ptr_gradOutput = THTensor_(data)(gradOutput);
real* ptr_gradInput = THTensor_(data)(gradInput);
real* ptr_input = THTensor_(data)(input);
- long i;
- long n = THTensor_(nElement)(input);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
if (inplace)
#pragma omp parallel for private(i)
diff --git a/lib/THNN/generic/L1Cost.c b/lib/THNN/generic/L1Cost.c
index 86f69a6..8f5eb17 100644
--- a/lib/THNN/generic/L1Cost.c
+++ b/lib/THNN/generic/L1Cost.c
@@ -7,6 +7,7 @@ void THNN_(L1Cost_updateOutput)(
THTensor *input,
THTensor *output)
{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
accreal sum = 0;
TH_TENSOR_APPLY(real, input,
@@ -22,6 +23,7 @@ void THNN_(L1Cost_updateGradInput)(
THTensor *gradOutput,
THTensor *gradInput)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY2(real, gradInput, real, input,
if (*input_data > 0)
diff --git a/lib/THNN/generic/LeakyReLU.c b/lib/THNN/generic/LeakyReLU.c
index 5276989..a4d9677 100644
--- a/lib/THNN/generic/LeakyReLU.c
+++ b/lib/THNN/generic/LeakyReLU.c
@@ -34,6 +34,7 @@ void THNN_(LeakyReLU_updateGradInput)(
real negval,
bool inplace)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
diff --git a/lib/THNN/generic/Linear.c b/lib/THNN/generic/Linear.c
new file mode 100644
index 0000000..933bc4b
--- /dev/null
+++ b/lib/THNN/generic/Linear.c
@@ -0,0 +1,110 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Linear.c"
+#else
+
+void THNN_(Linear_updateAddBuffer)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *addBuffer)
+{
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(addBuffer);
+ if (nElement != nframe) {
+ THTensor_(resize1d)(addBuffer,nframe);
+ THTensor_(fill)(addBuffer,1.0);
+ }
+}
+
+void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer)
+{
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(resize1d)(output,THTensor_(size)(weight,0));
+ if (bias) {
+ THTensor_(copy)(output,bias);
+ }
+ else {
+ THTensor_(zero)(output);
+ }
+ THTensor_(addmv)(output,1,output,1,weight,input);
+ }
+ else if (dim == 2) {
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(output);
+ THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0));
+ if (THTensor_(nElement)(output) != nElement) {
+ THTensor_(zero)(output);
+ }
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor_(transpose)(weight,weight,0,1);
+ THTensor_(addmm)(output,0,output,1,input,weight);
+ THTensor_(transpose)(weight,weight,0,1);
+ if (bias) {
+ THTensor_(addr)(output,1,output,1,addBuffer,bias);
+ }
+ }
+}
+
+void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight)
+{
+ if (gradInput) {
+ long nElement = THTensor_(nElement)(gradInput);
+ THTensor_(resizeAs)(gradInput,input);
+ if (THTensor_(nElement)(gradInput) != nElement) {
+ THTensor_(zero)(gradInput);
+ }
+
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(transpose)(weight,weight,0,1);
+ THTensor_(addmv)(gradInput,0,gradInput,1,weight,gradOutput);
+ THTensor_(transpose)(weight,weight,0,1);
+ }
+ else if (dim == 2) {
+ THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight);
+ }
+ }
+}
+
+void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ real scale)
+{
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input);
+ if (bias) {
+ THTensor_(cadd)(gradBias,gradBias,scale,gradOutput);
+ }
+ }
+ else if (dim == 2) {
+ THTensor_(transpose)(gradOutput,gradOutput,0,1);
+ THTensor_(addmm)(gradWeight,1,gradWeight,scale,gradOutput,input);
+ if (bias) {
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor_(addmv)(gradBias,1,gradBias,scale,gradOutput,addBuffer);
+ }
+ THTensor_(transpose)(gradOutput,gradOutput,0,1);
+ }
+}
+
+#endif
diff --git a/lib/THNN/generic/LogSigmoid.c b/lib/THNN/generic/LogSigmoid.c
index 20932f1..651d560 100644
--- a/lib/THNN/generic/LogSigmoid.c
+++ b/lib/THNN/generic/LogSigmoid.c
@@ -25,6 +25,7 @@ void THNN_(LogSigmoid_updateGradInput)(
THTensor *gradInput,
THTensor *buffer)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, buffer);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
real z = *buffer_data;
diff --git a/lib/THNN/generic/LogSoftMax.c b/lib/THNN/generic/LogSoftMax.c
index 3160d8a..3ed9c3b 100644
--- a/lib/THNN/generic/LogSoftMax.c
+++ b/lib/THNN/generic/LogSoftMax.c
@@ -8,23 +8,35 @@ void THNN_(LogSoftMax_updateOutput)(
THTensor *output)
{
real *input_data, *output_data;
- long nframe = 0, dim = 0;
- long t, d;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
+ stride = 1;
}
else if (input->nDimension == 2)
{
nframe = input->size[0];
dim = input->size[1];
+ stride = 1;
}
- else
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
{
- THArgCheck(0, 2, "vector or matrix expected");
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
}
+ else
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
input = THTensor_(newContiguous)(input);
THTensor_(resizeAs)(output, input);
@@ -35,22 +47,22 @@ void THNN_(LogSoftMax_updateOutput)(
accreal logsum;
real maxInput;
#pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
- for (t = 0; t < nframe; t++)
+ for (t = 0; t < stride*nframe; t++)
{
logsum = 0;
maxInput = -THInf;
- input_data = input_data0 + dim*t;
- output_data = output_data0 + dim*t;
+ input_data = input_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
for (d = 0; d < dim; d++)
- maxInput = THMax(maxInput, input_data[d]);
+ maxInput = THMax(maxInput, input_data[d*stride]);
for (d = 0; d < dim; d++)
- logsum += exp(input_data[d] - maxInput);
+ logsum += exp(input_data[d*stride] - maxInput);
logsum = maxInput + log(logsum);
for (d = 0; d < dim; d++)
- output_data[d] = input_data[d] - logsum;
+ output_data[d*stride] = input_data[d*stride] - logsum;
}
THTensor_(free)(input);
@@ -63,26 +75,41 @@ void THNN_(LogSoftMax_updateGradInput)(
THTensor *gradInput,
THTensor *output)
{
-
+ THNN_CHECK_SHAPE(input, gradOutput);
gradOutput = THTensor_(newContiguous)(gradOutput);
real *gradInput_data, *gradOutput_data, *output_data;
- long nframe = 0, dim = 0;
- long t, d;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
if (output->nDimension == 1)
{
nframe = 1;
dim = output->size[0];
+ stride = 1;
}
else if (output->nDimension == 2)
{
nframe = output->size[0];
dim = output->size[1];
+ stride = 1;
}
- else
+ else if (output->nDimension == 3)
{
- THError("vector or matrix expected");
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
}
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ THError("1D, 2D, 3D or 4D tensor expected");
+
+ output = THTensor_(newContiguous)(output);
+  /* gradOutput was already made contiguous above; a second newContiguous
+     here would retain it again and leak, since it is freed only once. */
THTensor_(resizeAs)(gradInput, output);
real *gradInput_data0 = THTensor_(data)(gradInput);
@@ -90,21 +117,22 @@ void THNN_(LogSoftMax_updateGradInput)(
real *gradOutput_data0 = THTensor_(data)(gradOutput);
accreal sum;
#pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
- for (t = 0; t < nframe; t++)
+ for (t = 0; t < stride*nframe; t++)
{
sum = 0;
- gradInput_data = gradInput_data0 + dim*t;
- output_data = output_data0 + dim*t;
- gradOutput_data = gradOutput_data0 + dim*t;
+ gradInput_data = gradInput_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+ gradOutput_data = gradOutput_data0 + (t/stride)*dim*stride + t % stride;
for (d = 0; d < dim; d++)
- sum += gradOutput_data[d];
+ sum += gradOutput_data[d*stride];
for (d = 0; d < dim; d++)
- gradInput_data[d] = gradOutput_data[d] - exp(output_data[d])*sum;
+ gradInput_data[d*stride] = gradOutput_data[d*stride] - exp(output_data[d*stride])*sum;
}
THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
}
#endif
diff --git a/lib/THNN/generic/LookupTable.c b/lib/THNN/generic/LookupTable.c
index 378d1c3..b460f38 100644
--- a/lib/THNN/generic/LookupTable.c
+++ b/lib/THNN/generic/LookupTable.c
@@ -6,9 +6,9 @@ static void THNN_(LookupTable_resetCount)(
THInteger_t *count_data,
THIndexTensor *input)
{
- int i;
+ ptrdiff_t i;
THIndex_t *input_data = THIndexTensor_(data)(input);
- long numel = THIndexTensor_(nElement)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
for (i = 0; i<numel; i++)
{
@@ -29,12 +29,12 @@ void THNN_(LookupTable_accGradParameters)(
THTensor *gradWeight,
THIntegerTensor *count,
THTensor *sorted,
- THTensor *indices,
+ THIndexTensor *indices,
bool scaleGradByFreq,
int paddingValue,
real scale)
{
- long i;
+ ptrdiff_t i;
THInteger_t *count_data = NULL;
if (scaleGradByFreq)
@@ -47,17 +47,22 @@ void THNN_(LookupTable_accGradParameters)(
THError("gradWeight must be contiguous");
if (!THIndexTensor_(isContiguous)(input))
THError("input must be contiguous");
- if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2)
- THError("input must be a vector or matrix");
+ if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
+ THError("input must be a vector or matrix, but is of shape: %s", s1.str);
+ }
THIndex_t *input_data = THIndexTensor_(data)(input);
- long numel = THIndexTensor_(nElement)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
long numw = THTensor_(size)(gradWeight, 0);
// check that inputs are all within range
for (i=0; i<numel; i++)
- if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE)
- THError("input out of range");
+ if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
+ THError("inputs need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ input_data[i]);
+ }
gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -170,19 +175,23 @@ void THNN_(LookupTable_renorm)(
if (normType <= 0)
THError("non-positive-norm not supported");
- long i;
+ ptrdiff_t i;
THIndex_t *row_idx = THIndexTensor_(data)(idx);
- long numel = THIndexTensor_(nElement)(idx);
+ ptrdiff_t numel = THIndexTensor_(nElement)(idx);
long numw = THTensor_(size)(weight, 0);
long stride = THTensor_(stride)(weight, 0);
real *gw = THTensor_(data)(weight);
- for (i=0; i<numel; i++)
- if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE)
- THError("input out of range");
+ for (i=0; i<numel; i++) {
+ if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
+      THError("input needs to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ row_idx[i]);
+ }
+ }
// get unique indices
qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
- long ptr = 0;
+ ptrdiff_t ptr = 0;
for (i=0; i<numel; i++)
if (i == 0 || row_idx[i] != row_idx[i-1])
row_idx[ptr++] = row_idx[i];
diff --git a/lib/THNN/generic/MSECriterion.c b/lib/THNN/generic/MSECriterion.c
index c576e3d..ffdd5d5 100644
--- a/lib/THNN/generic/MSECriterion.c
+++ b/lib/THNN/generic/MSECriterion.c
@@ -9,6 +9,9 @@ void THNN_(MSECriterion_updateOutput)(
THTensor *output,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
@@ -29,6 +32,8 @@ void THNN_(MSECriterion_updateGradInput)(
THTensor *gradInput,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+
real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/MarginCriterion.c b/lib/THNN/generic/MarginCriterion.c
index 792ce7b..1675860 100644
--- a/lib/THNN/generic/MarginCriterion.c
+++ b/lib/THNN/generic/MarginCriterion.c
@@ -10,6 +10,8 @@ void THNN_(MarginCriterion_updateOutput)(
bool sizeAverage,
real margin)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
@@ -31,6 +33,7 @@ void THNN_(MarginCriterion_updateGradInput)(
bool sizeAverage,
real margin)
{
+ THNN_CHECK_NELEMENT(input, target);
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/MultiLabelMarginCriterion.c b/lib/THNN/generic/MultiLabelMarginCriterion.c
index 9cfc5fe..fe851c9 100644
--- a/lib/THNN/generic/MultiLabelMarginCriterion.c
+++ b/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -2,43 +2,48 @@
#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
#else
+// TODO: improve error messages
void THNN_(MultiLabelMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *output,
THTensor *isTarget,
bool sizeAverage)
{
- real *input_data, *target_data, *isTarget_data;
+ real *input_data, *isTarget_data;
+ THIndex_t *target_data;
long nframe, dim;
long t, d, dt, ddt;
real sum;
- THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
- THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
}
else
{
nframe = input->size[0];
dim = input->size[1];
- THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
}
- THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
- THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
+ THArgCheck(THIndexTensor_(minall)(target) >= 0, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) <= dim, 3, "target out of range");
- target = THTensor_(newContiguous)(target);
+ target = THIndexTensor_(newContiguous)(target);
input = THTensor_(newContiguous)(input);
input_data = THTensor_(data)(input);
- target_data = THTensor_(data)(target);
+ target_data = THIndexTensor_(data)(target);
- THTensor_(resizeAs)(isTarget, target);
+ THNN_resizeAs_indices(isTarget, target);
THTensor_(zero)(isTarget);
isTarget_data = THTensor_(data)(isTarget);
@@ -47,14 +52,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(
{
for (ddt = 0; ddt < dim; ddt++)
{
- long target_idx = (long)target_data[ddt] - TH_INDEX_BASE;
+ THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE;
if (target_idx < 0)
break;
isTarget_data[target_idx] = 1;
}
for (dt = 0; dt < dim; dt++)
{
- long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
real input_target;
if (target_idx < 0)
break;
@@ -82,53 +87,58 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)(
THTensor_(set1d)(output, 0, sum);
THTensor_(free)(input);
- THTensor_(free)(target);
+ THIndexTensor_(free)(target);
}
void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *gradInput,
THTensor *isTarget,
bool sizeAverage)
{
real *input_data;
real *gradInput_data;
- real *target_data;
+ THIndex_t *target_data;
real *isTarget_data;
long nframe, dim;
long t, d, dt;
real g;
- THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
if (input->nDimension == 1)
{
nframe = 1;
dim = input->size[0];
- THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, "inconsistent target size");
- THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, "inconsistent isTarget size");
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3,
+ "inconsistent isTarget size");
}
else
{
nframe = input->size[0];
dim = input->size[1];
- THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) && (target->size[1] == dim), 3, "inconsistent target size");
- THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe)
+ && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
}
- THArgCheck(THTensor_(minall)(target) >= 0, 3, "target out of range");
- THArgCheck(THTensor_(maxall)(target) <= dim, 3, "target out of range");
+ THArgCheck(THIndexTensor_(minall)(target) >= 0, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) <= dim, 3, "target out of range");
THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
- target = THTensor_(newContiguous)(target);
+ target = THIndexTensor_(newContiguous)(target);
input = THTensor_(newContiguous)(input);
isTarget = THTensor_(newContiguous)(isTarget);
input_data = THTensor_(data)(input);
- target_data = THTensor_(data)(target);
+ target_data = THIndexTensor_(data)(target);
isTarget_data = THTensor_(data)(isTarget);
g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
@@ -141,7 +151,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(
{
for (dt = 0; dt < dim; dt++)
{
- long target_idx = (long)target_data[dt] - TH_INDEX_BASE;
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
real input_target;
if (target_idx < 0)
break;
@@ -167,7 +177,7 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)(
}
THTensor_(free)(input);
- THTensor_(free)(target);
+ THIndexTensor_(free)(target);
THTensor_(free)(isTarget);
}
diff --git a/lib/THNN/generic/MultiMarginCriterion.c b/lib/THNN/generic/MultiMarginCriterion.c
index 455cf5e..af83e89 100644
--- a/lib/THNN/generic/MultiMarginCriterion.c
+++ b/lib/THNN/generic/MultiMarginCriterion.c
@@ -2,22 +2,25 @@
#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
#else
+// TODO: improve error messages
void THNN_(MultiMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *output,
bool sizeAverage,
int p,
THTensor *weights,
real margin)
{
- real *input_data, *target_data, *weights_data;
+ real *input_data, *weights_data;
+ THIndex_t *target_data;
long nframe, dim;
long t, d;
real sum;
- THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
if (input->nDimension == 1)
{
@@ -28,26 +31,28 @@ void THNN_(MultiMarginCriterion_updateOutput)(
{
nframe = input->size[0];
dim = input->size[1];
- THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
}
for (t = 0; t < nframe; t++)
{
- real idx = THTensor_(get1d)(target, t);
- THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, "target out of range");
+ THIndex_t idx = THIndexTensor_(get1d)(target, t);
+ THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3,
+ "target out of range");
}
input = THTensor_(newContiguous)(input);
- target = THTensor_(newContiguous)(target);
+ target = THIndexTensor_(newContiguous)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
input_data = THTensor_(data)(input);
- target_data = THTensor_(data)(target);
+ target_data = THIndexTensor_(data)(target);
weights_data = weights ? THTensor_(data)(weights) : NULL;
sum = 0;
for (t = 0; t < nframe; t++)
{
- long target_idx = (long)(target_data[t] - TH_INDEX_BASE);
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
real input_target = input_data[target_idx];
for (d = 0; d < dim; d++)
{
@@ -72,7 +77,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
THTensor_(set1d)(output, 0, sum);
THTensor_(free)(input);
- THTensor_(free)(target);
+ THIndexTensor_(free)(target);
if(weights)
THTensor_(free)(weights);
}
@@ -80,7 +85,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
void THNN_(MultiMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *gradInput,
bool sizeAverage,
int p,
@@ -89,13 +94,14 @@ void THNN_(MultiMarginCriterion_updateGradInput)(
{
real *input_data;
real *gradInput_data;
- real *target_data;
+ THIndex_t *target_data;
real *weights_data;
long nframe, dim;
long t, d;
real g;
- THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, "vector or matrix expected");
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
if (input->nDimension == 1)
{
@@ -106,25 +112,26 @@ void THNN_(MultiMarginCriterion_updateGradInput)(
{
nframe = input->size[0];
dim = input->size[1];
- THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, "inconsistent target size");
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
}
g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
input = THTensor_(newContiguous)(input);
- target = THTensor_(newContiguous)(target);
+ target = THIndexTensor_(newContiguous)(target);
input_data = THTensor_(data)(input);
THTensor_(resizeAs)(gradInput, input);
gradInput_data = THTensor_(data)(gradInput);
- target_data = THTensor_(data)(target);
+ target_data = THIndexTensor_(data)(target);
weights = weights ? THTensor_(newContiguous)(weights) : NULL;
weights_data = weights ? THTensor_(data)(weights) : NULL;
for (t = 0; t < nframe; t++)
{
- long target_idx = (long)(target_data[t]) - TH_INDEX_BASE;
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
real input_target = input_data[target_idx];
real gradInput_target = 0;
for (d = 0; d < dim; d++)
@@ -151,7 +158,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)(
}
THTensor_(free)(input);
- THTensor_(free)(target);
+ THIndexTensor_(free)(target);
if(weights)
THTensor_(free)(weights);
}
diff --git a/lib/THNN/generic/PReLU.c b/lib/THNN/generic/PReLU.c
index b1b2c0f..3d2ebfc 100644
--- a/lib/THNN/generic/PReLU.c
+++ b/lib/THNN/generic/PReLU.c
@@ -21,6 +21,7 @@ void THNN_(PReLU_updateOutput)(
}
else
{
+ input = THTensor_(newContiguous)(input);
long bs, ks;
{
long input_ndim = THTensor_(nDimension)(input);
@@ -65,6 +66,7 @@ void THNN_(PReLU_updateOutput)(
n_output_data += ks;
}
}
+ THTensor_(free)(input);
}
}
@@ -76,6 +78,7 @@ void THNN_(PReLU_updateGradInput)(
THTensor *weight,
THIndex_t nOutputPlane)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
if (nOutputPlane == 0)
@@ -90,6 +93,8 @@ void THNN_(PReLU_updateGradInput)(
}
else
{
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
const real *input_data = THTensor_(data)(input);
const real *gradOutput_data = THTensor_(data)(gradOutput);
const real *weight_data = THTensor_(data)(weight);
@@ -145,6 +150,8 @@ void THNN_(PReLU_updateGradInput)(
n_gradOutput_data += ks;
}
}
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
}
}
@@ -160,6 +167,7 @@ void THNN_(PReLU_accGradParameters)(
THIndex_t nOutputPlane,
real scale)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
real *gradWeight_data = THTensor_(data)(gradWeight);
if (nOutputPlane == 0)
@@ -173,6 +181,8 @@ void THNN_(PReLU_accGradParameters)(
}
else
{
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
long bs, ks;
{
long input_ndim = THTensor_(nDimension)(input);
@@ -222,6 +232,8 @@ void THNN_(PReLU_accGradParameters)(
n_gradOutput_data += ks;
}
}
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
}
}
diff --git a/lib/THNN/generic/RReLU.c b/lib/THNN/generic/RReLU.c
index 8bf6764..cdb9dca 100644
--- a/lib/THNN/generic/RReLU.c
+++ b/lib/THNN/generic/RReLU.c
@@ -86,6 +86,7 @@ void THNN_(RReLU_updateGradInput)(
bool train,
bool inplace)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
{
// multiply the gradient by the noise tensor
diff --git a/lib/THNN/generic/Sigmoid.c b/lib/THNN/generic/Sigmoid.c
index 0a1b375..f48cb0f 100644
--- a/lib/THNN/generic/Sigmoid.c
+++ b/lib/THNN/generic/Sigmoid.c
@@ -21,6 +21,7 @@ void THNN_(Sigmoid_updateGradInput)(
THTensor *gradInput,
THTensor *output)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, output);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
real z = *output_data;
diff --git a/lib/THNN/generic/SmoothL1Criterion.c b/lib/THNN/generic/SmoothL1Criterion.c
index 8b53100..d1b53da 100644
--- a/lib/THNN/generic/SmoothL1Criterion.c
+++ b/lib/THNN/generic/SmoothL1Criterion.c
@@ -9,6 +9,9 @@ void THNN_(SmoothL1Criterion_updateOutput)(
THTensor *output,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
real sum = 0;
TH_TENSOR_APPLY2(real, input, real, target,
real z = fabs(*input_data - *target_data);
@@ -28,6 +31,7 @@ void THNN_(SmoothL1Criterion_updateGradInput)(
THTensor *gradInput,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/SoftMarginCriterion.c b/lib/THNN/generic/SoftMarginCriterion.c
index d9b618d..bac0a3b 100644
--- a/lib/THNN/generic/SoftMarginCriterion.c
+++ b/lib/THNN/generic/SoftMarginCriterion.c
@@ -9,6 +9,9 @@ void THNN_(SoftMarginCriterion_updateOutput)(
THTensor *output,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
real sum;
sum = 0;
@@ -29,6 +32,7 @@ void THNN_(SoftMarginCriterion_updateGradInput)(
THTensor *gradInput,
bool sizeAverage)
{
+ THNN_CHECK_NELEMENT(input, target);
real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/SoftMax.c b/lib/THNN/generic/SoftMax.c
index 8bccefd..303526a 100644
--- a/lib/THNN/generic/SoftMax.c
+++ b/lib/THNN/generic/SoftMax.c
@@ -8,8 +8,8 @@ void THNN_(SoftMax_updateOutput)(
THTensor *output)
{
real *input_data, *output_data;
- long nframe = 0, dim = 0, stride = 0;
- long t;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
if (input->nDimension == 1)
{
@@ -55,7 +55,7 @@ void THNN_(SoftMax_updateOutput)(
real inputMax = -THInf;
accreal sum;
- long d;
+ ptrdiff_t d;
for (d = 0; d < dim; d++)
{
if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
@@ -85,9 +85,10 @@ void THNN_(SoftMax_updateGradInput)(
THTensor *gradInput,
THTensor *output)
{
+ THNN_CHECK_SHAPE(input, gradOutput);
real *gradInput_data, *gradOutput_data, *output_data;
- long nframe = 0, dim = 0, stride = 0;
- long t;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
if (output->nDimension == 1)
{
@@ -133,7 +134,7 @@ void THNN_(SoftMax_updateGradInput)(
real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
- long d;
+ ptrdiff_t d;
accreal sum = 0;
for (d = 0; d < dim; d++)
sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
diff --git a/lib/THNN/generic/SoftPlus.c b/lib/THNN/generic/SoftPlus.c
index 407413f..7305238 100644
--- a/lib/THNN/generic/SoftPlus.c
+++ b/lib/THNN/generic/SoftPlus.c
@@ -26,6 +26,7 @@ void THNN_(SoftPlus_updateGradInput)(
real beta,
real threshold)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, output);
// d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
diff --git a/lib/THNN/generic/SoftShrink.c b/lib/THNN/generic/SoftShrink.c
index 7bd1cc8..28dcce0 100644
--- a/lib/THNN/generic/SoftShrink.c
+++ b/lib/THNN/generic/SoftShrink.c
@@ -27,6 +27,7 @@ void THNN_(SoftShrink_updateGradInput)(
THTensor *gradInput,
real lambda)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
if ((*input_data) > lambda || (*input_data) < -lambda)
diff --git a/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
index 5d6d995..fff716e 100644
--- a/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
+++ b/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -5,8 +5,8 @@
static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
- real *indx_p,
- real *indy_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
long nslices,
long iwidth,
long iheight,
@@ -38,8 +38,8 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
/* local pointers */
real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
real *op = output_p + k*owidth*oheight + i*owidth + j;
- real *indyp = indy_p + k*owidth*oheight + i*owidth + j;
- real *indxp = indx_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indyp = indy_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indxp = indx_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
long maxindex = -1;
@@ -64,7 +64,7 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
*op = maxval;
/* store location of max (x,y) */
- *indyp = (int)(maxindex / kW) + TH_INDEX_BASE;
+ *indyp = (maxindex / kW) + TH_INDEX_BASE;
*indxp = (maxindex % kW) + TH_INDEX_BASE;
}
}
@@ -75,7 +75,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth,
int oheight)
{
@@ -93,10 +93,11 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
real *input_data;
real *output_data;
- real *indices_data;
+ THIndex_t *indices_data;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
if (input->nDimension == 4)
{
@@ -120,11 +121,11 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
{
THTensor_(resize3d)(output, nslices, oheight, owidth);
/* indices will contain i,j locations for each output point */
- THTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
+ THIndexTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
indices_data+nslices*owidth*oheight, indices_data,
@@ -140,11 +141,11 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
/* indices will contain i,j locations for each output point */
- THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
+ THIndexTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
@@ -163,8 +164,8 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
- real *indx_p,
- real *indy_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
long nslices,
long iwidth,
long iheight,
@@ -177,8 +178,8 @@ static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
{
real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
- real *indx_p_k = indx_p + k*owidth*oheight;
- real *indy_p_k = indy_p + k*owidth*oheight;
+ THIndex_t *indx_p_k = indx_p + k*owidth*oheight;
+ THIndex_t *indy_p_k = indy_p + k*owidth*oheight;
/* calculate max points */
long i, j;
@@ -204,7 +205,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices)
+ THIndexTensor *indices)
{
int dimw = 2;
int dimh = 1;
@@ -216,7 +217,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
int owidth;
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -241,7 +242,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
@@ -271,4 +272,3 @@ void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
}
#endif
-
diff --git a/lib/THNN/generic/SpatialAveragePooling.c b/lib/THNN/generic/SpatialAveragePooling.c
index 37ee274..56db162 100644
--- a/lib/THNN/generic/SpatialAveragePooling.c
+++ b/lib/THNN/generic/SpatialAveragePooling.c
@@ -31,8 +31,12 @@ void THNN_(SpatialAveragePooling_updateOutput)(
long k;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
- THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
if (input->nDimension == 4) {
nbatch = input->size[0];
@@ -65,7 +69,10 @@ void THNN_(SpatialAveragePooling_updateOutput)(
--outputWidth;
}
- THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2, "input image smaller than kernel size");
+ THArgCheck(inputWidth >= kW - 2 * padW && inputHeight >= kH - 2 * padH, 2,
+ "input image smaller than (kernel size - 2 * padW). Got "
+ "inputHeight: %d inputWidth: %d kH %d kW %d padH %d padW %d",
+ inputHeight, inputWidth, kH, kW, padH, padW);
if (input->nDimension == 3)
THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
@@ -148,7 +155,8 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
int dimh = 1;
int dimc = 0;
long nbatch = 1;
-
+ long ndim = 3;
+
long inputWidth;
long inputHeight;
long outputWidth;
@@ -165,6 +173,7 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
dimw++;
dimh++;
dimc++;
+ ndim = 4;
}
inputWidth = input->size[dimw];
@@ -191,6 +200,9 @@ void THNN_(SpatialAveragePooling_updateGradInput)(
--outputWidth;
}
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+
input_data = THTensor_(data)(input);
THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/SpatialClassNLLCriterion.c b/lib/THNN/generic/SpatialClassNLLCriterion.c
index cbb4cea..d711c85 100644
--- a/lib/THNN/generic/SpatialClassNLLCriterion.c
+++ b/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -4,9 +4,12 @@
#define INITIAL_CHECK \
THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \
- "only batches of spatial targets supported (3D tensors)"); \
- THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
- "only batches of spatial inputs supported (4D tensors)"); \
+ "only batches of spatial targets supported (3D tensors)" \
+ " but got targets of dimension: %d", \
+ THIndexTensor_(nDimension)(target)); \
+ THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
+ "only batches of spatial inputs supported (4D tensors), " \
+ "but got input of dimension: %d", THTensor_(nDimension)(input)); \
if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
THError("weight tensor should be defined either for all or no classes"); \
} \
diff --git a/lib/THNN/generic/SpatialConvolutionLocal.c b/lib/THNN/generic/SpatialConvolutionLocal.c
index 091c6f0..4d446dd 100644
--- a/lib/THNN/generic/SpatialConvolutionLocal.c
+++ b/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -2,28 +2,97 @@
#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
#else
+static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH,
+ int dW, int padH, int padW,
+ long inputHeight, long inputWidth,
+ long outputHeight, long outputWidth) {
-static void THNN_(SpatialConvolutionLocal_updateOutput_frame)(THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
- int kW, int kH, int dW, int dH, int padW, int padH,
- long nInputPlane, long inputWidth, long inputHeight,
- long nOutputPlane, long outputWidth, long outputHeight)
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[2] / (kH * kW);
+ long nOutputPlane = weight->size[1];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static int THNN_(view_weight_local)(THTensor **_weight)
+{
+ THTensor *weight = *_weight;
+ THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4,
+ "weight tensor should be 3D or 6D - got %dD", weight->nDimension);
+ if (weight->nDimension == 6) {
+ long s1 = weight->size[0] * weight->size[1];
+ long s2 = weight->size[2];
+ long s3 = weight->size[3] * weight->size[4] * weight->size[5];
+ *_weight = THTensor_(newWithStorage3d)(weight->storage,
+ weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+ return 1;
+ }
+ return 0;
+}
+
+static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (
+ THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias, THTensor *finput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
{
long i;
THTensor *output3d, *finput3d;
- THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
THTensor_(copy)(output, bias);
- output3d = THTensor_(newWithStorage3d)(output->storage, output->storageOffset,
- outputHeight*outputWidth, 1,
- nOutputPlane, outputHeight*outputWidth,
- 1, nOutputPlane*outputHeight*outputWidth);
-
- finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
- outputHeight*outputWidth, 1,
- kW*kH*nInputPlane, outputHeight*outputWidth,
- 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+ output3d = THTensor_(newWithStorage3d)
+ (output->storage, output->storageOffset,
+ outputHeight * outputWidth, 1,
+ nOutputPlane, outputHeight * outputWidth,
+ 1, nOutputPlane * outputHeight * outputWidth);
+
+ finput3d = THTensor_(newWithStorage3d)
+ (finput->storage, finput->storageOffset,
+ outputHeight * outputWidth, 1,
+ kW * kH * nInputPlane, outputHeight * outputWidth,
+ 1, kW * kH * nInputPlane * outputHeight * outputWidth);
+
// weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
// finput3d: oH*oW x nInputPlane*kH*kW x 1
THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
@@ -47,18 +116,27 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
long inputWidth, long inputHeight,
long outputWidth, long outputHeight)
{
- long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
- long nOutputPlane = THTensor_(size)(weight,1);
+ int freeWeight = THNN_(view_weight_local)(&weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+
+ long nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH);
+ long nOutputPlane = THTensor_(size)(weight, 1);
if(input->nDimension == 3)
{
THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
- THNN_(SpatialConvolutionLocal_updateOutput_frame)(input, output, weight, bias, finput,
- kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
}
else
{
@@ -75,23 +153,30 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
- THNN_(SpatialConvolutionLocal_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
- kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(input_t);
THTensor_(free)(output_t);
THTensor_(free)(finput_t);
}
}
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
}
-static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
- int kW, int kH, int dW, int dH, int padW, int padH,
- long nInputPlane, long inputWidth, long inputHeight,
- long nOutputPlane, long outputWidth, long outputHeight)
+static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (THTensor *gradInput, THTensor *gradOutput,
+ THTensor *weight, THTensor *fgradInput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
{
THTensor *gradOutput3d, *fgradInput3d;
gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
@@ -111,9 +196,11 @@ static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)(THTensor *gradI
THTensor_(free)(fgradInput3d);
THTensor_(zero)(gradInput);
-
+
THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
}
void THNN_(SpatialConvolutionLocal_updateGradInput)(
@@ -130,6 +217,13 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
long inputWidth, long inputHeight,
long outputWidth, long outputHeight)
{
+ int freeWeight = THNN_(view_weight_local)(&weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
long nOutputPlane = THTensor_(size)(weight,1);
@@ -139,9 +233,11 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
if(input->nDimension == 3)
{
- THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput, gradOutput, weight,
+ fgradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
}
else
{
@@ -155,9 +251,11 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
- THNN_(SpatialConvolutionLocal_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, weight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(gradInput_t);
THTensor_(free)(gradOutput_t);
@@ -166,12 +264,19 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
}
THTensor_(transpose)(weight, weight, 1, 2);
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
+
}
-static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)(THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale,
- int kW, int kH, int dW, int dH, int padW, int padH,
- long nInputPlane, long inputWidth, long inputHeight,
- long nOutputPlane, long outputWidth, long outputHeight)
+static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
{
THTensor *gradOutput3d, *finput3d;
@@ -209,14 +314,26 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
long outputWidth, long outputHeight,
real scale)
{
+
+ int freeWeight = THNN_(view_weight_local)(&gradWeight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
long nOutputPlane = THTensor_(size)(gradWeight,1);
if(input->nDimension == 3)
{
- THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale, kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput, gradWeight, gradBias, finput, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
}
else
{
@@ -228,14 +345,23 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
- THNN_(SpatialConvolutionLocal_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale, kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput_t, gradWeight, gradBias, finput_t, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(gradOutput_t);
THTensor_(free)(finput_t);
}
}
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+
+ if (freeWeight)
+ THTensor_(free)(gradWeight);
+
}
#endif
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index 64aa9db..d093bee 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -2,6 +2,57 @@
#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
#else
+static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1] / (kH * kW);
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
THTensor *input,
THTensor *output,
@@ -24,14 +75,18 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
long i;
THTensor *output2d;
- THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
nOutputPlane, -1,
outputHeight*outputWidth, -1);
if (bias) {
for(i = 0; i < nOutputPlane; i++)
- THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
} else {
THTensor_(zero)(output);
}
@@ -56,64 +111,47 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
int padW,
int padH)
{
- int dimf = 0;
- int dimw = 2;
- int dimh = 1;
+ int freeWeight = 0;
- long nInputPlane;
- long inputWidth;
- long inputHeight;
- long nOutputPlane;
- long outputWidth;
- long outputHeight;
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1);
+ freeWeight = 1;
+ }
- int freeWeight = 0;
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
- THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
- THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
- if (input->nDimension == 4) {
+ if (ndim == 4) {
dimf++;
- dimw++;
dimh++;
+ dimw++;
}
- nInputPlane = input->size[dimf];
- inputWidth = input->size[dimw];
- inputHeight = input->size[dimh];
- nOutputPlane = weight->size[0];
- outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
- outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
- if (outputWidth < 1 || outputHeight < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
-
- int expectedWeightSize = weight->nDimension == 2 ? nInputPlane*kW*kH : nInputPlane;
- int weightInputPlanes = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
- if (expectedWeightSize != weight->size[1])
- THError("Wrong number of input channels! Input has %d channels, expected %d",
- nInputPlane, weightInputPlanes);
-
- if (weight->nDimension == 4) {
- long s1 = weight->size[0];
- long s2 = weight->size[1] * weight->size[2] * weight->size[3];
- weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
- freeWeight = 1;
- }
+ long nInputPlane = input->size[dimf];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
if(input->nDimension == 3)
{
THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
- THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput,
- kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
}
else
{
@@ -130,10 +168,11 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
THTensor *output_t = THTensor_(newSelect)(output, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
- THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t,
- kW, kH, dW, dH, padW, padH,
- nInputPlane, inputWidth, inputHeight,
- nOutputPlane, outputWidth, outputHeight);
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
THTensor_(free)(input_t);
THTensor_(free)(output_t);
@@ -157,15 +196,19 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
int padW,
int padH)
{
- THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
- gradOutput->size[0], -1,
- gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
THTensor_(free)(gradOutput2d);
THTensor_(zero)(gradInput);
- THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, gradInput->size[0], gradInput->size[2], gradInput->size[1], gradOutput->size[2], gradOutput->size[1]);
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
}
void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -183,33 +226,34 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
int padW,
int padH)
{
- long nOutputPlane = weight->size[0];
int freeWeight = 0;
- THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
- THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
- THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1);
+ freeWeight = 1;
+ }
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
+
// depending on the BLAS library, fgradInput (result tensor) might
// be left uninitialized on zero alpha, which might lead to weird behavior
// hence, to be safe, zero it
THTensor_(zero)(fgradInput);
- if (weight->nDimension == 4) {
- long s1 = weight->size[0];
- long s2 = weight->size[1] * weight->size[2] * weight->size[3];
- weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
- freeWeight = 1;
- }
-
THTensor_(transpose)(weight, weight, 0, 1);
if(input->nDimension == 3)
{
- THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, weight, fgradInput, kW, kH, dW, dH, padW, padH);
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
+ weight, fgradInput,
+ kW, kH, dW, dH, padW, padH);
}
else
{
@@ -223,7 +267,9 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
- THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, weight, fgradInput_t, kW, kH, dW, dH, padW, padH);
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
+ weight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH);
THTensor_(free)(gradInput_t);
THTensor_(free)(gradOutput_t);
@@ -245,9 +291,10 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
real scale)
{
long i;
- THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset,
- gradOutput->size[0], -1,
- gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
THTensor_(transpose)(finput, finput, 0, 1);
THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
@@ -285,22 +332,23 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
real scale)
{
int freeWeight = 0;
- long nOutputPlane = gradWeight->size[0];
- THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
- THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
if (gradWeight->nDimension == 4) {
long s1 = gradWeight->size[0];
long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
- gradWeight = THTensor_(newWithStorage2d)(gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
+ gradWeight = THTensor_(newWithStorage2d)(gradWeight->storage,
+ gradWeight->storageOffset,
+ s1, -1, s2, -1);
freeWeight = 1;
}
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
if(input->nDimension == 3)
{
- THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
+ gradBias, finput, scale);
}
else
{
@@ -312,7 +360,8 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
- THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
+ gradBias, finput_t, scale);
THTensor_(free)(gradOutput_t);
THTensor_(free)(finput_t);
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
index 3928af0..9dcc1b4 100644
--- a/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -2,6 +2,62 @@
#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
#else
+static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW) {
+
+ THNN_ARGCHECK(weight->nDimension == 4, 4, weight,
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+ "but got: %s");
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
+ dilationH, dilationW);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
void THNN_(SpatialDilatedConvolution_updateOutput)(
THNNState *state,
THTensor *input,
@@ -15,11 +71,10 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
int padW, int padH,
int dilationW, int dilationH)
{
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
// Params:
int nInputPlane = weight->size[1];
@@ -27,23 +82,15 @@ void THNN_(SpatialDilatedConvolution_updateOutput)(
int batch = 1;
if (input->nDimension == 3) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
}
-
long inputWidth = input->size[3];
long inputHeight = input->size[2];
long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
- if (outputWidth < 1 || outputHeight < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
// Batch size + input planes
long batchSize = input->size[0];
@@ -142,10 +189,9 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
int padW, int padH,
int dilationW, int dilationH)
{
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
// Params
int nInputPlane = weight->size[1];
@@ -156,7 +202,8 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
- THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1],
+ gradOutput->size[2]);
}
long inputWidth = input->size[3];
@@ -236,11 +283,9 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
int dilationW, int dilationH,
real scale)
{
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
// Params
int nInputPlane = gradWeight->size[1];
@@ -251,7 +296,8 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
- THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0],
+ gradOutput->size[1], gradOutput->size[2]);
}
long inputWidth = input->size[3];
diff --git a/lib/THNN/generic/SpatialDilatedMaxPooling.c b/lib/THNN/generic/SpatialDilatedMaxPooling.c
index 6500f49..1a40b8f 100644
--- a/lib/THNN/generic/SpatialDilatedMaxPooling.c
+++ b/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -2,10 +2,80 @@
#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
#else
+static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW, bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationH > 0 && dilationW > 0, 12,
+ "dilation should be greater than zero, but got dilationH: %d dilationW: %d",
+ dilationH, dilationW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2,
+ "input image (H: %ld, W: %ld) smaller than kernel "
+ "size - padding (kH: %d padH: %d kW: %d padW: %d)",
+ input->size[dimh], input->size[dimw], kH, padH, kW, padW);
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ldx%ldx%ld). "
+ "Calculated output size: (%ldx%ldx%ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth);
+ }
+}
+
static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
- real *ind_p,
+ THIndex_t *ind_p,
long nslices,
long iwidth,
long iheight,
@@ -43,7 +113,7 @@ static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
/* local pointers */
real *op = output_p + k*owidth*oheight + i*owidth + j;
- real *indp = ind_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
/* compute local max: */
long maxindex = -1;
@@ -78,7 +148,7 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int kH,
int dW,
@@ -89,20 +159,22 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
int dilationH,
bool ceil_mode)
{
+
int dimw = 2;
int dimh = 1;
long nbatch = 1;
- long nslices;
- long iheight;
- long iwidth;
- long oheight;
- long owidth;
+ long nInputPlane;
+ long inputHeight;
+ long inputWidth;
+ long outputHeight;
+ long outputWidth;
real *input_data;
real *output_data;
- real *indices_data;
-
+ THIndex_t *indices_data;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, NULL, NULL, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
if (input->nDimension == 4)
{
@@ -110,35 +182,29 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
dimw++;
dimh++;
}
- THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
- THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
+
/* sizes */
- nslices = input->size[dimh-1];
- iheight = input->size[dimh];
- iwidth = input->size[dimw];
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
if (ceil_mode)
{
- oheight = (long)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
- owidth = (long)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
}
else
{
- oheight = (long)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
- owidth = (long)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
}
- if (owidth < 1 || oheight < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nslices,iheight,iwidth,nslices,oheight,owidth);
-
if (padW || padH)
{
// ensure that the last pooling starts inside the image
- if ((oheight - 1)*dH >= iheight + padH)
- --oheight;
- if ((owidth - 1)*dW >= iwidth + padW)
- --owidth;
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
}
/* get contiguous input */
@@ -147,48 +213,51 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
/* resize output */
if (input->nDimension == 3)
{
- THTensor_(resize3d)(output, nslices, oheight, owidth);
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
/* indices will contain the locations for each output point */
- THTensor_(resize3d)(indices, nslices, oheight, owidth);
+ THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
-
- THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(input_data, output_data,
- indices_data,
- nslices,
- iwidth, iheight,
- owidth, oheight,
- kW, kH, dW, dH,
- padW, padH,
- dilationW, dilationH
- );
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data, output_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
}
else
{
long p;
- THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth);
/* indices will contain the locations for each output point */
- THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth);
+ THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
- THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
- indices_data+p*nslices*owidth*oheight,
- nslices,
- iwidth, iheight,
- owidth, oheight,
- kW, kH, dW, dH,
- padW, padH,
- dilationW, dilationH
- );
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data+p*nInputPlane*inputWidth*inputHeight,
+ output_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
}
}
@@ -199,33 +268,33 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
- real *ind_p,
- long nslices,
- long iwidth,
- long iheight,
- long owidth,
- long oheight,
+ THIndex_t *ind_p,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long outputWidth,
+ long outputHeight,
int dW,
int dH)
{
long k;
#pragma omp parallel for private(k)
- for (k = 0; k < nslices; k++)
+ for (k = 0; k < nInputPlane; k++)
{
- real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
- real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
- real *ind_p_k = ind_p + k*owidth*oheight;
+ real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
+ real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
+ THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
/* calculate max points */
long i, j;
- for(i = 0; i < oheight; i++)
+ for(i = 0; i < outputHeight; i++)
{
- for(j = 0; j < owidth; j++)
+ for(j = 0; j < outputWidth; j++)
{
/* retrieve position of max */
- long maxp = ind_p_k[i*owidth + j] - TH_INDEX_BASE;
+ long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE;
/* update gradient */
- gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j];
+ gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
}
}
}
@@ -236,7 +305,7 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int kH,
int dW,
@@ -250,14 +319,18 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
int dimw = 2;
int dimh = 1;
long nbatch = 1;
- int nslices;
- int iheight;
- int iwidth;
- int oheight;
- int owidth;
+ int nInputPlane;
+ int inputHeight;
+ int inputWidth;
+ int outputHeight;
+ int outputWidth;
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, gradOutput, indices, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -273,26 +346,27 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
}
/* sizes */
- nslices = input->size[dimh-1];
- iheight = input->size[dimh];
- iwidth = input->size[dimw];
- oheight = gradOutput->size[dimh];
- owidth = gradOutput->size[dimw];
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ outputHeight = gradOutput->size[dimh];
+ outputWidth = gradOutput->size[dimw];
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
{
- THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
- indices_data,
- nslices,
- iwidth, iheight,
- owidth, oheight,
- dW, dH);
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data, gradOutput_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
}
else
{
@@ -300,12 +374,14 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
{
- THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
- indices_data+p*nslices*owidth*oheight,
- nslices,
- iwidth, iheight,
- owidth, oheight,
- dW, dH);
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data+p*nInputPlane*inputWidth*inputHeight,
+ gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
}
}
diff --git a/lib/THNN/generic/SpatialFractionalMaxPooling.c b/lib/THNN/generic/SpatialFractionalMaxPooling.c
index c0a9384..a98954c 100644
--- a/lib/THNN/generic/SpatialFractionalMaxPooling.c
+++ b/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -23,7 +23,7 @@ static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
real* input,
real* output,
- real* indices,
+ THIndex_t* indices,
real* randomSamples,
long numPlanes,
long inputW, long inputH,
@@ -48,7 +48,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
real* inputForPlane = input + plane * inputW * inputH;
real* outputForPlane = output + plane * outputW * outputH;
- real* indicesForPlane = indices + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
for (h = 0; h < outputH; ++h) {
long inputHStart = sequenceH[h];
@@ -79,7 +79,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
outputForPlane[h * outputW + w] = maxVal;
/* +1 to lua index */
- indicesForPlane[h * outputW + w] = (real) maxIndex + TH_INDEX_BASE;
+ indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE;
}
}
@@ -94,7 +94,7 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
THTensor *output,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
- THTensor *indices,
+ THIndexTensor *indices,
THTensor *randomSamples) {
long numBatch = 1;
@@ -103,8 +103,8 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
int widthDim = 2;
long numInputDims = THTensor_(nDimension)(input);
- THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
- "3D or 4D (batch mode) tensor expected");
+ THNN_ARGCHECK(numInputDims == 3 || numInputDims == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
if (numInputDims == 4) {
numBatch = THTensor_(size)(input, 0);
@@ -119,9 +119,11 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
long inputW = THTensor_(size)(input, widthDim);
THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
- "poolSizeH too large relative to input height");
+ "poolSizeH (%d) too large relative to input height (%ld)",
+ poolSizeH, inputH);
THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
- "poolSizeW too large relative to input width");
+ "poolSizeW (%d) too large relative to input width (%ld)",
+ poolSizeW, inputW);
/* get contiguous input */
input = THTensor_(newContiguous)(input);
@@ -130,18 +132,18 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
/* resize output */
THTensor_(resize3d)(output, numPlanes, outputH, outputW);
/* indices will contain the locations for each output point */
- THTensor_(resize3d)(indices, numPlanes, outputH, outputW);
+ THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW);
THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
THTensor_(data)(input),
THTensor_(data)(output),
- THTensor_(data)(indices),
+ THIndexTensor_(data)(indices),
THTensor_(data)(randomSamples),
numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
} else {
THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
/* indices will contain the locations for each output point */
- THTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
+ THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
long batch;
#pragma omp parallel for private(batch)
@@ -149,7 +151,7 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
- THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
THTensor_(data)(randomSamples) + batch * numPlanes * 2,
numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
}
@@ -162,7 +164,7 @@ void THNN_(SpatialFractionalMaxPooling_updateOutput)(
static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
real* gradInput,
real* gradOutput,
- real* indices,
+ THIndex_t* indices,
long numPlanes,
long inputW, long inputH,
long outputW, long outputH) {
@@ -171,7 +173,7 @@ static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
for (plane = 0; plane < numPlanes; plane++) {
real* gradInputForPlane = gradInput + plane * inputW * inputH;
real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
- real* indicesForPlane = indices + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
long h, w;
for (h = 0; h < outputH; ++h) {
@@ -193,7 +195,7 @@ void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THTensor *gradInput,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
- THTensor *indices) {
+ THIndexTensor *indices) {
long numBatch = 1;
int planeDim = 0;
@@ -230,7 +232,7 @@ void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
THTensor_(data)(gradInput),
THTensor_(data)(gradOutput),
- THTensor_(data)(indices),
+ THIndexTensor_(data)(indices),
numPlanes, inputW, inputH, outputW, outputH);
} else {
long batch;
@@ -239,7 +241,7 @@ void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
- THTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
numPlanes, inputW, inputH, outputW, outputH);
}
}
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
index a82477d..94a7fc1 100644
--- a/lib/THNN/generic/SpatialFullConvolution.c
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -57,6 +57,57 @@ static void THNN_(col2im)(const real* data_col, const int channels,
}
}
+static inline void THNN_(SpatialFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[0];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[1];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
void THNN_(SpatialFullConvolution_updateOutput)(
THNNState *state,
THTensor *input,
@@ -70,25 +121,23 @@ void THNN_(SpatialFullConvolution_updateOutput)(
int padW, int padH,
int adjW, int adjH)
{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
int nInputPlane = THTensor_(size)(weight,0);
int nOutputPlane = THTensor_(size)(weight,1);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
int batch = 1;
if (input->nDimension == 3) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
// Force batch
batch = 0;
THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
}
- long inputWidth = input->size[3];
long inputHeight = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long inputWidth = input->size[3];
long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
// Batch size + input planes
long batchSize = input->size[0];
@@ -189,11 +238,12 @@ void THNN_(SpatialFullConvolution_updateGradInput)(
int padW, int padH,
int adjW, int adjH)
{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
int nInputPlane = THTensor_(size)(weight,0);
int nOutputPlane = THTensor_(size)(weight,1);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
int batch = 1;
if (input->nDimension == 3) {
// Force batch
@@ -283,11 +333,12 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
int adjW, int adjH,
real scale)
{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
int nInputPlane = THTensor_(size)(gradWeight,0);
int nOutputPlane = THTensor_(size)(gradWeight,1);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
int batch = 1;
if (input->nDimension == 3) {
// Force batch
diff --git a/lib/THNN/generic/SpatialMaxPooling.c b/lib/THNN/generic/SpatialMaxPooling.c
index e0fafb1..88aaa40 100644
--- a/lib/THNN/generic/SpatialMaxPooling.c
+++ b/lib/THNN/generic/SpatialMaxPooling.c
@@ -6,7 +6,7 @@ void THNN_(SpatialMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int kH,
int dW,
@@ -26,7 +26,7 @@ void THNN_(SpatialMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int kH,
int dW,
diff --git a/lib/THNN/generic/SpatialMaxUnpooling.c b/lib/THNN/generic/SpatialMaxUnpooling.c
index cd1739b..1b7b517 100644
--- a/lib/THNN/generic/SpatialMaxUnpooling.c
+++ b/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -3,18 +3,20 @@
#else
static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
- real *ind_p,
+ THIndex_t *ind_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight)
{
long k;
+ int has_error = 0;
+ long error_index;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
real *output_p_k = output_p + k*owidth*oheight;
real *input_p_k = input_p + k*iwidth*iheight;
- real *ind_p_k = ind_p + k*iwidth*iheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
long i, j, maxp;
for(i = 0; i < iheight; i++)
@@ -23,19 +25,28 @@ static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *o
{
maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
if(maxp<0 || maxp>=owidth*oheight){
- THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = maxp;
+ }
+ } else {
+ output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
}
- output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
}
}
}
+ if (has_error) {
+ THError("found an invalid max index %ld (output volumes are of size %ldx%ld)",
+ error_index, oheight, owidth);
+ }
}
void THNN_(SpatialMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth, int oheight)
{
int dimw = 2;
@@ -46,13 +57,12 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
int iwidth;
real *input_data;
real *output_data;
- real *indices_data;
+ THIndex_t *indices_data;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
- if (!THTensor_(isSameSizeAs)(input, indices)){
- THError("Invalid input size w.r.t current indices size");
- }
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+ THNN_CHECK_SHAPE_INDICES(input, indices);
if (input->nDimension == 4)
{
@@ -68,7 +78,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
/* get contiguous input and indices */
input = THTensor_(newContiguous)(input);
- indices = THTensor_(newContiguous)(indices);
+ indices = THIndexTensor_(newContiguous)(indices);
/* resize output */
if (input->nDimension == 3)
@@ -78,7 +88,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
indices_data,
@@ -95,7 +105,7 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
@@ -110,11 +120,11 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
/* cleanup */
THTensor_(free)(input);
- THTensor_(free)(indices);
+ THIndexTensor_(free)(indices);
}
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
- real *ind_p,
+ THIndex_t *ind_p,
long nslices,
long iwidth, long iheight,
long owidth, long oheight)
@@ -125,7 +135,7 @@ static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p,
{
real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
- real *ind_p_k = ind_p + k*iwidth*iheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
long i, j, maxp;
for(i = 0; i < iheight; i++)
@@ -147,7 +157,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth, int oheight)
{
int dimw = 2;
@@ -158,15 +168,13 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
int iwidth;
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
- if (!THTensor_(isSameSizeAs)(input, indices)){
- THError("Invalid input size w.r.t current indices size");
- }
+ THNN_CHECK_SHAPE_INDICES(input, indices);
/* get contiguous gradOutput and indices */
gradOutput = THTensor_(newContiguous)(gradOutput);
- indices = THTensor_(newContiguous)(indices);
+ indices = THIndexTensor_(newContiguous)(indices);
/* resize */
THTensor_(resizeAs)(gradInput, input);
@@ -184,13 +192,14 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
iwidth = input->size[dimw];
if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
- THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
+ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
+ oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
}
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
/* backprop */
if (input->nDimension == 3)
@@ -217,7 +226,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(
/* cleanup */
THTensor_(free)(gradOutput);
- THTensor_(free)(indices);
+ THIndexTensor_(free)(indices);
}
#endif
diff --git a/lib/THNN/generic/SpatialReflectionPadding.c b/lib/THNN/generic/SpatialReflectionPadding.c
index 08e0ba0..dcde660 100644
--- a/lib/THNN/generic/SpatialReflectionPadding.c
+++ b/lib/THNN/generic/SpatialReflectionPadding.c
@@ -67,8 +67,8 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
real *input_data;
real *output_data;
- THArgCheck(input->nDimension == 3 ||
- input->nDimension == 4 , 2, "input must be 3 or 4-dimensional");
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
if (input->nDimension == 4)
{
@@ -85,7 +85,10 @@ void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
- THArgCheck(owidth >= 1 || oheight >= 1 , 2, "input is too small");
+ THArgCheck(owidth >= 1 || oheight >= 1 , 2,
+ "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
/* get contiguous input */
input = THTensor_(newContiguous)(input);
@@ -212,9 +215,11 @@ void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
- "gradOutput width unexpected");
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
- "gradOutput height unexpected");
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
diff --git a/lib/THNN/generic/SpatialReplicationPadding.c b/lib/THNN/generic/SpatialReplicationPadding.c
index cdd6fc5..4e318aa 100644
--- a/lib/THNN/generic/SpatialReplicationPadding.c
+++ b/lib/THNN/generic/SpatialReplicationPadding.c
@@ -66,8 +66,8 @@ void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
real *input_data;
real *output_data;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4,
- 2, "input must be 3 or 4-dimensional");
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
if (input->nDimension == 4)
{
@@ -84,7 +84,11 @@ void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
oheight = iheight + pad_t + pad_b;
owidth = iwidth + pad_l + pad_r;
- THArgCheck(owidth >= 1 || oheight >= 1 , 2, "input is too small");
+ THArgCheck(owidth >= 1 || oheight >= 1 , 2,
+ "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
+
/* get contiguous input */
input = THTensor_(newContiguous)(input);
@@ -210,9 +214,11 @@ void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
owidth = iwidth + pad_l + pad_r;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
- "gradOutput width unexpected");
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
- "gradOutput height unexpected");
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
diff --git a/lib/THNN/generic/SpatialSubSampling.c b/lib/THNN/generic/SpatialSubSampling.c
index abfbfce..3674f2c 100644
--- a/lib/THNN/generic/SpatialSubSampling.c
+++ b/lib/THNN/generic/SpatialSubSampling.c
@@ -2,6 +2,35 @@
#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
#else
+static inline void THNN_(SpatialSubSampling_shapeCheck)(
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ int kW, int kH) {
+ int ndims = input->nDimension;
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ int nInputPlane = THTensor_(size)(weight, 0);
+
+ int dimw = 2;
+ int dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+
+ THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
+}
+
void THNN_(SpatialSubSampling_updateOutput)(
THNNState *state,
THTensor *input,
@@ -30,7 +59,7 @@ void THNN_(SpatialSubSampling_updateOutput)(
long k;
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
+ THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH);
if (input->nDimension == 4) {
nbatch = input->size[0];
@@ -43,9 +72,6 @@ void THNN_(SpatialSubSampling_updateOutput)(
outputWidth = (inputWidth - kW) / dW + 1;
outputHeight = (inputHeight - kH) / dH + 1;
- THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
- THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
-
if (input->nDimension == 3)
THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
else
@@ -105,7 +131,8 @@ void THNN_(SpatialSubSampling_updateGradInput)(
int kW, int kH,
int dW, int dH)
{
-
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH);
+
int dimw = 2;
int dimh = 1;
long nbatch = 1;
@@ -135,13 +162,13 @@ void THNN_(SpatialSubSampling_updateGradInput)(
outputHeight = (inputHeight - kH) / dH + 1;
weight_data = THTensor_(data)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
gradOutput_data = THTensor_(data)(gradOutput);
input_data = THTensor_(data)(input);
THTensor_(resizeAs)(gradInput, input);
gradInput_data = THTensor_(data)(gradInput);
- gradOutput_data = THTensor_(data)(gradOutput);
#pragma omp parallel for private(k)
for(k = 0; k < nInputPlane; k++)
@@ -176,6 +203,7 @@ void THNN_(SpatialSubSampling_updateGradInput)(
}
}
}
+ THTensor_(free)(gradOutput);
}
void THNN_(SpatialSubSampling_accGradParameters)(
@@ -188,6 +216,8 @@ void THNN_(SpatialSubSampling_accGradParameters)(
int dW, int dH,
real scale)
{
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);
+
long nbatch = 1;
long dimw = 2;
long dimh = 1;
@@ -219,6 +249,7 @@ void THNN_(SpatialSubSampling_accGradParameters)(
gradWeight_data = THTensor_(data)(gradWeight);
gradBias_data = THTensor_(data)(gradBias);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
gradOutput_data = THTensor_(data)(gradOutput);
input = THTensor_(newContiguous)(input);
@@ -262,6 +293,7 @@ void THNN_(SpatialSubSampling_accGradParameters)(
}
THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
}
#endif
diff --git a/lib/THNN/generic/SpatialUpSamplingBilinear.c b/lib/THNN/generic/SpatialUpSamplingBilinear.c
index 78290b6..7c4ea31 100644
--- a/lib/THNN/generic/SpatialUpSamplingBilinear.c
+++ b/lib/THNN/generic/SpatialUpSamplingBilinear.c
@@ -5,123 +5,170 @@
#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
#else
+static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputHeight, int inputWidth,
+ int outputHeight, int outputWidth) {
+ THArgCheck(inputHeight > 0 && inputWidth > 0
+ && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (H: %d, W: %d) output (H: %d, W: %d)",
+ inputHeight, inputWidth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 4, 2, input,
+ "4D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+}
+
void THNN_(SpatialUpSamplingBilinear_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *output){
+ THTensor *output,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
input = THTensor_(newContiguous)(input);
- output = THTensor_(newContiguous)(output);
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
THTensor_(zero)(output);
real *idata = THTensor_(data)(input);
real *odata = THTensor_(data)(output);
- int channels = THTensor_(size)(input, 0) * THTensor_(size)(input, 1);
- int height1 = THTensor_(size)(input, 2);
- int width1 = THTensor_(size)(input, 3);
- int height2 = THTensor_(size)(output, 2);
- int width2 = THTensor_(size)(output, 3);
- THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+ channels = nbatch * channels;
+ THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
// special case: just copy
- if (height1 == height2 && width1 == width2) {
- for (int h2 = 0; h2 < height2; ++h2) {
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
const int h1 = h2;
- for (int w2 = 0; w2 < width2; ++w2) {
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
const int w1 = w2;
- const real* pos1 = &idata[h1 * width1 + w1];
- real* pos2 = &odata[h2 * width2 + w2];
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
for (int c = 0; c < channels; ++c) {
pos2[0] = pos1[0];
- pos1 += width1 * height1;
- pos2 += width2 * height2;
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
}
}
}
return;
}
- const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
- const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
- for (int h2 = 0; h2 < height2; ++h2) {
+ const float rheight =(outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
const float h1r = rheight * h2;
const int h1 = h1r;
- const int h1p = (h1 < height1 - 1) ? 1 : 0;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
const real h1lambda = h1r - h1;
const real h0lambda = (real)1. - h1lambda;
- for (int w2 = 0; w2 < width2; ++w2) {
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
const float w1r = rwidth * w2;
const int w1 = w1r;
- const int w1p = (w1 < width1 - 1) ? 1 : 0;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
const real w1lambda = w1r - w1;
const real w0lambda = (real)1. - w1lambda;
- const real* pos1 = &idata[h1 * width1 + w1];
- real* pos2 = &odata[h2 * width2 + w2];
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
for (int c = 0; c < channels; ++c) {
pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
- + h1lambda * (w0lambda * pos1[h1p * width1]
- + w1lambda * pos1[h1p * width1 + w1p]);
- pos1 += width1 * height1;
- pos2 += width2 * height2;
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]);
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
}
}
}
+ THTensor_(free)(input);
}
void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
THNNState *state,
THTensor *gradOutput,
- THTensor *gradInput){
- gradInput = THTensor_(newContiguous)(gradInput);
- gradOutput = THTensor_(newContiguous)(gradOutput);
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth);
THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
real *data1 = THTensor_(data)(gradInput);
real *data2 = THTensor_(data)(gradOutput);
- int channels = THTensor_(size)(gradInput, 0) * THTensor_(size)(gradInput, 1);
- int height1 = THTensor_(size)(gradInput, 2);
- int width1 = THTensor_(size)(gradInput, 3);
- int height2 = THTensor_(size)(gradOutput, 2);
- int width2 = THTensor_(size)(gradOutput, 3);
- THAssert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+ channels = nbatch * channels;
+
// special case: same-size matching grids
- if (height1 == height2 && width1 == width2) {
- for (int h2 = 0; h2 < height2; ++h2) {
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
const int h1 = h2;
- for (int w2 = 0; w2 < width2; ++w2) {
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
const int w1 = w2;
- real* pos1 = &data1[h1 * width1 + w1];
- const real* pos2 = &data2[h2 * width2 + w2];
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
for (int c = 0; c < channels; ++c) {
pos1[0] += pos2[0];
- pos1 += width1 * height1;
- pos2 += width2 * height2;
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
}
}
}
return;
}
- const float rheight =(height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
- const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f;
- for (int h2 = 0; h2 < height2; ++h2) {
+ const float rheight =(outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
const float h1r = rheight * h2;
const int h1 = h1r;
- const int h1p = (h1 < height1 - 1) ? 1 : 0;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
const real h1lambda = h1r - h1;
const real h0lambda = (real)1. - h1lambda;
- for (int w2 = 0; w2 < width2; ++w2) {
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
const float w1r = rwidth * w2;
const int w1 = w1r;
- const int w1p = (w1 < width1 - 1) ? 1 : 0;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
const real w1lambda = w1r - w1;
const real w0lambda = (real)1. - w1lambda;
- real* pos1 = &data1[h1 * width1 + w1];
- const real* pos2 = &data2[h2 * width2 + w2];
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
for (int c = 0; c < channels; ++c) {
pos1[0] += h0lambda * w0lambda * pos2[0];
pos1[w1p] += h0lambda * w1lambda * pos2[0];
- pos1[h1p * width1] += h1lambda * w0lambda * pos2[0];
- pos1[h1p * width1 + w1p] += h1lambda * w1lambda * pos2[0];
- pos1 += width1 * height1;
- pos2 += width2 * height2;
+ pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
}
}
}
+ THTensor_(free)(gradOutput);
}
#endif
diff --git a/lib/THNN/generic/SpatialUpSamplingNearest.c b/lib/THNN/generic/SpatialUpSamplingNearest.c
index b67c68d..2135aa2 100644
--- a/lib/THNN/generic/SpatialUpSamplingNearest.c
+++ b/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -2,23 +2,76 @@
#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
#else
+
+static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+ THArgCheck(input != NULL, 2, "4D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ if (input->nDimension == 3) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputHeight = THTensor_(size)(input, 1);
+ int inputWidth = THTensor_(size)(input, 2);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ }
+}
+
void THNN_(SpatialUpSamplingNearest_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
int scale_factor)
{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 3) {
+ THTensor_(resize3d)(output,
+ THTensor_(size)(input, 0),
+ outputHeight, outputWidth);
+ } else {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ }
+
int dW = scale_factor;
int dH = scale_factor;
int xDim = input->nDimension-2;
int yDim = input->nDimension-1;
// dims
- int idim = input->nDimension; // Guaranteed to be between 3 and 5
+ int idim = input->nDimension;
int osz0 = output->size[0];
int osz1 = output->size[1];
int osz2 = output->size[2];
- int osz3 = 1;
+ int osz3 = 1;
if (idim > 3) {
osz3 = output->size[3];
}
@@ -74,6 +127,9 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
THTensor *gradInput,
int scale_factor)
{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
int dW = scale_factor;
int dH = scale_factor;
int xDim = gradInput->nDimension-2;
diff --git a/lib/THNN/generic/Sqrt.c b/lib/THNN/generic/Sqrt.c
index 826ed1d..24cd51a 100644
--- a/lib/THNN/generic/Sqrt.c
+++ b/lib/THNN/generic/Sqrt.c
@@ -19,6 +19,7 @@ void THNN_(Sqrt_updateGradInput)(
THTensor *gradInput,
THTensor *output)
{
+ THNN_CHECK_SHAPE(output, gradOutput);
THTensor_(resizeAs)(gradInput, input);
if (output->nDimension == 1 ||
diff --git a/lib/THNN/generic/Square.c b/lib/THNN/generic/Square.c
index a26c001..306ea77 100644
--- a/lib/THNN/generic/Square.c
+++ b/lib/THNN/generic/Square.c
@@ -32,6 +32,7 @@ void THNN_(Square_updateGradInput)(
THTensor *gradOutput,
THTensor *gradInput)
{
+ THNN_CHECK_SHAPE(input, gradOutput);
THTensor_(resizeAs)(gradInput, input);
if (input->nDimension == 1 ||
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 0f2149a..450998a 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -31,14 +31,14 @@ TH_API void THNN_(BCECriterion_updateOutput)(
THTensor *target,
THTensor *output,
bool sizeAverage,
- THTensor *weights);
+ THTensor *weights); // [OPTIONAL]
TH_API void THNN_(BCECriterion_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *target,
THTensor *gradInput,
bool sizeAverage,
- THTensor *weights);
+ THTensor *weights); // [OPTIONAL]
TH_API void THNN_(ClassNLLCriterion_updateOutput)(
THNNState *state, // library's state
@@ -186,7 +186,7 @@ TH_API void THNN_(LookupTable_accGradParameters)(
THTensor *gradWeight,
THIntegerTensor *count,
THTensor *sorted, // [OPTIONAL]
- THTensor *indices, // [OPTIONAL]
+ THIndexTensor *indices, // [OPTIONAL]
bool scaleGradByFreq,
int paddingValue,
real scale);
@@ -243,14 +243,14 @@ TH_API void THNN_(MSECriterion_updateGradInput)(
TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *output,
THTensor *isTarget,
bool sizeAverage);
TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *gradInput,
THTensor *isTarget,
bool sizeAverage);
@@ -258,7 +258,7 @@ TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
TH_API void THNN_(MultiMarginCriterion_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *output,
bool sizeAverage,
int p,
@@ -267,7 +267,7 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)(
TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
THNNState *state,
THTensor *input,
- THTensor *target,
+ THIndexTensor *target,
THTensor *gradInput,
bool sizeAverage,
int p,
@@ -299,6 +299,31 @@ TH_API void THNN_(PReLU_accGradParameters)(
THIndex_t nOutputPlane,
real scale);
+TH_API void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer);
+TH_API void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight);
+TH_API void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ real scale);
+
TH_API void THNN_(RReLU_updateOutput)(
THNNState *state,
THTensor *input,
@@ -518,14 +543,14 @@ TH_API void THNN_(TemporalMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int dW);
TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int dW);
TH_API void THNN_(TemporalSubSampling_updateOutput)(
THNNState *state,
@@ -693,14 +718,14 @@ TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth, int oheight);
TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices);
+ THIndexTensor *indices);
TH_API void THNN_(SpatialAveragePooling_updateOutput)(
THNNState *state,
@@ -728,7 +753,7 @@ TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
THTensor *output,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
- THTensor *indices,
+ THIndexTensor *indices,
THTensor *randomSamples);
TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THNNState *state,
@@ -737,7 +762,7 @@ TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
THTensor *gradInput,
int outputW, int outputH,
int poolSizeW, int poolSizeH,
- THTensor *indices);
+ THIndexTensor *indices);
TH_API void THNN_(SpatialFullConvolution_updateOutput)(
THNNState *state,
@@ -852,7 +877,7 @@ TH_API void THNN_(SpatialMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
@@ -862,7 +887,7 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
@@ -872,7 +897,7 @@ TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
@@ -883,7 +908,7 @@ TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW, int kH,
int dW, int dH,
int padW, int padH,
@@ -894,14 +919,14 @@ TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth, int oheight);
TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
THNNState *state,
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int owidth, int oheight);
TH_API void THNN_(SpatialSubSampling_updateOutput)(
@@ -945,11 +970,19 @@ TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
THNNState *state,
THTensor *input,
- THTensor *output);
+ THTensor *output,
+ int outputHeight,
+ int outputWidth);
TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
THNNState *state,
THTensor *gradOutput,
- THTensor *gradInput);
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth);
TH_API void THNN_(unfolded_acc)(
THTensor *finput,
@@ -1123,7 +1156,7 @@ TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kT, int kW, int kH,
int dT, int dW, int dH,
int pT, int pW, int pH,
@@ -1133,7 +1166,7 @@ TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int dT, int dW, int dH,
int pT, int pW, int pH);
@@ -1141,7 +1174,7 @@ TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kT, int kW, int kH,
int dT, int dW, int dH,
int pT, int pW, int pH,
@@ -1152,7 +1185,7 @@ TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int dT, int dW, int dH,
int pT, int pW, int pH,
int dilationT, int dilationW, int dilationH);
@@ -1161,7 +1194,7 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int oT, int oW, int oH,
int dT, int dW, int dH,
int pT, int pW, int pH);
@@ -1170,7 +1203,7 @@ TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int oT, int oW, int oH,
int dT, int dW, int dH,
int pT, int pW, int pH);
diff --git a/lib/THNN/generic/Tanh.c b/lib/THNN/generic/Tanh.c
index d6da1e4..69a24b8 100644
--- a/lib/THNN/generic/Tanh.c
+++ b/lib/THNN/generic/Tanh.c
@@ -18,6 +18,7 @@ void THNN_(Tanh_updateGradInput)(
THTensor *gradInput,
THTensor *output)
{
+ THNN_CHECK_SHAPE(output, gradOutput);
THTensor_(resizeAs)(gradInput, output);
if (output->nDimension == 1 ||
diff --git a/lib/THNN/generic/TemporalConvolution.c b/lib/THNN/generic/TemporalConvolution.c
index a29a353..0e8e83a 100644
--- a/lib/THNN/generic/TemporalConvolution.c
+++ b/lib/THNN/generic/TemporalConvolution.c
@@ -20,15 +20,20 @@ void THNN_(TemporalConvolution_updateOutput)(
int dimS = 0; // sequence dimension
int dimF = 1; // feature dimension
- THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
if (input->nDimension == 3)
{
dimS = 1;
dimF = 2;
}
- THArgCheck(input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
- THArgCheck(input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
+ THArgCheck(input->size[dimF] == inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[dimF], inputFrameSize);
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
input = THTensor_(newContiguous)(input);
outputWindow = THTensor_(new)();
@@ -158,6 +163,9 @@ void THNN_(TemporalConvolution_updateGradInput)(
nInputFrame = input->size[dimS];
nOutputFrame = gradOutput->size[dimS];
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
gradOutputWindow = THTensor_(new)();
gradInputWindow = THTensor_(new)();
@@ -226,6 +234,8 @@ void THNN_(TemporalConvolution_updateGradInput)(
THTensor_(free)(gradOutputWindow);
THTensor_(free)(gradInputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
}
@@ -259,6 +269,7 @@ void THNN_(TemporalConvolution_accGradParameters)(
nOutputFrame = gradOutput->size[dimS];
input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
gradOutputWindow = THTensor_(new)();
inputWindow = THTensor_(new)();
@@ -342,6 +353,7 @@ void THNN_(TemporalConvolution_accGradParameters)(
THTensor_(free)(gradOutputWindow);
THTensor_(free)(inputWindow);
+ THTensor_(free)(gradOutput);
THTensor_(free)(input);
}
diff --git a/lib/THNN/generic/TemporalMaxPooling.c b/lib/THNN/generic/TemporalMaxPooling.c
index 48cbcab..0a2f004 100644
--- a/lib/THNN/generic/TemporalMaxPooling.c
+++ b/lib/THNN/generic/TemporalMaxPooling.c
@@ -6,7 +6,7 @@ void THNN_(TemporalMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int dW)
{
@@ -16,7 +16,7 @@ void THNN_(TemporalMaxPooling_updateOutput)(
real *input_data;
real *output_data;
- real *indices_data;
+ THIndex_t *indices_data;
long t, y;
@@ -46,18 +46,18 @@ void THNN_(TemporalMaxPooling_updateOutput)(
THTensor_(resize2d)(output, noframe, framesize);
/* indices will contain index locations for each output point */
- THTensor_(resize2d)(indices, noframe, framesize);
+ THIndexTensor_(resize2d)(indices, noframe, framesize);
/* get raw pointers */
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
for(t = 0; t < noframe; t++)
{
real *ip = input_data + t*framesize*dW;
real *op = output_data + t*framesize;
- real *xp = indices_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
@@ -91,24 +91,24 @@ void THNN_(TemporalMaxPooling_updateOutput)(
THTensor_(resize3d)(output, nbframe, noframe, framesize);
/* indices will contain index locations for each output point */
- THTensor_(resize3d)(indices, nbframe, noframe, framesize);
+ THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
/* get raw pointers */
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
for(i = 0; i < nbframe; i++)
{
real *inputSample_data = input_data + i*niframe*framesize;
real *outputSample_data = output_data + i*noframe*framesize;
- real *indicesSample_data = indices_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
for(t = 0; t < noframe; t++)
{
real *ip = inputSample_data + t*framesize*dW;
real *op = outputSample_data + t*framesize;
- real *xp = indicesSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
@@ -145,7 +145,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int kW,
int dW)
{
@@ -155,7 +155,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
long t, y;
@@ -182,7 +182,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
if (input->nDimension == 2)
{
@@ -190,7 +190,7 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
{
real *gip = gradInput_data + t*framesize*dW;
real *gop = gradOutput_data + t*framesize;
- real *xp = indices_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
@@ -210,13 +210,13 @@ void THNN_(TemporalMaxPooling_updateGradInput)(
{
real *gradInputSample_data = gradInput_data + i*niframe*framesize;
real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
- real *indicesSample_data = indices_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
for(t = 0; t < noframe; t++)
{
real *gip = gradInputSample_data + t*framesize*dW;
real *gop = gradOutputSample_data + t*framesize;
- real *xp = indicesSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
#pragma omp parallel for private(y)
for(y = 0; y < framesize; y++)
{
diff --git a/lib/THNN/generic/Threshold.c b/lib/THNN/generic/Threshold.c
index 54310a0..dd2a698 100644
--- a/lib/THNN/generic/Threshold.c
+++ b/lib/THNN/generic/Threshold.c
@@ -36,6 +36,7 @@ void THNN_(Threshold_updateGradInput)(
real val,
bool inplace)
{
+ THNN_CHECK_NELEMENT(input, gradOutput);
if (inplace)
{
TH_TENSOR_APPLY2(real, gradOutput, real, input,
diff --git a/lib/THNN/generic/VolumetricAveragePooling.c b/lib/THNN/generic/VolumetricAveragePooling.c
index 49b311e..a317cbb 100644
--- a/lib/THNN/generic/VolumetricAveragePooling.c
+++ b/lib/THNN/generic/VolumetricAveragePooling.c
@@ -81,9 +81,8 @@ void THNN_(VolumetricAveragePooling_updateOutput)(
real *input_data;
real *output_data;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch-mode) tensor expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int dimN = 0;
int dimt = 1;
@@ -98,9 +97,12 @@ void THNN_(VolumetricAveragePooling_updateOutput)(
dimw++;
}
- THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
- "input image smaller than kernel size"
- );
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+ && input->size[dimt] >= kT, 2,
+ "input image (T: %d H: %d W: %d) smaller than "
+ "kernel size (kT: %d kH: %d kW: %d)",
+ input->size[dimt], input->size[dimh], input->size[dimw],
+ kT, kH, kW);
/* sizes */
nslices = input->size[dimN];
@@ -242,6 +244,7 @@ void THNN_(VolumetricAveragePooling_updateGradInput)(
int dimh = 2;
int dimw = 3;
+ // TODO: gradOutput shape check
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
diff --git a/lib/THNN/generic/VolumetricConvolution.c b/lib/THNN/generic/VolumetricConvolution.c
index 852dd54..4fd8ac3 100644
--- a/lib/THNN/generic/VolumetricConvolution.c
+++ b/lib/THNN/generic/VolumetricConvolution.c
@@ -19,9 +19,8 @@ void THNN_(VolumetricConvolution_updateOutput)(
{
THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch-mode) tensor expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int dimt = 1;
int dimh = 2;
@@ -106,15 +105,15 @@ void THNN_(VolumetricConvolution_updateGradInput)(
{
THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
- THArgCheck(weight->nDimension == 5, 4,
- "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
- );
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
int nOutputPlane = (int)weight->size[0];
- THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
- "4D or 5D (batch-mode) tensor expected"
- );
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
int dimPlane = 0;
if (gradOutput->nDimension == 5)
@@ -175,9 +174,9 @@ void THNN_(VolumetricConvolution_accGradParameters)(
{
THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
- THArgCheck(gradWeight->nDimension == 5, 4,
- "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
- );
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
int nOutputPlane = (int)gradWeight->size[0];
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
index 8fef1cf..4b00c47 100644
--- a/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -2,6 +2,20 @@
#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
#else
+static int THNN_(view_weight)(THTensor **_weight)
+{
+ THTensor *weight = *_weight;
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
+ "weight tensor should be 2D or 5D - got %dD", weight->nDimension);
+ if (weight->nDimension == 5) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ *_weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ return 1;
+ }
+ return 0;
+}
+
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
static void THNN_(unfolded_acc_vol)(
THTensor *finput,
@@ -243,6 +257,7 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
int dimt = 1;
int dimh = 2;
int dimw = 3;
+ int freeWeight = 0;
long nInputPlane;
long inputDepth;
@@ -253,9 +268,9 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
long outputHeight;
long outputWidth;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D(batch mode) tensor expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ input = THTensor_(newContiguous)(input);
if (input->nDimension == 5)
{
@@ -283,6 +298,8 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
);
}
+ freeWeight = THNN_(view_weight)(&weight);
+
if (input->nDimension == 4)
{
THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
@@ -326,6 +343,10 @@ void THNN_(VolumetricConvolutionMM_updateOutput)(
THTensor_(free)(finput_t);
}
}
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
}
static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
@@ -382,23 +403,22 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
int pW,
int pH)
{
- // number of input/output planes and kernel size is indirectly defined by the weight tensor
- THArgCheck(weight->nDimension == 2, 4,
- "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
- );
-
int nOutputPlane = (int)weight->size[0];
THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
"Number of output features is not equal to nOutputPlane"
);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int freeWeight = THNN_(view_weight)(&weight);
THTensor_(resizeAs)(gradInput, input);
THTensor_(resizeAs)(fgradInput, finput);
// depending on the BLAS library, fgradInput (result tensor) might
// be left uninitialized on zero alpha, which might lead to weird behavior
// hence, to be safe, zero it
- THTensor_(zero)(fgradInput);
+ THTensor_(zero)(fgradInput);
THTensor_(transpose)(weight, weight, 0, 1);
if (input->nDimension == 4)
@@ -436,6 +456,11 @@ void THNN_(VolumetricConvolutionMM_updateGradInput)(
}
THTensor_(transpose)(weight, weight, 0, 1);
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(weight);
}
static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
@@ -479,10 +504,7 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)(
THTensor *finput,
real scale)
{
- THArgCheck(gradWeight->nDimension == 2, 4,
- "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
- );
-
+ int freeWeight;
int nOutputPlane = (int)gradWeight->size[0];
THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
@@ -492,6 +514,10 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)(
THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
"Number of output features is not equal to nOutputPlane"
);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ freeWeight = THNN_(view_weight)(&gradWeight);
if (input->nDimension == 4) // non-batch mode
{
@@ -513,6 +539,11 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)(
THTensor_(free)(finput_t);
}
}
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(gradWeight);
}
#endif
diff --git a/lib/THNN/generic/VolumetricDilatedConvolution.c b/lib/THNN/generic/VolumetricDilatedConvolution.c
index 1a9cc93..e889f5a 100644
--- a/lib/THNN/generic/VolumetricDilatedConvolution.c
+++ b/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -15,11 +15,15 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH)
{
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
- THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 17, "dilation should be greater than zero");
// Params:
int nInputPlane = weight->size[1];
@@ -146,9 +150,14 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
int padT, int padW, int padH,
int dilationT, int dilationW, int dilationH)
{
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
@@ -246,9 +255,14 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)(
int dilationT, int dilationW, int dilationH,
real scale)
{
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
diff --git a/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
index 0db41ae..629c05a 100644
--- a/lib/THNN/generic/VolumetricDilatedMaxPooling.c
+++ b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -5,7 +5,7 @@
static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
real *input_p,
real *output_p,
- real *indz_p,
+ THIndex_t *indz_p,
long nslices,
long itime,
long iwidth,
@@ -43,7 +43,7 @@ static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
long start_t = ti * dT - pT;
long start_h = i * dH - pH;
long start_w = j * dW - pW;
-
+
long kernel_t = fminf(kT, kT + start_t);
long kernel_h = fminf(kH, kH + start_h);
long kernel_w = fminf(kW, kW + start_w);
@@ -54,12 +54,12 @@ static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
start_h += dilationH;
while(start_w < 0)
start_w += dilationW;
-
+
real *ip = input_p + k * itime * iwidth * iheight
+ start_t * iwidth * iheight + start_h * iwidth + start_w;
real *op = output_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
- real *indzp = indz_p + k * otime * owidth * oheight
+ THIndex_t *indzp = indz_p + k * otime * owidth * oheight
+ ti * owidth * oheight + i * owidth + j;
/* compute local max: */
@@ -107,7 +107,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kT,
int kW,
int kH,
@@ -131,11 +131,10 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
long owidth;
real *input_data;
real *output_data;
- real *indices_data;
+ THIndex_t *indices_data;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch-mode) tensor expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int dimN = 0;
int dimt = 1;
@@ -150,14 +149,20 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
dimw++;
}
- THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
- "input image smaller than kernel size"
- );
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+ && input->size[dimt] >= kT, 2,
+ "input image (T: %d H: %d W: %d) smaller than "
+ "kernel size (kT: %d kH: %d kW: %d)",
+ input->size[dimt], input->size[dimh], input->size[dimw],
+ kT, kH, kW);
THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
"pad should be smaller than half of kernel size"
);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+ "dilation should be greater than 0");
+
/* sizes */
nslices = input->size[dimN];
itime = input->size[dimt];
@@ -199,11 +204,11 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
/* resize output */
THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
/* indices will contain ti,i,j uchar locations packed into float/double */
- THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+ THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
input_data, output_data,
@@ -228,11 +233,11 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
/* resize output */
THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
/* indices will contain ti,i,j locations for each output point */
- THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+ THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p=0; p < nBatch; p++)
@@ -259,7 +264,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
- real *indz_p,
+ THIndex_t *indz_p,
long nslices,
long itime,
long iwidth,
@@ -283,7 +288,7 @@ static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
{
real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
- real *indz_p_k = indz_p + k * otime * owidth * oheight;
+ THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight;
/* calculate max points */
long ti, i, j;
@@ -294,7 +299,7 @@ static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
for (j = 0; j < owidth; j++)
{
/* retrieve position of max */
- real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+ THIndex_t * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
long maxi = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
long maxj = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
@@ -313,7 +318,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int dT,
int dW,
int dH,
@@ -333,13 +338,14 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
int owidth;
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
int dimN = 0;
int dimt = 1;
int dimh = 2;
int dimw = 3;
+ // TODO: gradOutput shape check
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
@@ -367,7 +373,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
/* backprop */
if (input->nDimension == 4) /* non-batch mode*/
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
index 4eb36c4..8df9a74 100644
--- a/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -101,9 +101,9 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
THTensor *ones = fgradInput;
// number of input & output planes and kernel size is indirectly defined by the weight tensor
- THArgCheck(weight->nDimension == 5, 4,
- "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
- );
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
const int nInputPlane = (int)weight->size[0];
const int nOutputPlane = (int)weight->size[1];
@@ -111,9 +111,8 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch mode) tensor is expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int batch = 1;
if (input->nDimension == 4)
@@ -241,9 +240,9 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
THTensor *gradColumns = finput;
// number of input & output planes and kernel size is indirectly defined by the weight tensor
- THArgCheck(weight->nDimension == 5, 4,
- "5D weight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
- );
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
const int nInputPlane = (int)weight->size[0];
const int nOutputPlane = (int)weight->size[1];
@@ -251,9 +250,8 @@ void THNN_(VolumetricFullConvolution_updateGradInput)(
const int kH = (int)weight->size[3];
const int kW = (int)weight->size[4];
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch mode) tensor is expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int batch = 1;
if (input->nDimension == 4)
@@ -349,9 +347,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
real scale)
{
// number of input & output planes and kernel size is indirectly defined by the gradWeight tensor
- THArgCheck(gradWeight->nDimension == 5, 4,
- "5D gradWeight tensor is expected (nInputPlane x nOutputPlane x kT x kH x kW)"
- );
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
int nInputPlane = (int)gradWeight->size[0];
int nOutputPlane = (int)gradWeight->size[1];
@@ -362,9 +360,8 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
THTensor *columns = finput;
THTensor *ones = fgradInput;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch mode) tensor is expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
int batch = 1;
if (input->nDimension == 4)
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricMaxPooling.c
index dc376e6..47af4f0 100644
--- a/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/lib/THNN/generic/VolumetricMaxPooling.c
@@ -6,7 +6,7 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int kT,
int kW,
int kH,
@@ -29,7 +29,7 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int dT,
int dW,
int dH,
diff --git a/lib/THNN/generic/VolumetricMaxUnpooling.c b/lib/THNN/generic/VolumetricMaxUnpooling.c
index 247dd5f..f2f879d 100644
--- a/lib/THNN/generic/VolumetricMaxUnpooling.c
+++ b/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -5,7 +5,7 @@
static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
real *input_p,
real *output_p,
- real *ind_p,
+ THIndex_t *ind_p,
long nslices,
long iT,
long iW,
@@ -21,6 +21,8 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
int pH)
{
long k;
+ int has_error = 0;
+ long error_index;
#pragma omp parallel for private(k)
for (k = 0; k < nslices; k++)
{
@@ -37,31 +39,40 @@ static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
//real *output_p_k = output_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
- real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
maxy = ((unsigned char*)(ind_p_k))[1];
maxx = ((unsigned char*)(ind_p_k))[2];
+ size_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
{
- THError(
- "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d",
- start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
- );
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = idx;
+ }
+ } else {
+ output_p[idx] = *input_p_k; /* update output */
}
- output_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx)] = *input_p_k; /* update output */
}
}
}
}
+ if (has_error) {
+ THError(
+ "found an invalid max index %ld (output volumes are of size %ldx%ldx%ld)",
+ error_index, oT, oH, oW
+ );
+ }
}
void THNN_(VolumetricMaxUnpooling_updateOutput)(
THNNState *state,
THTensor *input,
THTensor *output,
- THTensor *indices,
+ THIndexTensor *indices,
int oT,
int oW,
int oH,
@@ -82,16 +93,12 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
int iW;
real *input_data;
real *output_data;
- real *indices_data;
+ THIndex_t *indices_data;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5 , 2,
- "4D or 5D (batch mode) tensor expected"
- );
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
- if (!THTensor_(isSameSizeAs)(input, indices))
- {
- THError("Invalid input size w.r.t current indices size");
- }
+ THNN_CHECK_SHAPE_INDICES(input, indices);
if (input->nDimension == 5)
{
@@ -109,7 +116,7 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
/* get contiguous input */
input = THTensor_(newContiguous)(input);
- indices = THTensor_(newContiguous)(indices);
+ indices = THIndexTensor_(newContiguous)(indices);
/* resize output */
if (input->nDimension == 4)
@@ -119,7 +126,7 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
input_data, output_data,
@@ -139,7 +146,7 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
input_data = THTensor_(data)(input);
output_data = THTensor_(data)(output);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
#pragma omp parallel for private(p)
for (p = 0; p < nbatch; p++)
@@ -159,13 +166,13 @@ void THNN_(VolumetricMaxUnpooling_updateOutput)(
/* cleanup */
THTensor_(free)(input);
- THTensor_(free)(indices);
+ THIndexTensor_(free)(indices);
}
static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
real *gradInput_p,
real *gradOutput_p,
- real *ind_p,
+ THIndex_t *ind_p,
long nslices,
long iT,
long iW,
@@ -197,7 +204,7 @@ static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
//real *gradOutput_p_k = gradOutput_p + k*oT*oW*oH + ti*oW*oH*dT + i*oW*dH + j*dW;
- real *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
maxy = ((unsigned char*)(ind_p_k))[1];
@@ -222,7 +229,7 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
THTensor *input,
THTensor *gradOutput,
THTensor *gradInput,
- THTensor *indices,
+ THIndexTensor *indices,
int oT,
int oW,
int oH,
@@ -243,16 +250,14 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
int iW;
real *gradInput_data;
real *gradOutput_data;
- real *indices_data;
+ THIndex_t *indices_data;
- if (!THTensor_(isSameSizeAs)(input, indices))
- {
- THError("Invalid input size w.r.t current indices size");
- }
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+ // TODO: check gradOutput shape
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
- indices = THTensor_(newContiguous)(indices);
+ indices = THIndexTensor_(newContiguous)(indices);
/* resize */
THTensor_(resizeAs)(gradInput, input);
@@ -283,7 +288,7 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
/* get raw pointers */
gradInput_data = THTensor_(data)(gradInput);
gradOutput_data = THTensor_(data)(gradOutput);
- indices_data = THTensor_(data)(indices);
+ indices_data = THIndexTensor_(data)(indices);
/* backprop */
if (input->nDimension == 4)
@@ -319,7 +324,7 @@ void THNN_(VolumetricMaxUnpooling_updateGradInput)(
/* cleanup */
THTensor_(free)(gradOutput);
- THTensor_(free)(indices);
+ THIndexTensor_(free)(indices);
}
#endif
diff --git a/lib/THNN/generic/VolumetricReplicationPadding.c b/lib/THNN/generic/VolumetricReplicationPadding.c
index c4ab02e..aebddbd 100644
--- a/lib/THNN/generic/VolumetricReplicationPadding.c
+++ b/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -85,8 +85,8 @@ void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
real *input_data;
real *output_data;
- THArgCheck(input->nDimension == 4 || input->nDimension == 5,
- 2, "input must be 4 or 5-dimensional");
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
if (input->nDimension == 5)
{
@@ -106,8 +106,10 @@ void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
oheight = iheight + ptop + pbottom;
owidth = iwidth + pleft + pright;
- THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1 , 2,
- "input is too small");
+ THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2,
+ "input (D: %d H: %d, W: %d)is too small."
+ " Calculated output D: %d H: %d W: %d",
+ idepth, iheight, iwidth, odepth, oheight, owidth);
/* get contiguous input */
input = THTensor_(newContiguous)(input);
@@ -254,11 +256,15 @@ void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
owidth = iwidth + pleft + pright;
THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
- "gradOutput width unexpected");
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
- "gradOutput height unexpected");
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
- "gradOutput depth unexpected");
+ "gradOutput depth unexpected. Expected: %d, Got: %d",
+ odepth, THTensor_(size)(gradOutput, dimd));
+
/* get contiguous gradOutput */
gradOutput = THTensor_(newContiguous)(gradOutput);
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index b4218cb..3a7806d 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -4,6 +4,64 @@
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
+#define THNN_CHECK_SHAPE(I1, I2) \
+ if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THTensor_(sizeDesc)(I2); \
+ THError(#I1 " and " #I2 " shapes do not match: " \
+ #I1 " %s, " #I2 " %s", s1.str, s2.str); \
+ }
+
+#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
+ THLongStorage *size2 = THLongTensor_newSizeOf(I2); \
+ if (I1 != NULL && I2 != NULL && !THTensor_(isSize)(I1, size2)) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THLongTensor_sizeDesc(I2); \
+ THLongStorage_free(size2); \
+ THError(#I1 " and " #I2 " shapes do not match: " \
+ #I1 " %s, " #I2 " %s", s1.str, s2.str); \
+ } else { \
+ THLongStorage_free(size2); \
+ }
+
+#define THNN_CHECK_NELEMENT(I1, I2) \
+ if (I1 != NULL && I2 != NULL ) { \
+ ptrdiff_t n1 = THTensor_(nElement)(I1); \
+ ptrdiff_t n2 = THTensor_(nElement)(I2); \
+ if (n1 != n2) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THTensor_(sizeDesc)(I2); \
+ THError(#I1 " and " #I2 " have different number of elements: " \
+ #I1 "%s has %ld elements, while " \
+ #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
+ } \
+ }
+
+#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
+ if (THTensor_(nDimension)(T) != DIM || \
+ THTensor_(size)(T, DIM_SIZE) != SIZE) { \
+ THDescBuff s1 = THTensor_(sizeDesc)(T); \
+ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
+ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
+ }
+
+#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
+ if (THIndexTensor_(nDimension)(T) != DIM || \
+ THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
+ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
+ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
+ }
+
+#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
+ if (!(COND)) { \
+ THDescBuff s1 = THTensor_(sizeDesc)(T); \
+ THArgCheck(COND, ARG, FORMAT, s1.str); \
+ }
+
#include "generic/Abs.c"
#include "THGenerateFloatTypes.h"
@@ -61,6 +119,9 @@
#include "generic/MultiMarginCriterion.c"
#include "THGenerateFloatTypes.h"
+#include "generic/Linear.c"
+#include "THGenerateFloatTypes.h"
+
#include "generic/PReLU.c"
#include "THGenerateFloatTypes.h"
diff --git a/test.lua b/test.lua
index 9098b46..774fba1 100644
--- a/test.lua
+++ b/test.lua
@@ -7,7 +7,7 @@ local jac
local sjac
local precision = 1e-5
-local expprecision = 1e-4
+local expprecision = 1.1e-4
local nntest = torch.TestSuite()
@@ -80,8 +80,8 @@ function nntest.Add()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
end
@@ -121,6 +121,231 @@ function nntest.Bottle()
mytester:eq(gradOutput1, gradOutput2, 0.0001, 'Bottle gradOutput not the same as Module')
end
+function nntest.CAdd()
+ local function testBackwardPass(module, input, params, dparams)
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, "error computing gradiens w.r.t. inputs")
+
+ err = jac.testJacobianParameters(module, input, params, dparams)
+ mytester:assertlt(err,precision, "error computing gradients w.r.t params")
+
+ err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, "error in update using gradients w.r.t parameters")
+
+ --Test all of the various update methods
+ for test, err in pairs(jac.testAllUpdate(module, input, "bias", "gradBias")) do
+ mytester:assertlt(err, precision, string.format("error on bias [%s]", test))
+ end
+ end
+
+ local function testModuleIO(module, input)
+ local fwdErr,bkwdErr = jac.testIO(module,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(module) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(module) .. " - i/o backward err ")
+ end
+
+ local function testCAddWithNonBatchedInput()
+ local channels = math.random(3,5)
+ local width = math.random(3,5)
+ local height = math.random(3,5)
+
+ local input = torch.Tensor(channels, height, width):zero()
+
+ --Per channel bias
+ local module = nn.CAdd(channels, 1, 1)
+ local params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[i]:view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per row bias
+ module = nn.CAdd(1, height, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {i}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per column bias
+ module = nn.CAdd(1, 1, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {i}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per input component bias
+ module = nn.CAdd(channels, height, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assert(module.bias:isSameSizeAs(input))
+ mytester:assertTensorEq(module.bias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+ local function testCAddWithBatchedInput()
+ local batchSize = math.random(3,5)
+ local channels = math.random(3,5)
+ local width = math.random(3,5)
+ local height = math.random(3,5)
+
+ local input = torch.Tensor(batchSize, channels, height, width):zero()
+ local module = nn.CAdd(batchSize, channels, height, width)
+
+ --Per batch bias
+ local module = nn.CAdd(batchSize, 1, 1, 1)
+ local params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[i]:view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per channel bias
+ module = nn.CAdd(1, channels, 1, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {i}, {}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per row bias
+ module = nn.CAdd(1, 1, height, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {i}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per column bias
+ module = nn.CAdd(1, 1, 1, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {}, {i}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per input component bias
+ module = nn.CAdd(batchSize, channels, height, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assert(module.bias:isSameSizeAs(input))
+ mytester:assertTensorEq(module.bias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+
+ function testCAddWithLessDimsThanInput()
+ local input = torch.rand(4,5)
+ local module = nn.CAdd(5)
+ params, gradParams = module:getParameters()
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ local expandedBias = module.bias:view(1,5):expand(4,5):clone()
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assertTensorEq(expandedBias, output, precision)
+
+ testModuleIO(module, input)
+
+ input = torch.rand(4,5,6)
+ module = nn.CAdd(5,6)
+ params, gradParams = module:getParameters()
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ expandedBias = module.bias:view(1,5,6):expand(4,5,6):clone()
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assertTensorEq(expandedBias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+
+ testCAddWithNonBatchedInput()
+ testCAddWithBatchedInput()
+ testCAddWithLessDimsThanInput()
+end
+
function nntest.CMul()
local ini = math.random(3,5)
local inj = math.random(3,5)
@@ -219,8 +444,8 @@ function nntest.CMul()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Dropout()
@@ -348,8 +573,8 @@ function nntest.Exp()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Log()
@@ -363,8 +588,8 @@ function nntest.Log()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input, 0.1, 10)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.HardTanh()
@@ -379,8 +604,8 @@ function nntest.HardTanh()
mytester:assertlt(err, precision , 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test inclusive bounds -- HardTahn(1,inf) should behave like Threshold(1)
local input = torch.Tensor({1})
@@ -410,8 +635,8 @@ function nntest.Clamp()
mytester:assertlt(err, precision , 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Abs()
@@ -426,8 +651,8 @@ function nntest.Abs()
mytester:assertlt(err, precision , 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Threshold()
@@ -442,8 +667,8 @@ function nntest.Threshold()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.ELU()
@@ -458,8 +683,8 @@ function nntest.ELU()
mytester:assertlt(err, precision , 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.ELUIP()
@@ -540,8 +765,8 @@ function nntest.PReLU()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.RReLU()
@@ -564,8 +789,8 @@ function nntest.RReLU()
-- IO
local ferr,berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test training and evalation mode
for _,train in ipairs({true,false}) do
@@ -639,8 +864,8 @@ function nntest.HardShrink()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SoftShrink()
@@ -655,8 +880,8 @@ function nntest.SoftShrink()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Power()
@@ -678,8 +903,8 @@ function nntest.Power()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module,input, 0.1, 2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Normalize()
@@ -720,8 +945,8 @@ function nntest.Normalize()
local module = nn.Normalize(2)
local ferr, berr = jac.testIO(module,input, 0.1, 2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
@@ -743,8 +968,8 @@ function nntest.Square()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Sqrt()
@@ -772,8 +997,8 @@ function nntest.Sqrt()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = nn.Jacobian.testIO(module, input, 0, 2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Linear()
@@ -878,8 +1103,8 @@ function nntest.Linear()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
jacTests(module)
@@ -1108,8 +1333,8 @@ function nntest.PartialLinear()
mytester:assertlt(err,precision, 'error on bias [direct update] ')
local ferr, berr = sjac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Euclidean()
@@ -1170,8 +1395,8 @@ function nntest.Euclidean()
mytester:assertlt(err,precision, 'error on weight ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.WeightedEuclidean()
@@ -1253,8 +1478,8 @@ function nntest.WeightedEuclidean()
mytester:assertlt(err,precision, 'error on bias ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
input:zero()
module:zeroGradParameters()
@@ -1268,8 +1493,8 @@ function nntest.WeightedEuclidean()
mytester:assertlt(err,precision, 'error on bias ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
local function criterionJacobianTest(cri, input, target)
@@ -1680,8 +1905,8 @@ function nntest.LogSigmoid()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.LogSoftmax()
@@ -1694,8 +1919,8 @@ function nntest.LogSoftmax()
mytester:assertlt(err, 1e-3, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test logsoftmax when gradOutput is non-contiguous
local layer = nn.LogSoftMax()
@@ -1719,6 +1944,22 @@ function nntest.LogSoftmax()
end
+function nntest.SpatialLogSoftMax()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local inl = math.random(3,5)
+ local input = torch.Tensor(inl, ink, inj, ini):zero()
+ local module = nn.SpatialLogSoftMax()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,expprecision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
-- function nntest.TemporalLogSoftmax()
-- local ini = math.random(10,20)
-- local inj = math.random(10,20)
@@ -1729,8 +1970,8 @@ end
-- mytester:assertlt(err,precision, 'error on state ')
-- local ferr,berr = jac.testIO(module,input)
--- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
--- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+-- mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+-- mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- end
function nntest.Max()
@@ -1766,8 +2007,8 @@ function nntest.Max()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Min()
@@ -1803,8 +2044,8 @@ function nntest.Min()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Mean()
@@ -1840,8 +2081,8 @@ function nntest.Mean()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Mul()
@@ -1864,8 +2105,8 @@ function nntest.Mul()
end
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Sigmoid()
@@ -1879,8 +2120,8 @@ function nntest.Sigmoid()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Softmax()
@@ -1893,8 +2134,8 @@ function nntest.Softmax()
mytester:assertlt(err,expprecision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialSoftMax()
@@ -1909,8 +2150,8 @@ function nntest.SpatialSoftMax()
mytester:assertlt(err,expprecision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Softmin()
@@ -1923,8 +2164,8 @@ function nntest.Softmin()
mytester:assertlt(err,expprecision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Softsign()
@@ -1937,8 +2178,8 @@ function nntest.Softsign()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SoftPlus()
@@ -1952,8 +2193,8 @@ function nntest.SoftPlus()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialSubtractiveNormalization_2dkernel()
@@ -1968,8 +2209,8 @@ function nntest.SpatialSubtractiveNormalization_2dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode
local output = module:forward(input):clone()
@@ -1991,8 +2232,8 @@ function nntest.SpatialSubtractiveNormalization_2dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
@@ -2008,8 +2249,8 @@ function nntest.SpatialSubtractiveNormalization_1dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode
local output = module:forward(input):clone()
@@ -2031,8 +2272,8 @@ function nntest.SpatialSubtractiveNormalization_1dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialDivisiveNormalization_2dkernel()
@@ -2047,8 +2288,8 @@ function nntest.SpatialDivisiveNormalization_2dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode
local output = module:forward(input):clone()
@@ -2070,8 +2311,8 @@ function nntest.SpatialDivisiveNormalization_2dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialDivisiveNormalization_1dkernel()
@@ -2086,8 +2327,8 @@ function nntest.SpatialDivisiveNormalization_1dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode
local output = module:forward(input):clone()
@@ -2109,8 +2350,8 @@ function nntest.SpatialDivisiveNormalization_1dkernel()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialContrastiveNormalization()
@@ -2125,8 +2366,8 @@ function nntest.SpatialContrastiveNormalization()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode and type
local output = module:forward(input):clone()
@@ -2151,8 +2392,8 @@ function nntest.SpatialContrastiveNormalization()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialCrossMapLRN()
@@ -2169,8 +2410,8 @@ function nntest.SpatialCrossMapLRN()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- test batch mode and type
local output = module:forward(input):clone()
@@ -2195,8 +2436,8 @@ function nntest.SpatialCrossMapLRN()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input2)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
@@ -2272,10 +2513,6 @@ function nntest.SpatialConvolution()
module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
input = torch.Tensor(batch,from,inj,ini):zero()
- -- print(from, to, ki, kj, si, sj, batch, ini, inj)
- -- print(module.weight:size())
- -- print(module.gradWeight:size())
-
local err = jac.testJacobian(module, input)
mytester:assertlt(err, precision, 'batch error on state ')
@@ -2319,8 +2556,8 @@ function nntest.SpatialConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
jacTests(module)
@@ -2424,8 +2661,8 @@ function nntest.SpatialConvolutionMM()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- non-contiguous
local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
@@ -2499,11 +2736,7 @@ function nntest.SpatialConvolutionLocal()
ini = (outi-1)*si+ki
inj = (outj-1)*sj+kj
module = nn.SpatialConvolutionLocal(from, to, ini, inj, ki, kj, si, sj)
- input = torch.Tensor(batch,from,inj,ini):zero()
-
--- print(from, to, ki, kj, si, sj, batch, ini, inj)
--- print(module.weight:size())
--- print(module.gradWeight:size())
+ input = torch.Tensor(batch, from, inj, ini):zero()
local err = jac.testJacobian(module, input)
mytester:assertlt(err, precision, 'batch error on state ')
@@ -2540,19 +2773,18 @@ function nntest.SpatialConvolutionLocal()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- check against nn.SpatialConvolution
local conv = nn.SpatialConvolution(from, to, ki, kj, si, sj)
- torch.repeatTensor(module.bias, conv.bias:view(to, 1, 1), 1, outi, outj)
+ torch.repeatTensor(module.bias, conv.bias:view(to, 1, 1), 1, outj, outi)
torch.repeatTensor(module.weight, conv.weight:view(1, 1, from, to, ki, kj), outi, outj, 1, 1, 1, 1)
local input = torch.rand(batch, from, inj, ini)
local output = module:forward(input)
local outputConv = conv:forward(input)
local err = torch.dist(output, outputConv)
mytester:assertlt(err, precision, 'error checking against nn.SpatialConvolution')
-
end
function nntest.SpatialFullConvolution()
@@ -2652,8 +2884,8 @@ function nntest.SpatialFullConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
jacTests(module)
@@ -2803,8 +3035,8 @@ function nntest.SpatialDilatedConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- non-contiguous
local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
@@ -2864,8 +3096,8 @@ function nntest.SpatialConvolutionMap()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
@@ -2911,8 +3143,8 @@ function nntest.SpatialConvolutionMap()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialFullConvolutionMap()
@@ -2967,8 +3199,8 @@ function nntest.SpatialFullConvolutionMap()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialFullConvolutionCompare()
@@ -3166,8 +3398,8 @@ function nntest.SpatialSubSampling()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.SpatialMaxPooling()
@@ -3193,8 +3425,8 @@ function nntest.SpatialMaxPooling()
mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(2,5)
@@ -3206,8 +3438,8 @@ function nntest.SpatialMaxPooling()
mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
end
@@ -3239,8 +3471,8 @@ function nntest.SpatialMaxUnpooling()
mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(2,5)
@@ -3254,8 +3486,8 @@ function nntest.SpatialMaxUnpooling()
mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
end
@@ -3409,8 +3641,8 @@ function nntest.SpatialAveragePooling()
mytester:assertlt(err, precision, 'error'..mode_string..' on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local batch = math.random(2,5)
@@ -3434,12 +3666,12 @@ function nntest.SpatialAveragePooling()
mytester:assertlt(err, precision, 'batch error'..mode_string..' on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
end
@@ -3511,8 +3743,8 @@ function nntest.SpatialAdaptiveMaxPooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(1,3)
@@ -3523,8 +3755,8 @@ function nntest.SpatialAdaptiveMaxPooling()
mytester:assertlt(err, precision, 'error on state (Batch) ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
-- non-contiguous
@@ -3571,8 +3803,8 @@ function nntest.SpatialLPPooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Sum()
@@ -3625,8 +3857,8 @@ function nntest.Sum()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Tanh()
@@ -3641,8 +3873,8 @@ function nntest.Tanh()
mytester:assertlt(err, precision , 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.TemporalConvolution()
@@ -3711,8 +3943,8 @@ function nntest.TemporalConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- 2D matches 1D
local output = module:forward(input):clone()
@@ -3805,8 +4037,8 @@ function nntest.TemporalSubSampling()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.TemporalMaxPooling()
@@ -3823,8 +4055,8 @@ function nntest.TemporalMaxPooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- 2D
local nBatchFrame = 2
@@ -3833,8 +4065,8 @@ function nntest.TemporalMaxPooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- 2D matches 1D
local output = module:forward(input):clone()
@@ -3925,8 +4157,8 @@ function nntest.VolumetricFullConvolution()
mytester:assertlt(err , precision, 'error on bias ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.VolumetricFullConvolutionDualInput()
@@ -4025,8 +4257,8 @@ function nntest.VolumetricConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.VolumetricDilatedConvolution()
@@ -4121,8 +4353,8 @@ function nntest.VolumetricDilatedConvolution()
end
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- non-contiguous
local input = torch.randn(batch,from,ink,ini,inj):transpose(4,5) -- non-contiguous
@@ -4180,8 +4412,8 @@ function nntest.VolumetricAveragePooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(2,3)
@@ -4192,8 +4424,8 @@ function nntest.VolumetricAveragePooling()
mytester:assertlt(err, precision, 'error on state (Batch) ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
function nntest.VolumetricMaxPooling()
@@ -4220,8 +4452,8 @@ function nntest.VolumetricMaxPooling()
mytester:assertlt(err, precision, 'error on state ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(2,3)
@@ -4232,8 +4464,8 @@ function nntest.VolumetricMaxPooling()
mytester:assertlt(err, precision, 'error on state (Batch) ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
function nntest.VolumetricDilatedMaxPooling()
@@ -4313,8 +4545,8 @@ function nntest.VolumetricMaxUnpooling()
mytester:assertlt(err, precision, 'error ')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
local nbatch = math.random(2,3)
@@ -4328,8 +4560,8 @@ function nntest.VolumetricMaxUnpooling()
mytester:assertlt(err, precision, 'error on Batch')
local ferr, berr = jac.testIO(module, input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
end
function nntest.VolumetricMaxPooling_boundary()
@@ -4862,8 +5094,8 @@ function nntest.LookupTable()
-- IO
module.gradInput = torch.Tensor(3,4):zero() --fixes an error
local ferr,berr = jac.testIO(module,input,minval,maxval)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- accUpdate
module:accUpdateOnly()
@@ -6215,8 +6447,8 @@ function nntest.DotProduct()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
-- rebuild module to avoid correlated tests
@@ -6264,8 +6496,8 @@ function nntest.CosineDistance()
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
-- batch
-- rebuild module to avoid correlated tests
@@ -6426,8 +6658,8 @@ local function testBatchNormalization(moduleName, dim, k)
-- IO
local ferr,berr = jac.testIO(module,input)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
local module = nn[moduleName](planes)
@@ -6472,8 +6704,8 @@ function nntest.GradientReversal()
mytester:assertlt(err,precision, 'error on state ')
local ferr,berr = jac.testIO(module,input, 0.1, 10)
- mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
- mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
end
function nntest.Padding()
@@ -6596,6 +6828,167 @@ function nntest.VolumetricReplicationPadding()
end
end
+function nntest.PixelShuffle()
+ -- Checks whether a given tensor has the specified size
+ local function tensorHasSize(tensor, size)
+ local tensorSize = tensor:size()
+
+ if tensorSize:size() ~= #size then
+ return false
+ end
+ for i,v in ipairs(size) do
+ if tensorSize[i] ~= size[i] then
+ return false
+ end
+ end
+ return true
+ end
+
+ --Verifies that the output is the input re-shuffled as per Eq 4. in
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- @param - the input, low-resolution image of shape [1, c, h , w]
+ -- @param - the output, super resolved image of shape [1, c, h ,w]
+ -- @param - upscale factor of the super resolution
+ -- @returns true if output complies with Eq 4.
+ local function verifyPixelShuffle(_input, _output, upscaleFactor)
+ local input = _input
+ local output = _output
+
+ if input:nDimension() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ output = output:view(1, output:size(1), output:size(2), output:size(3))
+ end
+
+ for c = 1, output:size(2) do
+ for h = 1, output:size(3) do
+ for w = 1, output:size(4) do
+ local heightIdx = torch.floor((h - 1)/upscaleFactor) + 1
+ local widthIdx = torch.floor((w - 1)/upscaleFactor) + 1
+ --c does not need to be (c - 1) as it starts at 1 not zero
+ local channelIdx = upscaleFactor * ((h-1) % upscaleFactor) + ((w-1) % upscaleFactor) + 1 + (c-1)*upscaleFactor*upscaleFactor
+
+ mytester:assertTensorEq(output[{{}, {c}, {h}, {w}}], input[{{}, {channelIdx}, {heightIdx}, {widthIdx}}],
+ string.format("output at location (%d, %d, %d) is incorrect", c, h, w))
+ end
+ end
+ end
+ return true
+ end
+
+ -- Checks the nn.PixelShuffle layer's forward pass. It checks that it
+ -- re-arranges input pixels correctly according to Eq. 4 of
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- This function tests for multiple batch sizes, multiple channels and multiple input dimensions (square)
+ -- It also tests for normal tensors (un-batched)
+ function testPixelShuffleUpdateOutput()
+ --Test with batched input
+ for h = 1, 3 do
+ local batchSize = torch.round(torch.uniform(1, 3))
+ for i = 1, 3 do
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ for j = 1, 3 do
+ local channels = torch.round(torch.uniform(1, 4))
+ for k = 1, 3 do
+
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local input = torch.Tensor(batchSize, channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local output = pixelShuffle:forward(input)
+ local expectedOutputDim = inputDim * upscaleFactor
+ mytester:assert(tensorHasSize(output, {batchSize, channels, expectedOutputDim, expectedOutputDim}),
+ string.format("Output tensor should have size (%d, %d, %d, %d) not %s", batchSize, channels, expectedOutputDim, expectedOutputDim, tostring(output:size())))
+ verifyPixelShuffle(input, output, upscaleFactor)
+ end
+ end
+ end
+ end
+
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ local output = pixelShuffle:forward(input)
+ local expectedOutputDim = inputDim * upscaleFactor
+ mytester:assert(tensorHasSize(output, {channels, expectedOutputDim, expectedOutputDim}),
+ string.format("Output tensor should have size (%d, %d, %d) not %s", channels, expectedOutputDim, expectedOutputDim, tostring(output:size())))
+
+ verifyPixelShuffle(input, output, upscaleFactor)
+ end
+
+ -- Checks the nn.PixelShuffle layer's backward pass. It checks that it
+ -- essentially performs the inverse of Eq 4. in
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- This function tests for multiple batch sizes, multiple channels and multiple input dimensions (square)
+ -- It also tests for normal tensors (un-batched)
+ function testPixelShuffleUpdateGradInput()
+ --Test with batched input
+ for h = 1, 3 do
+ local batchSize = torch.round(torch.uniform(1, 3))
+ for i = 1, 3 do
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ for j = 1, 3 do
+ local channels = torch.round(torch.uniform(1, 4))
+ for k = 1, 3 do
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local input = torch.Tensor(batchSize, channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+
+ input:uniform()
+
+ local output = pixelShuffle:forward(input)
+ --here we treat output as the same as gradOutput as they have the same shape
+ local reconstructedInput = pixelShuffle:backward(input, output)
+ mytester:assertTensorEq(reconstructedInput, input, 0)
+ end
+ end
+ end
+ end
+
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ local output = pixelShuffle:forward(input)
+ --here we treat output as the same as gradOutput as they have the same shape
+ local reconstructedInput = pixelShuffle:backward(input, output)
+ mytester:assertTensorEq(reconstructedInput, input, 0)
+
+ local err = jac.testJacobian(pixelShuffle, input)
+ mytester:assertlt(err,precision, "error computing gradiens w.r.t. inputs")
+ end
+
+ function testModuleIO()
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim):uniform()
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+
+ local fwdErr,bkwdErr = jac.testIO(pixelShuffle,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(pixelShuffle) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(pixelShuffle) .. " - i/o backward err ")
+ end
+
+ testPixelShuffleUpdateOutput()
+ testPixelShuffleUpdateGradInput()
+ testModuleIO()
+end
+
function nntest.Typecast()
local function make_network()
local seq = nn.Sequential()
@@ -6839,3 +7232,9 @@ function nn.test(tests, seed)
torch.setnumthreads(nThreads)
return mytester
end
+
+function nn.testTHNN(tests, seed)
+ require 'test.LinearTHNN'
+ nn.Linear = nn.LinearTHNN
+ return nn.test(tests,seed)
+end
diff --git a/test/LinearTHNN.lua b/test/LinearTHNN.lua
new file mode 100644
index 0000000..dc690dc
--- /dev/null
+++ b/test/LinearTHNN.lua
@@ -0,0 +1,94 @@
+local LinearTHNN, parent = torch.class('nn.LinearTHNN', 'nn.Module')
+
+function LinearTHNN:__init(inputSize, outputSize, bias)
+ parent.__init(self)
+ local bias = ((bias == nil) and true) or bias
+ self.weight = torch.Tensor(outputSize, inputSize)
+ self.gradWeight = torch.Tensor(outputSize, inputSize)
+ if bias then
+ self.bias = torch.Tensor(outputSize)
+ self.gradBias = torch.Tensor(outputSize)
+ end
+ self.addBuffer = torch.Tensor(outputSize)
+ self:reset()
+end
+
+function LinearTHNN:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function LinearTHNN:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(2))
+ end
+ if nn.oldSeed then
+ for i=1,self.weight:size(1) do
+ self.weight:select(1, i):apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ end
+ if self.bias then
+ for i=1,self.bias:nElement() do
+ self.bias[i] = torch.uniform(-stdv, stdv)
+ end
+ end
+ else
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then self.bias:uniform(-stdv, stdv) end
+ end
+ return self
+end
+
+function LinearTHNN:updateOutput(input)
+ input.THNN.Linear_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias and self.bias:cdata(),
+ self.addBuffer:cdata()
+ )
+ return self.output
+end
+
+function LinearTHNN:updateGradInput(input, gradOutput)
+ input.THNN.Linear_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata()
+ )
+ return self.gradInput
+end
+
+function LinearTHNN:accGradParameters(input, gradOutput, scale)
+ input.THNN.Linear_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.bias and self.bias:cdata(),
+ self.gradWeight:cdata(),
+ self.bias and self.gradBias:cdata(),
+ self.addBuffer:cdata(),
+ scale or 1
+ )
+ return self.gradWeight
+end
+
+-- we do not need to accumulate parameters when sharing
+LinearTHNN.sharedAccUpdateGradParameters = LinearTHNN.accUpdateGradParameters
+
+function LinearTHNN:clearState()
+ if self.addBuffer then self.addBuffer:set() end
+ return parent.clearState(self)
+end
+
+function LinearTHNN:__tostring__()
+ return torch.type(self) ..
+ string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1)) ..
+ (self.bias == nil and ' without bias' or '')
+end
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git
More information about the debian-science-commits
mailing list