[lua-torch-nn] 01/07: New upstream version 0~20160908-g9d7b9ea+dfsg

Zhou Mo cdluminate-guest at moszumanska.debian.org
Sat Sep 10 03:27:37 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 02e4a99fbdf67a54e0bd2360c96ffa01236f170e
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Sat Sep 10 02:31:27 2016 +0000

    New upstream version 0~20160908-g9d7b9ea+dfsg
---
 BCECriterion.lua                                   | 142 +++-----
 CMaxTable.lua                                      |  33 ++
 CMinTable.lua                                      |  33 ++
 CMul.lua                                           |  61 +++-
 CONTRIBUTING.md                                    |   2 +-
 ClassSimplexCriterion.lua                          |   2 +-
 Concat.lua                                         |  10 +-
 Container.lua                                      |   2 +-
 DepthConcat.lua                                    |  18 +-
 Dropout.lua                                        |   5 +-
 JoinTable.lua                                      |   2 +-
 LookupTable.lua                                    |   2 +-
 MapTable.lua                                       |  99 ++++++
 Max.lua                                            |  15 +-
 Min.lua                                            |  15 +-
 Normalize.lua                                      |  15 +-
 SpatialConvolution.lua                             |  23 +-
 SpatialDilatedMaxPooling.lua                       |   4 +-
 SpatialDropout.lua                                 |   7 +-
 SpatialMaxPooling.lua                              |   2 -
 Sum.lua                                            |   4 +-
 THNN.lua                                           |   1 +
 TemporalDynamicKMaxPooling.lua                     |  65 ++++
 Threshold.lua                                      |   1 +
 VolumetricDilatedMaxPooling.lua                    |  64 ++++
 VolumetricDropout.lua                              |   7 +-
 doc/convolution.md                                 |  25 ++
 doc/simple.md                                      |  72 ++--
 doc/table.md                                       |  81 +++++
 init.lua                                           |   5 +
 lib/THNN/generic/BCECriterion.c                    |  50 +++
 lib/THNN/generic/HardTanh.c                        |   2 +-
 lib/THNN/generic/SpatialConvolutionMM.c            |  47 ++-
 ...tialMaxPooling.c => SpatialDilatedMaxPooling.c} |  18 +-
 lib/THNN/generic/SpatialMaxPooling.c               | 288 +---------------
 lib/THNN/generic/SpatialUpSamplingNearest.c        |   4 +-
 lib/THNN/generic/THNN.h                            |  58 +++-
 lib/THNN/generic/Threshold.c                       |   1 +
 ...cMaxPooling.c => VolumetricDilatedMaxPooling.c} |  87 +++--
 lib/THNN/generic/VolumetricMaxPooling.c            | 361 +--------------------
 lib/THNN/init.c                                    |   9 +
 test.lua                                           | 232 ++++++++++++-
 42 files changed, 1067 insertions(+), 907 deletions(-)

diff --git a/BCECriterion.lua b/BCECriterion.lua
index b319335..8bb5f81 100644
--- a/BCECriterion.lua
+++ b/BCECriterion.lua
@@ -1,106 +1,64 @@
+local THNN = require 'nn.THNN'
 local BCECriterion, parent = torch.class('nn.BCECriterion', 'nn.Criterion')
 
-local eps = 1e-12
-
 function BCECriterion:__init(weights, sizeAverage)
-    parent.__init(self)
-    if sizeAverage ~= nil then
-        self.sizeAverage = sizeAverage
-    else
-        self.sizeAverage = true
-    end
-    if weights ~= nil then
-        assert(weights:dim() == 1, "weights input should be 1-D Tensor")
-        self.weights = weights
-    end
+   parent.__init(self)
+   if sizeAverage ~= nil then
+      self.sizeAverage = sizeAverage
+   else
+      self.sizeAverage = true
+   end
+   if weights ~= nil then
+      assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+      self.weights = weights
+   end
 end
 
 
 function BCECriterion:__len()
-    if (self.weights) then
-        return #self.weights
-    else
-        return 0
-    end
+   return self.weights and #self.weights or 0
 end
 
 function BCECriterion:updateOutput(input, target)
-    -- - log(input) * target - log(1 - input) * (1 - target)
-
-    assert( input:nElement() == target:nElement(),
-    "input and target size mismatch")
-
-    self.buffer = self.buffer or input.new()
-
-    local buffer = self.buffer
-    local weights = self.weights
-    local output
-
-    buffer:resizeAs(input)
-
-    if weights ~= nil and target:dim() ~= 1 then
-        weights = self.weights:view(1, target:size(2)):expandAs(target)
-    end
-
-    -- log(input) * target
-    buffer:add(input, eps):log()
-    if weights ~= nil then buffer:cmul(weights) end
-
-    output = torch.dot(target, buffer)
-
-    -- log(1 - input) * (1 - target)
-    buffer:mul(input, -1):add(1):add(eps):log()
-    if weights ~= nil then buffer:cmul(weights) end
-
-    output = output + torch.sum(buffer)
-    output = output - torch.dot(target, buffer)
-
-    if self.sizeAverage then
-        output = output / input:nElement()
-    end
-
-    self.output = - output
-
-    return self.output
+   -- - log(input) * target - log(1 - input) * (1 - target)
+   assert( input:nElement() == target:nElement(),
+   "input and target size mismatch")
+   self.output_tensor = self.output_tensor or input.new(1)
+
+   local weights = self.weights
+   if weights ~= nil and target:dim() ~= 1 then
+      weights = self.weights:view(1, target:size(2)):expandAs(target)
+   end
+
+   input.THNN.BCECriterion_updateOutput(
+      input:cdata(),
+      target:cdata(),
+      self.output_tensor:cdata(),
+      self.sizeAverage,
+      THNN.optionalTensor(weights)
+   )
+
+   self.output = self.output_tensor[1]
+   return self.output
 end
 
 function BCECriterion:updateGradInput(input, target)
-    -- - (target - input) / ( input (1 - input) )
-    -- The gradient is slightly incorrect:
-    -- It should have be divided by (input + eps) (1 - input + eps)
-    -- but it is divided by input (1 - input + eps) + eps
-    -- This modification requires less memory to be computed.
-
-    assert( input:nElement() == target:nElement(),
-    "input and target size mismatch")
-
-    self.buffer = self.buffer or input.new()
-
-    local buffer = self.buffer
-    local weights = self.weights
-    local gradInput = self.gradInput
-
-    if weights ~= nil and target:dim() ~= 1 then
-        weights = self.weights:view(1, target:size(2)):expandAs(target)
-    end
-
-    buffer:resizeAs(input)
-    -- - x ( 1 + eps -x ) + eps
-    buffer:add(input, -1):add(-eps):cmul(input):add(-eps)
-
-    gradInput:resizeAs(input)
-    -- y - x
-    gradInput:add(target, -1, input)
-    -- - (y - x) / ( x ( 1 + eps -x ) + eps )
-    gradInput:cdiv(buffer)
-
-    if weights ~= nil then
-        gradInput:cmul(weights)
-    end
-
-    if self.sizeAverage then
-        gradInput:div(target:nElement())
-    end
-
-    return gradInput
+   -- - (target - input) / ( input (1 - input) )
+   assert( input:nElement() == target:nElement(),
+   "input and target size mismatch")
+
+   local weights = self.weights
+   if weights ~= nil and target:dim() ~= 1 then
+      weights = self.weights:view(1, target:size(2)):expandAs(target)
+   end
+
+   input.THNN.BCECriterion_updateGradInput(
+      input:cdata(),
+      target:cdata(),
+      self.gradInput:cdata(),
+      self.sizeAverage,
+      THNN.optionalTensor(weights)
+   )
+
+   return self.gradInput
 end
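
The refactor above delegates both the forward and backward passes to the THNN C backend (added in lib/THNN/generic/BCECriterion.c further down in this diff). As a hypothetical sanity sketch, not part of the patch, the criterion can be compared against the closed-form expression `-mean(y*log(x+eps) + (1-y)*log(1-x+eps))` evaluated directly with torch tensor ops:

```lua
require 'nn'
local eps = 1e-12                              -- same EPS as the C implementation
local x = torch.rand(4)                        -- predicted probabilities
local y = torch.Tensor(4):bernoulli()          -- binary targets
local crit = nn.BCECriterion()                 -- sizeAverage = true by default
local ref = -(torch.log(x + eps):cmul(y)
            + torch.log(-x + 1 + eps):cmul(-y + 1)):mean()
print(crit:forward(x, y), ref)                 -- the two values should agree closely
```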
diff --git a/CMaxTable.lua b/CMaxTable.lua
new file mode 100644
index 0000000..3907faf
--- /dev/null
+++ b/CMaxTable.lua
@@ -0,0 +1,33 @@
+local CMaxTable, parent = torch.class('nn.CMaxTable', 'nn.Module')
+
+function CMaxTable:__init()
+   parent.__init(self)
+   self.gradInput = {}
+   self.maxIdx = torch.Tensor()
+end
+
+function CMaxTable:updateOutput(input)
+   self.output:resizeAs(input[1]):copy(input[1])
+   self.maxIdx:resizeAs(input[1]):fill(1)
+   for i=2,#input do
+      local mask = torch.gt(input[i], self.output)
+      self.maxIdx:maskedFill(mask, i)
+      self.output:maskedCopy(mask, input[i][mask])
+   end
+   return self.output
+end
+
+function CMaxTable:updateGradInput(input, gradOutput)
+   for i=1,#input do
+      self.gradInput[i] = torch.Tensor()
+      self.gradInput[i]:resizeAs(input[i]):fill(0.0)
+      local mask = torch.eq(self.maxIdx, i)
+      self.gradInput[i]:maskedCopy(mask, gradOutput[mask])
+   end
+
+   for i=#input+1, #self.gradInput do
+       self.gradInput[i] = nil
+   end
+
+   return self.gradInput
+end
diff --git a/CMinTable.lua b/CMinTable.lua
new file mode 100644
index 0000000..a8385e8
--- /dev/null
+++ b/CMinTable.lua
@@ -0,0 +1,33 @@
+local CMinTable, parent = torch.class('nn.CMinTable', 'nn.Module')
+
+function CMinTable:__init()
+   parent.__init(self)
+   self.gradInput = {}
+   self.minIdx = torch.Tensor()
+end
+
+function CMinTable:updateOutput(input)
+   self.output:resizeAs(input[1]):copy(input[1])
+   self.minIdx:resizeAs(input[1]):fill(1)
+   for i=2,#input do
+      local mask = torch.lt(input[i], self.output)
+      self.minIdx:maskedFill(mask, i)
+      self.output:maskedCopy(mask, input[i][mask])
+   end
+   return self.output
+end
+
+function CMinTable:updateGradInput(input, gradOutput)
+   for i=1,#input do
+      self.gradInput[i] = torch.Tensor()
+      self.gradInput[i]:resizeAs(input[i]):fill(0.0)
+      local mask = torch.eq(self.minIdx, i)
+      self.gradInput[i]:maskedCopy(mask, gradOutput[mask])
+   end
+
+   for i=#input+1, #self.gradInput do
+       self.gradInput[i] = nil
+   end
+
+   return self.gradInput
+end
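
Both of these new table modules record, per element, which input produced the output and route gradients only to those winners. A minimal hypothetical sketch, not part of the patch, for nn.CMaxTable (nn.CMinTable behaves analogously):

```lua
require 'nn'
local m = nn.CMaxTable()
local a, b = torch.Tensor{1, 5}, torch.Tensor{3, 2}
print(m:forward{a, b})                        -- 3, 5 (element-wise max)
local grads = m:backward({a, b}, torch.Tensor{1, 1})
print(grads[1])                               -- 0, 1 (a supplied the max at index 2)
print(grads[2])                               -- 1, 0 (b supplied the max at index 1)
```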
diff --git a/CMul.lua b/CMul.lua
index e84f7ba..22b5b74 100644
--- a/CMul.lua
+++ b/CMul.lua
@@ -47,10 +47,15 @@ function CMul:updateOutput(input)
       
       self._output:cmul(self._weight)
    else
-      local batchSize = input:size(1)
-      self._output:view(self.output, batchSize, -1)
-      self._weight:view(self.weight, 1, -1)
-      
+      if self.weight:dim() == input:dim() then
+         self._output:set(self.output)
+         self._weight:set(self.weight)
+      else
+         local batchSize = input:size(1)
+         self._output:view(self.output, batchSize, -1)
+         self._weight:view(self.weight, 1, -1)
+      end
+
       self._expand:expandAs(self._weight, self._output)
       
       if torch.type(input) == 'torch.CudaTensor' then
@@ -76,10 +81,17 @@ function CMul:updateGradInput(input, gradOutput)
    if self.weight:nElement() == gradOutput:nElement() then
       self.gradInput:addcmul(1, self.weight, gradOutput)
    else
-      local batchSize = input:size(1)
-      nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
-      nn.utils.contiguousView(self._gradInput, self.gradInput, batchSize, -1)
-      self._weight:view(self.weight, 1, -1)
+      if self.weight:dim() == input:dim() then
+         nn.utils.contiguousView(self._gradOutput, gradOutput, gradOutput:size())
+         nn.utils.contiguousView(self._gradInput, self.gradInput, self.gradInput:size())
+         self._weight:set(self.weight)
+      else
+         local batchSize = input:size(1)
+         nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
+         nn.utils.contiguousView(self._gradInput, self.gradInput, batchSize, -1)
+         self._weight:view(self.weight, 1, -1)
+      end
+
       self._expand:expandAs(self._weight, self._gradOutput)
       
       if torch.type(input) == 'torch.CudaTensor' then
@@ -103,14 +115,33 @@ function CMul:accGradParameters(input, gradOutput, scale)
    if self.weight:nElement() == gradOutput:nElement() then
       self.gradWeight:addcmul(scale, input, gradOutput)
    else
-      local batchSize = input:size(1)
-      nn.utils.contiguousView(self._input, input, batchSize, -1)
-      nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
-      self._gradWeight:view(self.gradWeight, 1, -1)
+      if self.weight:dim() == input:dim() then
+         nn.utils.contiguousView(self._input, input, input:size())
+         nn.utils.contiguousView(self._gradOutput, gradOutput, gradOutput:size())
+         self._gradWeight:set(self.gradWeight)
       
-      self._repeat:cmul(self._input, self._gradOutput)
-      self._sum:sum(self._repeat, 1)
-      self._gradWeight:add(scale, self._sum)
+         self._repeat:cmul(self._input, self._gradOutput)
+         local sumInto = self._sum
+         local sumFrom = self._repeat
+         for i=1,self.weight:dim() do
+            if self.weight:size(i) ~= input:size(i) then
+               sumInto:sum(sumFrom, i)
+               sumInto = sumFrom
+               sumFrom = sumFrom == self._repeat and self._sum or self._repeat
+            end
+         end
+         self._gradWeight:add(scale, sumFrom)
+      else
+         local batchSize = input:size(1)
+         nn.utils.contiguousView(self._input, input, batchSize, -1)
+         nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
+         self._gradWeight:view(self.gradWeight, 1, -1)
+
+         self._repeat:cmul(self._input, self._gradOutput)
+         self._sum:sum(self._repeat, 1)
+         self._gradWeight:add(scale, self._sum)
+      end
+
    end
 end
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d4da7c9..92574db 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -22,7 +22,7 @@ restrictions:
   [mailing-list](http://groups.google.com/forum/#!forum/torch7)).
 
 * Please **do not** open issues regarding the code in a torch package 
-  outside the core. For example dont open issues about the 
+  outside the core. For example don't open issues about the 
   REPL in the nn issue tracker, use the trepl issue tracker for that.
 
 <a name="bugs"></a>
diff --git a/ClassSimplexCriterion.lua b/ClassSimplexCriterion.lua
index 6ccaed9..9cabc01 100644
--- a/ClassSimplexCriterion.lua
+++ b/ClassSimplexCriterion.lua
@@ -64,7 +64,7 @@ function ClassSimplexCriterion:__init(nClasses)
 end
 
 -- handle target being both 1D tensor, and
--- target being 2D tensor (2D tensor means dont do anything)
+-- target being 2D tensor (2D tensor means don't do anything)
 local function transformTarget(self, target)
     if torch.type(target) == 'number' then
         self._target:resize(self.nClasses)
diff --git a/Concat.lua b/Concat.lua
index ea2489e..108b216 100644
--- a/Concat.lua
+++ b/Concat.lua
@@ -2,22 +2,24 @@ local Concat, parent = torch.class('nn.Concat', 'nn.Container')
 
 function Concat:__init(dimension)
    parent.__init(self)
-   self.size = torch.LongStorage()
+   self.outputSize = torch.LongStorage()
    self.dimension = dimension
 end
 
 function Concat:updateOutput(input)
+   self.outputSize = self.outputSize or torch.LongStorage()
+
    local outs = {}
    for i=1,#self.modules do
       local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
       outs[i] = currentOutput
       if i == 1 then
-         self.size:resize(currentOutput:dim()):copy(currentOutput:size())
+         self.outputSize:resize(currentOutput:dim()):copy(currentOutput:size())
       else
-         self.size[self.dimension] = self.size[self.dimension] + currentOutput:size(self.dimension)
+         self.outputSize[self.dimension] = self.outputSize[self.dimension] + currentOutput:size(self.dimension)
       end
    end
-   self.output:resize(self.size)
+   self.output:resize(self.outputSize)
 
    local offset = 1
    for i,module in ipairs(self.modules) do
diff --git a/Container.lua b/Container.lua
index 6af4d7d..469a370 100644
--- a/Container.lua
+++ b/Container.lua
@@ -22,7 +22,7 @@ end
 
 -- Check if passing arguments through xpcall is supported in this Lua interpreter.
 local _, XPCALL_ARGS = xpcall(function(x) return x ~= nil end, function() end, 1)
-local TRACEBACK_WARNING = "WARNING: If you see a stack trace below, it doesn't point to the place where this error occured. Please use only the one above."
+local TRACEBACK_WARNING = "WARNING: If you see a stack trace below, it doesn't point to the place where this error occurred. Please use only the one above."
 -- module argument can be retrieved with moduleIndex, but code is cleaner when
 -- it has to be specified anyway.
 function Container:rethrowErrors(module, moduleIndex, funcName, ...)
diff --git a/DepthConcat.lua b/DepthConcat.lua
index 8ae8384..f64a90e 100644
--- a/DepthConcat.lua
+++ b/DepthConcat.lua
@@ -13,13 +13,13 @@ local DepthConcat, _ = torch.class('nn.DepthConcat', 'nn.Concat')
 
 function DepthConcat:windowNarrow(output, currentOutput, offset)
    local outputWindow = output:narrow(self.dimension, offset, currentOutput:size(self.dimension))
-   for dim=1,self.size:size(1) do
+   for dim=1,self.outputSize:size(1) do
       local currentSize = currentOutput:size(dim)
-      if dim ~= self.dimension and self.size[dim] ~= currentSize then
+      if dim ~= self.dimension and self.outputSize[dim] ~= currentSize then
          -- 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side)
          -- 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side)
          -- 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad)
-         local start = math.floor(((self.size[dim] - currentSize) / 2) + 1)
+         local start = math.floor(((self.outputSize[dim] - currentSize) / 2) + 1)
          outputWindow = outputWindow:narrow(dim, start, currentSize)
       end
    end
@@ -27,23 +27,25 @@ function DepthConcat:windowNarrow(output, currentOutput, offset)
 end
 
 function DepthConcat:updateOutput(input)
+   self.outputSize = self.outputSize or torch.LongStorage()
+
    local outs = {}
    for i=1,#self.modules do
       local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
       outs[i] = currentOutput
       if i == 1 then
-         self.size:resize(currentOutput:dim()):copy(currentOutput:size())
+         self.outputSize:resize(currentOutput:dim()):copy(currentOutput:size())
       else
-         self.size[self.dimension] = self.size[self.dimension] + currentOutput:size(self.dimension)
-         for dim=1,self.size:size(1) do
+         self.outputSize[self.dimension] = self.outputSize[self.dimension] + currentOutput:size(self.dimension)
+         for dim=1,self.outputSize:size(1) do
             if dim ~= self.dimension then
                -- take the maximum size (shouldn't change anything for batch dim)
-               self.size[dim] = math.max(self.size[dim], currentOutput:size(dim))
+               self.outputSize[dim] = math.max(self.outputSize[dim], currentOutput:size(dim))
             end
          end
       end
    end
-   self.output:resize(self.size):zero() --zero for padding
+   self.output:resize(self.outputSize):zero() --zero for padding
 
    local offset = 1
    for i,module in ipairs(self.modules) do
diff --git a/Dropout.lua b/Dropout.lua
index 946c37f..15f2f46 100644
--- a/Dropout.lua
+++ b/Dropout.lua
@@ -1,10 +1,11 @@
 local Dropout, Parent = torch.class('nn.Dropout', 'nn.Module')
 
-function Dropout:__init(p,v1,inplace)
+function Dropout:__init(p,v1,inplace,stochasticInference)
    Parent.__init(self)
    self.p = p or 0.5
    self.train = true
    self.inplace = inplace
+   self.stochastic_inference = stochasticInference or false
    -- version 2 scales output during training instead of evaluation
    self.v2 = not v1
    if self.p >= 1 or self.p < 0 then
@@ -20,7 +21,7 @@ function Dropout:updateOutput(input)
       self.output:resizeAs(input):copy(input)
    end
    if self.p > 0 then
-      if self.train then
+      if self.train or self.stochastic_inference then
          self.noise:resizeAs(input)
          self.noise:bernoulli(1-self.p)
          if self.v2 then
diff --git a/JoinTable.lua b/JoinTable.lua
index 0d20fb9..6ab68e1 100644
--- a/JoinTable.lua
+++ b/JoinTable.lua
@@ -11,7 +11,7 @@ end
 function JoinTable:_getPositiveDimension(input)
    local dimension = self.dimension
    if dimension < 0 then
-      dimension = input:dim() + dimension + 1
+      dimension = input[1]:dim() + dimension + 1
    elseif self.nInputDims and input[1]:dim()==(self.nInputDims+1) then
       dimension = dimension + 1
    end
diff --git a/LookupTable.lua b/LookupTable.lua
index 8a60354..cf9c687 100644
--- a/LookupTable.lua
+++ b/LookupTable.lua
@@ -125,7 +125,7 @@ function LookupTable:renorm(input)
    if not self.maxNorm then
       return
    end
-   -- copy input into _input, so _input is continous.
+   -- copy input into _input, so _input is continuous.
    -- The copied _input will be modified in the C code.
    self._input:resize(input:size()):copy(input)
    local row_idx = self._input
diff --git a/MapTable.lua b/MapTable.lua
new file mode 100644
index 0000000..79b967d
--- /dev/null
+++ b/MapTable.lua
@@ -0,0 +1,99 @@
+local MapTable, parent = torch.class('nn.MapTable', 'nn.Container')
+
+function MapTable:__init(module, shared)
+   parent.__init(self)
+   self.shared = shared or {'weight', 'bias', 'gradWeight', 'gradBias'}
+   self.output = {}
+   self.gradInput = {}
+   self:add(module)
+end
+
+function MapTable:_extend(n)
+   self.modules[1] = self.module
+   for i = 2, n do
+      if not self.modules[i] then
+         self.modules[i] = self.module:clone(table.unpack(self.shared))
+      end
+   end
+end
+
+function MapTable:resize(n)
+   self:_extend(n)
+   for i = n + 1, #self.modules do
+      self.modules[i] = nil
+   end
+end
+
+function MapTable:add(module)
+   assert(not self.module, 'Single module required')
+   self.module = module
+   self.modules[1] = self.module
+   return self
+end
+
+function MapTable:updateOutput(input)
+   self.output = {}
+   self:_extend(#input)
+   for i = 1, #input do
+      self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input[i])
+   end
+   return self.output
+end
+
+function MapTable:updateGradInput(input, gradOutput)
+   self.gradInput = {}
+   self:_extend(#input)
+   for i = 1, #input do
+      self.gradInput[i] = self:rethrowErrors(self.modules[i], i, 'updateGradInput', input[i], gradOutput[i])
+   end
+   return self.gradInput
+end
+
+function MapTable:accGradParameters(input, gradOutput, scale)
+   scale = scale or 1
+   self:_extend(#input)
+   for i = 1, #input do
+      self:rethrowErrors(self.modules[i], i, 'accGradParameters', input[i], gradOutput[i], scale)
+   end
+end
+
+function MapTable:accUpdateGradParameters(input, gradOutput, lr)
+   lr = lr or 1
+   self:_extend(#input)
+   for i = 1, #input do
+      self:rethrowErrors(self.modules[i], i, 'accUpdateGradParameters', input[i], gradOutput[i], lr)
+   end
+end
+
+function MapTable:zeroGradParameters()
+    if self.module then
+        self.module:zeroGradParameters()
+    end
+end
+
+function MapTable:updateParameters(learningRate)
+    if self.module then
+        self.module:updateParameters(learningRate)
+    end
+end
+
+function MapTable:clearState()
+   for i = 2, #self.modules do
+      self.modules[i] = nil
+   end
+   parent.clearState(self)
+end
+
+function MapTable:__tostring__()
+   local tab = '  '
+   local line = '\n'
+   local extlast = '      '
+   local str = torch.type(self)
+   if self.module then
+      str = str .. ' {' .. line .. tab
+      str = str .. tostring(self.module):gsub(line, line .. tab .. extlast) .. line .. '}'
+   else
+      str = str .. ' { }'
+   end
+   return str
+end
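
A brief hypothetical usage sketch, not part of the patch: the container clones its member module lazily, one clone per input element, sharing `weight`, `bias`, `gradWeight` and `gradBias` by default.

```lua
require 'nn'
local map = nn.MapTable(nn.Linear(10, 3))
local out = map:forward{torch.rand(10), torch.rand(10), torch.rand(10)}
print(#out)            -- 3: one output tensor per input element
print(#map.modules)    -- 3: the member module plus two parameter-sharing clones
map:clearState()       -- discards the clones, keeping only the member module
```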
diff --git a/Max.lua b/Max.lua
index 691fe9d..1392d8a 100644
--- a/Max.lua
+++ b/Max.lua
@@ -21,7 +21,7 @@ end
 function Max:_lazyInit()
    self._output = self._output or self.output.new()
    self._indices = self._indices or
-      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaTensor() or torch.LongTensor())
+      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaLongTensor() or torch.LongTensor())
 end
 
 function Max:updateOutput(input)
@@ -50,18 +50,9 @@ function Max:updateGradInput(input, gradOutput)
 end
 
 function Max:type(type, tensorCache)
-  -- torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
-  if type == 'torch.CudaTensor' then
+    self._indices = nil
     parent.type(self, type, tensorCache)
-  else
-    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
-    -- unnecessary memory allocations.
-    local indices
-    indices, self._indices = self._indices, nil
-    parent.type(self, type, tensorCache)
-    self._indices = indices and indices:long() or nil
-  end
-  return self
+    return self
 end
 
 function Max:clearState()
diff --git a/Min.lua b/Min.lua
index f1d2b45..dc07cf9 100644
--- a/Min.lua
+++ b/Min.lua
@@ -21,7 +21,7 @@ end
 function Min:_lazyInit()
    self._output = self._output or self.output.new()
    self._indices = self._indices or
-      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaTensor() or torch.LongTensor())
+      (torch.type(self.output) == 'torch.CudaTensor' and torch.CudaLongTensor() or torch.LongTensor())
 end
 
 function Min:updateOutput(input)
@@ -50,18 +50,9 @@ function Min:updateGradInput(input, gradOutput)
 end
 
 function Min:type(type, tensorCache)
-  -- torch.min expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
-  if type == 'torch.CudaTensor' then
+    self._indices = nil
     parent.type(self, type, tensorCache)
-  else
-    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
-    -- unnecessary memory allocations.
-    local indices
-    indices, self._indices = self._indices, nil
-    parent.type(self, type, tensorCache)
-    self._indices = indices and indices:long() or nil
-  end
-  return self
+    return self
 end
 
 function Min:clearState()
diff --git a/Normalize.lua b/Normalize.lua
index 24c1d07..5cd4857 100644
--- a/Normalize.lua
+++ b/Normalize.lua
@@ -25,7 +25,7 @@ function Normalize:updateOutput(input)
     -- specialization for the infinity norm
     self._indices = self._indices or
       (torch.type(self.output) == 'torch.CudaTensor' and
-       torch.CudaTensor() or torch.LongTensor())
+       torch.CudaLongTensor() or torch.LongTensor())
 
     self.buffer:abs(input)
     torch.max(self.norm, self._indices, self.buffer, 2)
@@ -127,18 +127,9 @@ function Normalize:__tostring__()
 end
 
 function Normalize:type(type, tensorCache)
-  -- torch.max expects a LongTensor as indices, whereas cutorch.max expects a CudaTensor.
-  if type == 'torch.CudaTensor' then
+    self._indices = nil
     parent.type(self, type, tensorCache)
-  else
-    -- self._indices must be a LongTensor. Setting it to nil temporarily avoids
-    -- unnecessary memory allocations.
-    local indices
-    indices, self._indices = self._indices, nil
-    parent.type(self, type, tensorCache)
-    self._indices = indices and indices:long() or nil
-  end
-  return self
+    return self
 end
 
 function Normalize:clearState()
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 8324f95..01a08cd 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -89,25 +89,9 @@ local function makeContiguous(self, input, gradOutput)
    return input, gradOutput
 end
 
--- function to re-view the weight layout in a way that would make the MM ops happy
-local function viewWeight(self)
-   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
-   if self.gradWeight and self.gradWeight:dim() > 0 then
-      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
-   end
-end
-
-local function unviewWeight(self)
-   self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
-   if self.gradWeight and self.gradWeight:dim() > 0 then
-      self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
-   end
-end
-
 function SpatialConvolution:updateOutput(input)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    backCompatibility(self)
-   viewWeight(self)
    input = makeContiguous(self, input)
    input.THNN.SpatialConvolutionMM_updateOutput(
       input:cdata(),
@@ -120,7 +104,6 @@ function SpatialConvolution:updateOutput(input)
       self.dW, self.dH,
       self.padW, self.padH
    )
-   unviewWeight(self)
    return self.output
 end
 
@@ -128,20 +111,18 @@ function SpatialConvolution:updateGradInput(input, gradOutput)
    assert(input.THNN, torch.type(input)..'.THNN backend not imported')
    if self.gradInput then
       backCompatibility(self)
-      viewWeight(self)
       input, gradOutput = makeContiguous(self, input, gradOutput)
       input.THNN.SpatialConvolutionMM_updateGradInput(
          input:cdata(),
          gradOutput:cdata(),
          self.gradInput:cdata(),
-         self.weight:cdata(),         
+         self.weight:cdata(),
          self.finput:cdata(),
          self.fgradInput:cdata(),
          self.kW, self.kH,
          self.dW, self.dH,
          self.padW, self.padH
       )
-      unviewWeight(self)
       return self.gradInput
    end
 end
@@ -151,7 +132,6 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
    scale = scale or 1
    backCompatibility(self)
    input, gradOutput = makeContiguous(self, input, gradOutput)
-   viewWeight(self)
    input.THNN.SpatialConvolutionMM_accGradParameters(
       input:cdata(),
       gradOutput:cdata(),
@@ -164,7 +144,6 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
       self.padW, self.padH,
       scale
    )
-   unviewWeight(self)
 end
 
 function SpatialConvolution:type(type,tensorCache)
diff --git a/SpatialDilatedMaxPooling.lua b/SpatialDilatedMaxPooling.lua
index 929459c..2f0eba0 100644
--- a/SpatialDilatedMaxPooling.lua
+++ b/SpatialDilatedMaxPooling.lua
@@ -15,7 +15,7 @@ function SpatialDilatedMaxPooling:updateOutput(input)
    self.iheight = input:size(dims-1)
    self.iwidth = input:size(dims)
 
-   input.THNN.SpatialMaxPooling_updateOutput(
+   input.THNN.SpatialDilatedMaxPooling_updateOutput(
       input:cdata(),
       self.output:cdata(),
       self.indices:cdata(),
@@ -29,7 +29,7 @@ function SpatialDilatedMaxPooling:updateOutput(input)
 end
 
 function SpatialDilatedMaxPooling:updateGradInput(input, gradOutput)
-   input.THNN.SpatialMaxPooling_updateGradInput(
+   input.THNN.SpatialDilatedMaxPooling_updateGradInput(
       input:cdata(),
       gradOutput:cdata(),
       self.gradInput:cdata(),
diff --git a/SpatialDropout.lua b/SpatialDropout.lua
index 35daa18..4320061 100644
--- a/SpatialDropout.lua
+++ b/SpatialDropout.lua
@@ -1,15 +1,16 @@
 local SpatialDropout, Parent = torch.class('nn.SpatialDropout', 'nn.Module')
 
-function SpatialDropout:__init(p)
+function SpatialDropout:__init(p,stochasticInference)
    Parent.__init(self)
    self.p = p or 0.5
    self.train = true
+   self.stochastic_inference = stochasticInference or false
    self.noise = torch.Tensor()
 end
 
 function SpatialDropout:updateOutput(input)
    self.output:resizeAs(input):copy(input)
-   if self.train then
+   if self.train or self.stochastic_inference then
       if input:dim() == 4 then
         self.noise:resize(input:size(1), input:size(2), 1, 1)
       elseif input:dim() == 3 then
@@ -19,7 +20,7 @@ function SpatialDropout:updateOutput(input)
       end
       self.noise:bernoulli(1-self.p)
       -- We expand the random dropouts to the entire feature map because the
-      -- features are likely correlated accross the map and so the dropout
+      -- features are likely correlated across the map and so the dropout
       -- should also be correlated.
       self.output:cmul(torch.expandAs(self.noise, input))
    else
diff --git a/SpatialMaxPooling.lua b/SpatialMaxPooling.lua
index c05a876..8475b13 100644
--- a/SpatialMaxPooling.lua
+++ b/SpatialMaxPooling.lua
@@ -46,7 +46,6 @@ function SpatialMaxPooling:updateOutput(input)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
-      1, 1,
       self.ceil_mode
    )
    return self.output
@@ -61,7 +60,6 @@ function SpatialMaxPooling:updateGradInput(input, gradOutput)
       self.kW, self.kH,
       self.dW, self.dH,
       self.padW, self.padH,
-      1, 1,
       self.ceil_mode
    )
    return self.gradInput
diff --git a/Sum.lua b/Sum.lua
index 5d61c28..9ff73f8 100644
--- a/Sum.lua
+++ b/Sum.lua
@@ -36,8 +36,8 @@ end
 
 function Sum:updateGradInput(input, gradOutput)
     local dimension = self:_getPositiveDimension(input)
-    -- zero-strides dont work with MKL/BLAS, so
-    -- dont set self.gradInput to zero-stride tensor.
+    -- zero-strides don't work with MKL/BLAS, so
+    -- don't set self.gradInput to zero-stride tensor.
     -- Instead, do a deepcopy
     local size      = input:size()
     size[dimension] = 1
diff --git a/THNN.lua b/THNN.lua
index e18dbaa..9100239 100644
--- a/THNN.lua
+++ b/THNN.lua
@@ -2,6 +2,7 @@ local ffi = require 'ffi'
 
 local THNN = {}
 
+
 local generic_THNN_h = require 'nn.THNN_h'
 -- strip all lines starting with #
 -- to remove preprocessor directives originally present
diff --git a/TemporalDynamicKMaxPooling.lua b/TemporalDynamicKMaxPooling.lua
new file mode 100644
index 0000000..511275b
--- /dev/null
+++ b/TemporalDynamicKMaxPooling.lua
@@ -0,0 +1,65 @@
+--[[
+   This file implements Dynamic K Max Pooling as described in the paper:
+   "A Convolutional Neural Network for Modelling Sentences"
+                   by Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom
+   
+   The operation is simply selecting the k highest values out of a sequence.
+   k can be a calculated value or pre-defined.
+
+   The value of k can be calculated as in the paper by using:
+      k_top as minK
+      (L-l)/L as factor
+   
+   Where:
+      k_top is the desired sequence length at the end of the convolution part,
+      L is the total number of layers,
+      l is this layer's number
+]]
+
+local TemporalDynamicKMaxPooling, parent = torch.class('nn.TemporalDynamicKMaxPooling', 'nn.Module')
+
+function TemporalDynamicKMaxPooling:__init(minK, factor)
+   parent.__init(self)
+
+   self.minK = minK
+   self.factor = factor or 0
+end
+
+function TemporalDynamicKMaxPooling:updateOutput(input)
+   assert(input:dim() == 2 or input:dim() == 3, 'Only 2D or 3D(batch mode) accepted')
+
+   local seqDim = input:dim()-1
+   local k = math.max(self.minK, math.ceil(self.factor*input:size(seqDim)))
+   assert(input:size(seqDim) >= self.minK, 'Input sequence length (' .. input:size(seqDim) .. ') too small for desired k value (' .. k .. ')')
+
+   -- Sort input in descending order
+   local sorted, allIndices = input:sort(seqDim,true)
+   -- Reduce the indices to only include the top-k and return to original order by sorting
+   self.indices = allIndices:narrow(seqDim, 1, k):sort(seqDim)
+
+   self.output = input:gather(seqDim, self.indices)
+
+   return self.output
+end
+
+function TemporalDynamicKMaxPooling:updateGradInput(input, gradOutput)
+   if self.gradInput then
+      local seqDim = input:dim()-1
+
+      self.gradInput:resizeAs(input)
+      self.gradInput:zero()
+
+      -- Using the previously stored indices, add the gradOutputs to their respective
+      -- input indices in the self.gradInput buffer
+      local updateValues = self.gradInput:gather(seqDim, self.indices)
+      updateValues:add(gradOutput)
+      self.gradInput:scatter(seqDim, self.indices, updateValues)
+
+      return self.gradInput
+   end
+end
+
+function TemporalDynamicKMaxPooling:clearState()
+   nn.utils.clear(self, 'indices')
+   return parent.clearState(self)
+end
diff --git a/Threshold.lua b/Threshold.lua
index 0c22bae..6fdd264 100644
--- a/Threshold.lua
+++ b/Threshold.lua
@@ -34,6 +34,7 @@ function Threshold:updateGradInput(input, gradOutput)
       gradOutput:cdata(),
       self.gradInput:cdata(),
       self.threshold,
+      self.val,
       self.inplace
    )
    return self.gradInput
diff --git a/VolumetricDilatedMaxPooling.lua b/VolumetricDilatedMaxPooling.lua
new file mode 100644
index 0000000..050e2c9
--- /dev/null
+++ b/VolumetricDilatedMaxPooling.lua
@@ -0,0 +1,64 @@
+local THNN = require 'nn.THNN'
+local VolumetricDilatedMaxPooling, parent = torch.class('nn.VolumetricDilatedMaxPooling', 'nn.VolumetricMaxPooling')
+
+function VolumetricDilatedMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH)
+   parent.__init(self, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+
+   self.dilationT = dilationT or 1
+   self.dilationW = dilationW or 1
+   self.dilationH = dilationH or 1
+
+end
+
+function VolumetricDilatedMaxPooling:updateOutput(input)
+   local dims = input:dim()
+   self.itime = input:size(dims-2)
+   self.iheight = input:size(dims-1)
+   self.iwidth = input:size(dims)
+
+   self.indices = self.indices or input.new()
+   input.THNN.VolumetricDilatedMaxPooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.indices:cdata(),
+      self.kT, self.kW, self.kH,
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH,
+      self.ceil_mode
+   )
+   return self.output
+end
+
+function VolumetricDilatedMaxPooling:updateGradInput(input, gradOutput)
+   input.THNN.VolumetricDilatedMaxPooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      self.indices:cdata(),
+      self.dT, self.dW, self.dH,
+      self.padT, self.padW, self.padH,
+      self.dilationT, self.dilationW, self.dilationH
+   )
+   return self.gradInput
+end
+
+function VolumetricDilatedMaxPooling:clearState()
+   if self.indices then 
+      self.indices:set() 
+   end
+   return parent.clearState(self)
+end
+
+function VolumetricDilatedMaxPooling:__tostring__()
+   local s =  string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
+                            self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
+   if (self.padT or self.padW or self.padH) and
+      (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+      s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
+   end
+   s = s .. ', ' .. self.dilationT .. ',' .. self.dilationW .. ',' .. self.dilationH
+   s = s .. ')'
+
+   return s
+end
diff --git a/VolumetricDropout.lua b/VolumetricDropout.lua
index 5f495af..809e28a 100644
--- a/VolumetricDropout.lua
+++ b/VolumetricDropout.lua
@@ -1,15 +1,16 @@
 local VolumetricDropout, Parent = torch.class('nn.VolumetricDropout', 'nn.Module')
 
-function VolumetricDropout:__init(p)
+function VolumetricDropout:__init(p,stochasticInference)
    Parent.__init(self)
    self.p = p or 0.5
    self.train = true
+   self.stochastic_inference = stochasticInference or false
    self.noise = torch.Tensor()
 end
 
 function VolumetricDropout:updateOutput(input)
    self.output:resizeAs(input):copy(input)
-   if self.train then
+   if self.train or self.stochastic_inference then
       if input:dim() == 5 then
         self.noise:resize(input:size(1), input:size(2), 1, 1, 1)
       elseif input:dim() == 4 then
@@ -19,7 +20,7 @@ function VolumetricDropout:updateOutput(input)
       end
       self.noise:bernoulli(1-self.p)
       -- We expand the random dropouts to the entire feature map because the
-      -- features are likely correlated accross the map and so the dropout
+      -- features are likely correlated across the map and so the dropout
       -- should also be correlated.
       self.output:cmul(torch.expandAs(self.noise, input))
    else
diff --git a/doc/convolution.md b/doc/convolution.md
index 96d92d9..b1a0d4c 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -37,6 +37,7 @@ a kernel for computing the weighted average in a neighborhood ;
     * [VolumetricFullConvolution](#nn.VolumetricFullConvolution) : a 3D full convolution over an input video (a sequence of images) ;
     * [VolumetricDilatedConvolution](#nn.VolumetricDilatedConvolution) : a 3D dilated convolution over an input image ;
     * [VolumetricMaxPooling](#nn.VolumetricMaxPooling) : a 3D max-pooling operation over an input video.
+    * [VolumetricDilatedMaxPooling](#nn.VolumetricDilatedMaxPooling) : a 3D dilated max-pooling operation over an input video ;
     * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
     * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation.
     * [VolumetricReplicationPadding](#nn.VolumetricReplicationPadding) : Pads a volumetric feature map with the value at the edge of the input borders. ;
@@ -1022,6 +1023,30 @@ Applies 3D max-pooling operation in `kTxkWxkH` regions by step size
 `dTxdWxdH` steps. The number of output features is equal to the number of
 input planes / dT. The input can optionally be padded with zeros. Padding should be smaller than half of kernel size.  That is, `padT < kT/2`, `padW < kW/2` and `padH < kH/2`.
 
+<a name="nn.VolumetricDilatedMaxPooling"></a>
+### VolumetricDilatedMaxPooling ###
+
+```lua
+module = nn.VolumetricDilatedMaxPooling(kT, kW, kH [, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH])
+```
+
+Also sometimes referred to as **atrous pooling**.
+Applies 3D dilated max-pooling operation in `kTxkWxkH` regions by step size
+`dTxdWxdH` steps. The number of output features is equal to the number of
+input planes. If `dilationT`, `dilationW` and `dilationH` are not provided, this is equivalent to performing normal `nn.VolumetricMaxPooling`.
+
+If the input image is a 4D tensor `nInputPlane x depth x height x width`, the output
+image size will be `nOutputPlane x otime x oheight x owidth` where
+
+```lua
+otime  = op((depth - (dilationT * (kT - 1) + 1) + 2*padT) / dT + 1)
+owidth  = op((width - (dilationW * (kW - 1) + 1) + 2*padW) / dW + 1)
+oheight = op((height - (dilationH * (kH - 1) + 1) + 2*padH) / dH + 1)
+```
+
+`op` is a rounding operator. By default, it is `floor`. It can be changed
+by calling `:ceil()` or `:floor()` methods.
+
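
As a hypothetical illustration of the size formula above (not part of the patch), with a cubic input of side 16, kernel 2, stride 2, no padding and dilation 2:

```lua
require 'nn'
local pool = nn.VolumetricDilatedMaxPooling(2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2)
local input = torch.rand(3, 16, 16, 16)    -- nInputPlane x depth x height x width
print(pool:forward(input):size())
-- 3 x 7 x 7 x 7, since floor((16 - (2*(2-1) + 1) + 0)/2 + 1) = 7 in each pooled dim
```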
 <a name="nn.VolumetricAveragePooling"></a>
 ### VolumetricAveragePooling ###
 
diff --git a/doc/simple.md b/doc/simple.md
index 6f01a56..302e4d8 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -27,8 +27,8 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [Unsqueeze](#nn.Unsqueeze) : unsqueeze the input, i.e., insert singleton dimension;  
     * [Transpose](#nn.Transpose) : [transposes](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-transposedim1-dim2) the input ;
   * Modules that adapt mathematical Tensor methods :
-    * [AddConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#nn.AddConstant) : adding a constant ;
-    * [MulConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#nn.MulConstant) : multiplying a constant ;
+    * [AddConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#addconstant) : adding a constant ;
+    * [MulConstant](https://github.com/torch/nn/blob/master/doc/transfer.md#mulconstant) : multiplying a constant ;
     * [Max](#nn.Max) : a [max](https://github.com/torch/torch7/blob/master/doc/maths.md#torch.max) operation over a given dimension ;
     * [Min](#nn.Min) : a [min](https://github.com/torch/torch7/blob/master/doc/maths.md#torchminresval-resind-x) operation over a given dimension ;
     * [Mean](#nn.Mean) : a [mean](https://github.com/torch/torch7/blob/master/doc/maths.md#res-torchmeanres-x-dim) operation over a given dimension ;
@@ -52,6 +52,7 @@ Simple Modules are used for various tasks like adapting Tensor methods and provi
     * [L1Penalty](#nn.L1Penalty) : adds an L1 penalty to an input (for sparsity) ;
     * [GradientReversal](#nn.GradientReversal) : reverses the gradient (to maximize an objective function) ;
     * [GPU](#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
+    * [TemporalDynamicKMaxPooling](#nn.TemporalDynamicKMaxPooling) : selects the k highest values in a sequence. k can be calculated based on sequence length ;
 
 <a name="nn.Linear"></a>
 ## Linear ##
@@ -138,7 +139,7 @@ module = nn.Bilinear(inputDimension1, inputDimension2, outputDimension, [bias =
 ```
 
 Applies a bilinear transformation to the incoming data, i.e. `\forall k: y_k = x_1 A_k x_2 + b`. The `input` tensor given in `forward(input)` is a table containing both inputs `x_1` and `x_2`, which are tensors of size `N x inputDimension1`
-and `N x inputDimension1`, respectively. The layer can be trained without biases by setting `bias = false`.
+and `N x inputDimension2`, respectively. The layer can be trained without biases by setting `bias = false`.
 
 You can create a layer in the following way:
 
@@ -246,6 +247,19 @@ During [evaluation](module.md#evaluate), `Dropout` does nothing more than forwar
 [torch.DoubleTensor of dimension 2x4]
 ```
 
+There is also an option for stochastic [evaluation](module.md#evaluate), which drops `outputs` just as it does during [training](module.md#training):
+
+```lua
+module_stochastic_evaluation = nn.Dropout(nil, nil, nil, true)
+
+> module_stochastic_evaluation:evaluate()
+
+> module_stochastic_evaluation:forward(x)
+  2   4   6   0
+  0  12  14   0
+[torch.DoubleTensor of dimension 2x4]
+```
+
 We can return to training our model by first calling [Module:training()](module.md#training):
 
 ```lua
@@ -404,15 +418,16 @@ module = nn.CMul(size)
 ```
 
 Applies a component-wise multiplication to the incoming data, i.e. `y_i = w_i * x_i`. Argument `size` can be one or many numbers (sizes) or a `torch.LongStorage`. For example, `nn.CMul(3,4,5)` is equivalent to `nn.CMul(torch.LongStorage{3,4,5})`.
+If the size for a particular dimension is 1, the multiplication will be expanded along the entire axis.
 
 Example:
 
 ```lua
 mlp = nn.Sequential()
-mlp:add(nn.CMul(5))
+mlp:add(nn.CMul(5, 1))
 
-y = torch.Tensor(5)
-sc = torch.Tensor(5)
+y = torch.Tensor(5, 4)
+sc = torch.Tensor(5, 4)
 for i = 1, 5 do sc[i] = i; end -- scale input with this
 
 function gradUpdate(mlp, x, y, criterion, learningRate)
@@ -426,7 +441,7 @@ function gradUpdate(mlp, x, y, criterion, learningRate)
 end
 
 for i = 1, 10000 do
-   x = torch.rand(5)
+   x = torch.rand(5, 4)
    y:copy(x)
    y:cmul(sc)
    err = gradUpdate(mlp, x, y, nn.MSECriterion(), 0.01)
@@ -443,7 +458,7 @@ gives the output:
  3.0000
  4.0000
  5.0000
-[torch.Tensor of dimension 5]
+[torch.Tensor of dimension 5x1]
 ```
 
 i.e. the network successfully learns the input `x` has been scaled by those scaling factors to produce the output `y`.
@@ -598,7 +613,7 @@ end
 module = nn.Copy(inputType, outputType, [forceCopy, dontCast])
 ```
 
-This layer copies the input to output with type casting from `inputType` to `outputType`. Unless `forceCopy` is true, when the first two arguments are the same, the input isn't copied, only transfered as the output. The default `forceCopy` is false.
+This layer copies the input to output with type casting from `inputType` to `outputType`. Unless `forceCopy` is true, when the first two arguments are the same, the input isn't copied, only transferred as the output. The default `forceCopy` is false.
 When `dontCast` is true, a call to `nn.Copy:type(type)` will not cast the module's `output` and `gradInput` Tensors to the new type. The default is false.
 
 <a name="nn.Narrow"></a>
@@ -1039,7 +1054,7 @@ Setting `numInputDims` allows to use this module on batches.
 ```lua
 module = nn.Unsqueeze(pos [, numInputDims])
 ```
-Insert singleton dim (i.e., dimension 1) at position `pos`. 
+Insert singleton dim (i.e., dimension 1) at position `pos`.
 For an `input` with `dim = input:dim()`, there are `dim + 1` possible positions to insert the singleton dimension.
 For example, if `input` is `3` dimensional tensor in size `p x q x r`, then the singleton dim can be inserted at the following `4` positions
 ```
@@ -1070,7 +1085,7 @@ input2 = torch.Tensor(3, 5, 7) -- input2: 3 x 5 x 7
 m:forward(input2) -- output: 3 x 1 x 5 x 7
 ```
 
-Indicate the expected input feature map dimension by specifying `numInputDims`. 
+Indicate the expected input feature map dimension by specifying `numInputDims`.
 This allows the module to work with mini-batch. Example:
 ```lua
 b = 5 -- batch size 5
@@ -1413,14 +1428,14 @@ to set the hyper-parameter `lambda` dynamically during training.
 gpu = nn.GPU(module, device, [outdevice])
 require 'cunn'
 gpu:cuda()
-``` 
+```
 
 Decorates an encapsulated `module` so that it can be executed on a specific GPU `device`.
 The decorated module's `parameters` are thus hosted on the specified GPU `device`.
 All operations on the `gpu` module are executed on that device.
-Calls to `forward`/`backward` will transfer arguments `input` and `gradOutput` to the specified `device`, 
-which are then fed as arguments to the decorated `module`. 
-Returned `output` is located on the specified `outdevice` (defaults to `device`). 
+Calls to `forward`/`backward` will transfer arguments `input` and `gradOutput` to the specified `device`,
+which are then fed as arguments to the decorated `module`.
+Returned `output` is located on the specified `outdevice` (defaults to `device`).
 Returned `gradInput` is allocated on the same device as the `input`.
 
 When serialized/deserialized, the `gpu` module will be run on the same `device` that it was serialized with.
@@ -1429,16 +1444,16 @@ To prevent this from happening, the module can be converted to float/double befo
 ```lua
 gpu:float()
 gpustr = torch.serialize(gpu)
-``` 
+```
 
 The module is located in the __nn__ package instead of __cunn__ as this allows
-it to be used in CPU-only enviroments, which are common for production models.
+it to be used in CPU-only environments, which are common for production models.
 
 The module supports nested table `input` and `gradOutput` tensors originating from multiple devices.
-Each nested tensor in the returned `gradInput` will be transfered to the device its commensurate tensor in the `input`.
+Each nested tensor in the returned `gradInput` will be transferred to the device of its commensurate tensor in the `input`.
 
-The intended use-case is not for model-parallelism where the models are executed in parallel on multiple devices, but 
-for sequential models where a single GPU doesn't have enough memory. 
+The intended use-case is not for model-parallelism where the models are executed in parallel on multiple devices, but
+for sequential models where a single GPU doesn't have enough memory.
 
 Example using 4 GPUs:
 
@@ -1448,7 +1463,20 @@ mlp = nn.Sequential()
    :add(nn.GPU(nn.Linear(10000,10000), 2))
    :add(nn.GPU(nn.Linear(10000,10000), 3))
    :add(nn.GPU(nn.Linear(10000,10000), 4, cutorch.getDevice()))
-``` 
+```
 
 Note how the last `GPU` instance will return an `output` tensor on the same device as the current device (`cutorch.getDevice`).
- 
+
+<a name="nn.TemporalDynamicKMaxPooling"></a>
+## TemporalDynamicKMaxPooling ##
+
+```lua
+module = nn.TemporalDynamicKMaxPooling(minK, [factor])
+```
+
+Selects the highest `k` values for each feature in the feature map sequence provided. The input sequence is composed of `nInputFrame` frames (i.e. `nInputFrame` is sequence length). The `input` tensor in `forward(input)` is expected to be a 2D tensor (`nInputFrame x inputFrameSize`) or a 3D tensor (`nBatchFrame x nInputFrame x inputFrameSize`), where `inputFrameSize` is the number of features across the sequence.
+
+If `factor` is not provided, `k = minK`; otherwise `k` is calculated with:
+```lua
+k = math.max(minK, math.ceil(factor*nInputFrame))
+```
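
A short hypothetical usage sketch, not part of the patch, with `minK = 2` and `factor = 0.5` on an 8-frame sequence:

```lua
require 'nn'
local pool = nn.TemporalDynamicKMaxPooling(2, 0.5)
local input = torch.rand(8, 16)     -- nInputFrame x inputFrameSize
print(pool:forward(input):size())
-- 4 x 16, since k = math.max(2, math.ceil(0.5 * 8)) = 4
```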
diff --git a/doc/table.md b/doc/table.md
index a2e23f8..ee61719 100644
--- a/doc/table.md
+++ b/doc/table.md
@@ -7,6 +7,7 @@ This allows one to build very rich architectures:
   * `table` Container Modules encapsulate sub-Modules:
     * [`ConcatTable`](#nn.ConcatTable): applies each member module to the same input     [`Tensor`](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor) and outputs a `table`;
     * [`ParallelTable`](#nn.ParallelTable): applies the `i`-th member module to the `i`-th input and outputs a `table`;
+    * [`MapTable`](#nn.MapTable): applies a single module to every input and outputs a `table`;
   * Table Conversion Modules convert between `table`s and `Tensor`s or `table`s:
     * [`SplitTable`](#nn.SplitTable): splits a `Tensor` into a `table` of `Tensor`s;
     * [`JoinTable`](#nn.JoinTable): joins a `table` of `Tensor`s into a `Tensor`;
@@ -23,6 +24,8 @@ This allows one to build very rich architectures:
     * [`CSubTable`](#nn.CSubTable): substraction of input `Tensor`s;
     * [`CMulTable`](#nn.CMulTable): multiplication of input `Tensor`s;
     * [`CDivTable`](#nn.CDivTable): division of input `Tensor`s;
+    * [`CMaxTable`](#nn.CMaxTable): max of input `Tensor`s;
+    * [`CMinTable`](#nn.CMinTable): min of input `Tensor`s;
   * `Table` of Criteria:
     * [`CriterionTable`](#nn.CriterionTable): wraps a [Criterion](criterion.md#nn.Criterion) so that it can accept a `table` of inputs.
 
@@ -165,6 +168,57 @@ which gives the output:
 ```
 
 
+<a name="nn.MapTable"></a>
+## MapTable ##
+
+```lua
+module = nn.MapTable(m, share)
+```
+
+`MapTable` is a container for a single module which will be applied to all input elements. The member module is cloned as necessary to process all input elements. Call `resize(n)` to set the number of clones manually or call `clearState()` to discard all clones.
+
+Optionally, the module can be initialized with the contained module and with a list of parameters that are shared across all clones. By default, these parameters are `weight`, `bias`, `gradWeight` and `gradBias`.
+
+```
++----------+         +-----------+
+| {input1, +---------> {member,  |
+|          |         |           |
+|  input2, +--------->  clone,   |
+|          |         |           |
+|  input3} +--------->  clone}   |
++----------+         +-----------+
+```
+
+### Example
+
+```lua
+map = nn.MapTable()
+map:add(nn.Linear(10, 3))
+
+x1 = torch.rand(10)
+x2 = torch.rand(10)
+y = map:forward{x1, x2}
+
+for i, k in pairs(y) do print(i, k) end
+```
+
+which gives the output:
+
+```lua
+1
+ 0.0345
+ 0.8695
+ 0.6502
+[torch.DoubleTensor of size 3]
+
+2
+ 0.0269
+ 0.4953
+ 0.2691
+[torch.DoubleTensor of size 3]
+```
+
+
 <a name="nn.SplitTable"></a>
 ## SplitTable ##
 
@@ -1212,3 +1266,30 @@ m = nn.CDivTable()
 [torch.DoubleTensor of dimension 5]
 ```
 
+<a name="nn.CMaxTable"></a>
+## CMaxTable ##
+
+Takes a `table` of `Tensor`s and outputs the max of all of them.
+
+```lua
+m = nn.CMaxTable()
+=m:forward({torch.Tensor{1,2,3}, torch.Tensor{3,2,1}})
+ 3
+ 2
+ 3
+[torch.DoubleTensor of size 3]
+```
+
+<a name="nn.CMinTable"></a>
+## CMinTable ##
+
+Takes a `table` of `Tensor`s and outputs the min of all of them.
+
+```lua
+m = nn.CMinTable()
+=m:forward({torch.Tensor{1,2,3}, torch.Tensor{3,2,1}})
+ 1
+ 2
+ 1
+[torch.DoubleTensor of size 3]
+```
diff --git a/init.lua b/init.lua
index a9c68da..70027a1 100644
--- a/init.lua
+++ b/init.lua
@@ -54,6 +54,8 @@ require('nn.CAddTable')
 require('nn.CDivTable')
 require('nn.CMulTable')
 require('nn.CSubTable')
+require('nn.CMaxTable')
+require('nn.CMinTable')
 
 require('nn.Euclidean')
 require('nn.WeightedEuclidean')
@@ -110,6 +112,7 @@ require('nn.SpatialAdaptiveMaxPooling')
 require('nn.TemporalConvolution')
 require('nn.TemporalSubSampling')
 require('nn.TemporalMaxPooling')
+require('nn.TemporalDynamicKMaxPooling')
 require('nn.SpatialSubtractiveNormalization')
 require('nn.SpatialDivisiveNormalization')
 require('nn.SpatialContrastiveNormalization')
@@ -125,6 +128,7 @@ require('nn.VolumetricConvolution')
 require('nn.VolumetricFullConvolution')
 require('nn.VolumetricDilatedConvolution')
 require('nn.VolumetricMaxPooling')
+require('nn.VolumetricDilatedMaxPooling')
 require('nn.VolumetricMaxUnpooling')
 require('nn.VolumetricAveragePooling')
 require('nn.VolumetricBatchNormalization')
@@ -142,6 +146,7 @@ require('nn.MixtureTable')
 require('nn.CriterionTable')
 require('nn.FlattenTable')
 require('nn.NarrowTable')
+require('nn.MapTable')
 
 require('nn.Criterion')
 require('nn.MSECriterion')
diff --git a/lib/THNN/generic/BCECriterion.c b/lib/THNN/generic/BCECriterion.c
new file mode 100644
index 0000000..c8d7da2
--- /dev/null
+++ b/lib/THNN/generic/BCECriterion.c
@@ -0,0 +1,50 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BCECriterion.c"
+#else
+
+#define EPS 1e-12
+
+void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input, THTensor *target, THTensor *output, bool sizeAverage, THTensor *weights)
+{
+  real sum = 0;
+
+  if(weights)
+    TH_TENSOR_APPLY3(real, input, real, target, real, weights,
+      real x = *input_data;
+      real y = *target_data;
+      real w = *weights_data;
+      sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w;
+    )
+  else
+    TH_TENSOR_APPLY2(real, input, real, target,
+      real x = *input_data;
+      real y = *target_data;
+      sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. - y);
+    );
+
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input, THTensor *target, THTensor *gradInput, bool sizeAverage, THTensor *weights)
+{
+  real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+  THTensor_(resizeAs)(gradInput, input);
+
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    real x = *input_data;
+    real y = *target_data;
+    *gradInput_data = - norm * (y - x) / ((1. - x + EPS) * (x + EPS));
+  );
+
+  if(weights)
+    THTensor_(cmul)(gradInput, gradInput, weights);
+}
+
+#undef EPS
+
+#endif
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
index 3b7ba3d..f360068 100644
--- a/lib/THNN/generic/HardTanh.c
+++ b/lib/THNN/generic/HardTanh.c
@@ -85,7 +85,7 @@ void THNN_(HardTanh_updateGradInput)(
     if (inplace)
     {
       TH_TENSOR_APPLY2(real, gradOutput, real, input,
-        if (*input_data < min_val || *input_data > max_val)
+        if (*input_data <= min_val || *input_data >= max_val)
           *gradOutput_data = 0;
       );
     }
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index e7460c8..64aa9db 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -67,9 +67,12 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
   long outputWidth;
   long outputHeight;
 
+  int freeWeight = 0;
+
   THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
   THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
 
   if (input->nDimension == 4) {
     dimf++;
@@ -88,8 +91,19 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
     THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
         nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
 
-  if (nInputPlane*kW*kH != weight->size[1])
-    THError("Wrong number of input channels! Input has %d channels, expected %d",nInputPlane,weight->size[1]/(kW*kH));
+
+  int expectedWeightSize = weight->nDimension == 2 ? nInputPlane*kW*kH : nInputPlane;
+  int weightInputPlanes = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
+  if (expectedWeightSize != weight->size[1])
+    THError("Wrong number of input channels! Input has %d channels, expected %d",
+        nInputPlane, weightInputPlanes);
+
+  if (weight->nDimension == 4) {
+    long s1 = weight->size[0];
+    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+    weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
 
   if(input->nDimension == 3)
   {
@@ -126,6 +140,9 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
       THTensor_(free)(finput_t);
     }
   }
+
+  if (freeWeight)
+    THTensor_(free)(weight);
 }
 
 static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
@@ -167,17 +184,27 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
           int padH)
 {
   long nOutputPlane = weight->size[0];
+  int freeWeight = 0;
 
   THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
   THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
 
   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);
   // depending on the BLAS library, fgradInput (result tensor) might
   // be left uninitialized on zero alpha, which might lead to weird behavior
   // hence, to be safe, zero it
-  THTensor_(zero)(fgradInput); 
+  THTensor_(zero)(fgradInput);
+
+  if (weight->nDimension == 4) {
+    long s1 = weight->size[0];
+    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+    weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
+
   THTensor_(transpose)(weight, weight, 0, 1);
 
   if(input->nDimension == 3)
@@ -205,6 +232,9 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
   }
 
   THTensor_(transpose)(weight, weight, 0, 1);
+
+  if (freeWeight)
+    THTensor_(free)(weight);
 }
 
 static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
@@ -254,10 +284,19 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
           int padH,
           real scale)
 {
+  int freeWeight = 0;
   long nOutputPlane = gradWeight->size[0];
   THArgCheck( nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 3, "Number of output features is not equal to nOutputPlane" );
   THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
   THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
+
+  if (gradWeight->nDimension == 4) {
+    long s1 = gradWeight->size[0];
+    long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
+    gradWeight = THTensor_(newWithStorage2d)(gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
 
   if(input->nDimension == 3)
   {
@@ -279,6 +318,8 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
       THTensor_(free)(finput_t);
     }
   }
+  if (freeWeight)
+    THTensor_(free)(gradWeight);
 }
 
 #endif
diff --git a/lib/THNN/generic/SpatialMaxPooling.c b/lib/THNN/generic/SpatialDilatedMaxPooling.c
similarity index 91%
copy from lib/THNN/generic/SpatialMaxPooling.c
copy to lib/THNN/generic/SpatialDilatedMaxPooling.c
index 3daef1d..6500f49 100644
--- a/lib/THNN/generic/SpatialMaxPooling.c
+++ b/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -1,8 +1,8 @@
 #ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
+#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
 #else
 
-static void THNN_(SpatialMaxPooling_updateOutput_frame)(
+static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           real *ind_p,
@@ -74,7 +74,7 @@ static void THNN_(SpatialMaxPooling_updateOutput_frame)(
   }
 }
 
-void THNN_(SpatialMaxPooling_updateOutput)(
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
@@ -155,7 +155,7 @@ void THNN_(SpatialMaxPooling_updateOutput)(
     output_data = THTensor_(data)(output);
     indices_data = THTensor_(data)(indices);
 
-    THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data,
+    THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(input_data, output_data,
                                               indices_data,
                                               nslices,
                                               iwidth, iheight,
@@ -180,7 +180,7 @@ void THNN_(SpatialMaxPooling_updateOutput)(
 #pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
-      THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
+      THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
                                                 indices_data+p*nslices*owidth*oheight,
                                                 nslices,
                                                 iwidth, iheight,
@@ -196,7 +196,7 @@ void THNN_(SpatialMaxPooling_updateOutput)(
   THTensor_(free)(input);
 }
 
-static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
+static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           real *ind_p,
@@ -231,7 +231,7 @@ static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
   }
 }
 
-void THNN_(SpatialMaxPooling_updateGradInput)(
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
@@ -287,7 +287,7 @@ void THNN_(SpatialMaxPooling_updateGradInput)(
   /* backprop */
   if (input->nDimension == 3)
   {
-    THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+    THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
                                                  indices_data,
                                                  nslices,
                                                  iwidth, iheight,
@@ -300,7 +300,7 @@ void THNN_(SpatialMaxPooling_updateGradInput)(
 #pragma omp parallel for private(p)
     for (p = 0; p < nbatch; p++)
     {
-      THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+      THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
                                                    indices_data+p*nslices*owidth*oheight,
                                                    nslices,
                                                    iwidth, iheight,
diff --git a/lib/THNN/generic/SpatialMaxPooling.c b/lib/THNN/generic/SpatialMaxPooling.c
index 3daef1d..e0fafb1 100644
--- a/lib/THNN/generic/SpatialMaxPooling.c
+++ b/lib/THNN/generic/SpatialMaxPooling.c
@@ -2,78 +2,6 @@
 #define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
 #else
 
-static void THNN_(SpatialMaxPooling_updateOutput_frame)(
-          real *input_p,
-          real *output_p,
-          real *ind_p,
-          long nslices,
-          long iwidth,
-          long iheight,
-          long owidth,
-          long oheight,
-          int kW,
-          int kH,
-          int dW,
-          int dH,
-          int padW,
-          int padH,
-          int dilationW,
-          int dilationH
-          )
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    /* loop over output */
-    long i, j;
-    real *ip = input_p   + k*iwidth*iheight;
-    for(i = 0; i < oheight; i++)
-    {
-      for(j = 0; j < owidth; j++)
-      {
-        long hstart = i * dH - padH;
-        long wstart = j * dW - padW;
-        long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight);
-        long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth);
-        while(hstart < 0)
-          hstart += dilationH;
-        while(wstart < 0)
-          wstart += dilationW;
-
-        /* local pointers */
-        real *op = output_p  + k*owidth*oheight + i*owidth + j;
-        real *indp = ind_p   + k*owidth*oheight + i*owidth + j;
-
-        /* compute local max: */
-        long maxindex = -1;
-        real maxval = -THInf;
-        long tcntr = 0;
-        long x,y;
-        for(y = hstart; y < hend; y += dilationH)
-        {
-          for(x = wstart; x < wend; x += dilationW)
-          {
-            tcntr = y*iwidth + x;
-            real val = *(ip + tcntr);
-            if (val > maxval)
-            {
-              maxval = val;
-              maxindex = tcntr;
-            }
-          }
-        }
-
-        /* set output to local max */
-        *op = maxval;
-
-        /* store location of max */
-        *indp = maxindex + TH_INDEX_BASE;
-      }
-    }
-  }
-}
-
 void THNN_(SpatialMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -85,150 +13,12 @@ void THNN_(SpatialMaxPooling_updateOutput)(
           int dH,
           int padW,
           int padH,
-          int dilationW,
-          int dilationH,
           bool ceil_mode)
 {
-  int dimw = 2;
-  int dimh = 1;
-  long nbatch = 1;
-  long nslices;
-  long iheight;
-  long iwidth;
-  long oheight;
-  long owidth;
-  real *input_data;
-  real *output_data;
-  real *indices_data;
-
-
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
-
-  if (input->nDimension == 4)
-  {
-    nbatch = input->size[0];
-    dimw++;
-    dimh++;
-  }
-  THArgCheck(input->size[dimw] >= kW - padW && input->size[dimh] >= kH - padH, 2, "input image smaller than kernel size");
-  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-  
-  /* sizes */
-  nslices = input->size[dimh-1];
-  iheight = input->size[dimh];
-  iwidth = input->size[dimw];
-  if (ceil_mode)
-  {
-    oheight = (long)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
-    owidth  = (long)(ceil((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
-  }
-  else
-  {
-    oheight = (long)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
-    owidth  = (long)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
-  }
-
-  if (owidth < 1 || oheight < 1)
-    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
-            nslices,iheight,iwidth,nslices,oheight,owidth);
-
-  if (padW || padH)
-  {
-    // ensure that the last pooling starts inside the image
-    if ((oheight - 1)*dH >= iheight + padH)
-      --oheight;
-    if ((owidth  - 1)*dW >= iwidth  + padW)
-      --owidth;
-  }
-
-  /* get contiguous input */
-  input = THTensor_(newContiguous)(input);
-
-  /* resize output */
-  if (input->nDimension == 3)
-  {
-    THTensor_(resize3d)(output, nslices, oheight, owidth);
-    /* indices will contain the locations for each output point */
-    THTensor_(resize3d)(indices,  nslices, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-    THNN_(SpatialMaxPooling_updateOutput_frame)(input_data, output_data,
-                                              indices_data,
-                                              nslices,
-                                              iwidth, iheight,
-                                              owidth, oheight,
-                                              kW, kH, dW, dH,
-                                              padW, padH,
-                                              dilationW, dilationH
-                                              );
-  }
-  else
-  {
-    long p;
-
-    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
-    /* indices will contain the locations for each output point */
-    THTensor_(resize4d)(indices, nbatch, nslices, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-#pragma omp parallel for private(p)
-    for (p = 0; p < nbatch; p++)
-    {
-      THNN_(SpatialMaxPooling_updateOutput_frame)(input_data+p*nslices*iwidth*iheight, output_data+p*nslices*owidth*oheight,
-                                                indices_data+p*nslices*owidth*oheight,
-                                                nslices,
-                                                iwidth, iheight,
-                                                owidth, oheight,
-                                                kW, kH, dW, dH,
-                                                padW, padH,
-                                                dilationW, dilationH
-                                                );
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(input);
-}
-
-static void THNN_(SpatialMaxPooling_updateGradInput_frame)(
-          real *gradInput_p,
-          real *gradOutput_p,
-          real *ind_p,
-          long nslices,
-          long iwidth,
-          long iheight,
-          long owidth,
-          long oheight,
-          int dW,
-          int dH)
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
-    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
-    real *ind_p_k = ind_p + k*owidth*oheight;
-
-    /* calculate max points */
-    long i, j;
-    for(i = 0; i < oheight; i++)
-    {
-      for(j = 0; j < owidth; j++)
-      {
-        /* retrieve position of max */
-        long maxp = ind_p_k[i*owidth + j] - TH_INDEX_BASE;
-        /* update gradient */
-        gradInput_p_k[maxp] += gradOutput_p_k[i*owidth + j];
-      }
-    }
-  }
+  THNN_(SpatialDilatedMaxPooling_updateOutput)(
+      state, input, output, indices,
+      kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+    );
 }
 
 void THNN_(SpatialMaxPooling_updateGradInput)(
@@ -243,74 +33,12 @@ void THNN_(SpatialMaxPooling_updateGradInput)(
           int dH,
           int padW,
           int padH,
-          int dilationW,
-          int dilationH,
           bool ceil_mode)
 {
-  int dimw = 2;
-  int dimh = 1;
-  long nbatch = 1;
-  int nslices;
-  int iheight;
-  int iwidth;
-  int oheight;
-  int owidth;
-  real *gradInput_data;
-  real *gradOutput_data;
-  real *indices_data;
-
-  /* get contiguous gradOutput */
-  gradOutput = THTensor_(newContiguous)(gradOutput);
-
-  /* resize */
-  THTensor_(resizeAs)(gradInput, input);
-  THTensor_(zero)(gradInput);
-
-  if (input->nDimension == 4) {
-    nbatch = input->size[0];
-    dimw++;
-    dimh++;
-  }
-
-  /* sizes */
-  nslices = input->size[dimh-1];
-  iheight = input->size[dimh];
-  iwidth = input->size[dimw];
-  oheight = gradOutput->size[dimh];
-  owidth = gradOutput->size[dimw];
-
-  /* get raw pointers */
-  gradInput_data = THTensor_(data)(gradInput);
-  gradOutput_data = THTensor_(data)(gradOutput);
-  indices_data = THTensor_(data)(indices);
-
-  /* backprop */
-  if (input->nDimension == 3)
-  {
-    THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
-                                                 indices_data,
-                                                 nslices,
-                                                 iwidth, iheight,
-                                                 owidth, oheight,
-                                                 dW, dH);
-  }
-  else
-  {
-    long p;
-#pragma omp parallel for private(p)
-    for (p = 0; p < nbatch; p++)
-    {
-      THNN_(SpatialMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
-                                                   indices_data+p*nslices*owidth*oheight,
-                                                   nslices,
-                                                   iwidth, iheight,
-                                                   owidth, oheight,
-                                                   dW, dH);
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(gradOutput);
+  THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+      state, input, gradOutput, gradInput, indices,
+      kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+    );
 }
 
 #endif
diff --git a/lib/THNN/generic/SpatialUpSamplingNearest.c b/lib/THNN/generic/SpatialUpSamplingNearest.c
index 7ef093c..b67c68d 100644
--- a/lib/THNN/generic/SpatialUpSamplingNearest.c
+++ b/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -14,7 +14,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
   int yDim = input->nDimension-1;
 
   // dims
-  int idim = input->nDimension;  // Gauranteed to be between 3 and 5
+  int idim = input->nDimension;  // Guaranteed to be between 3 and 5
   int osz0 = output->size[0];
   int osz1 = output->size[1];
   int osz2 = output->size[2];
@@ -80,7 +80,7 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
   int yDim = gradInput->nDimension-1;
 
   // dims
-  int idim = gradInput->nDimension;  // Gauranteed to be between 3 and 5
+  int idim = gradInput->nDimension;  // Guaranteed to be between 3 and 5
   int isz0 = gradInput->size[0];
   int isz1 = gradInput->size[1];
   int isz2 = gradInput->size[2];
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 7ad6f70..0f2149a 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -25,6 +25,21 @@ TH_API void THNN_(AbsCriterion_updateGradInput)(
           THTensor *gradInput,         // [OUT] gradient w.r.t. input
           bool sizeAverage);           // if true, the gradient will be normalized by batch size
 
+TH_API void THNN_(BCECriterion_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *output,
+          bool sizeAverage,
+          THTensor *weights);
+TH_API void THNN_(BCECriterion_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *target,
+          THTensor *gradInput,
+          bool sizeAverage,
+          THTensor *weights);
+
 TH_API void THNN_(ClassNLLCriterion_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor (1D/2D)
@@ -472,6 +487,7 @@ TH_API void THNN_(Threshold_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           real threshold,
+          real val,
           bool inplace);
 
 TH_API void THNN_(TemporalConvolution_updateOutput)(
@@ -840,7 +856,6 @@ TH_API void THNN_(SpatialMaxPooling_updateOutput)(
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
-          int dilationW, int dilationH,
           bool ceil_mode);
 TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
           THNNState *state,
@@ -851,6 +866,27 @@ TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
+          bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
+          int dilationW, int dilationH,
+          bool ceil_mode);
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int kW, int kH,
+          int dW, int dH,
+          int padW, int padH,
           int dilationW, int dilationH,
           bool ceil_mode);
 
@@ -1101,6 +1137,26 @@ TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
           int dT, int dW, int dH,
           int pT, int pW, int pH);
 
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          int dilationT, int dilationW, int dilationH,
+          bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *indices,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          int dilationT, int dilationW, int dilationH);
+
 TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
           THNNState *state,
           THTensor *input,
diff --git a/lib/THNN/generic/Threshold.c b/lib/THNN/generic/Threshold.c
index ac00360..54310a0 100644
--- a/lib/THNN/generic/Threshold.c
+++ b/lib/THNN/generic/Threshold.c
@@ -33,6 +33,7 @@ void THNN_(Threshold_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           real threshold,
+          real val,
           bool inplace)
 {
   if (inplace)
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
similarity index 76%
copy from lib/THNN/generic/VolumetricMaxPooling.c
copy to lib/THNN/generic/VolumetricDilatedMaxPooling.c
index 053c02c..0db41ae 100644
--- a/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -1,8 +1,8 @@
 #ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
+#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
 #else
 
-static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
+static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
           real *input_p,
           real *output_p,
           real *indz_p,
@@ -21,7 +21,10 @@ static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
           int dH,
           int pT,
           int pW,
-          int pH)
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
 {
   long k;
 #pragma omp parallel for private(k)
@@ -40,15 +43,18 @@ static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
           long start_t = ti * dT - pT;
           long start_h = i * dH - pH;
           long start_w = j * dW - pW;
-
+          
           long kernel_t = fminf(kT, kT + start_t);
           long kernel_h = fminf(kH, kH + start_h);
           long kernel_w = fminf(kW, kW + start_w);
 
-          start_t = fmaxf(start_t, 0);
-          start_h = fmaxf(start_h, 0);
-          start_w = fmaxf(start_w, 0);
-
+          while(start_t < 0)
+            start_t += dilationT;
+          while(start_h < 0)
+            start_h += dilationH;
+          while(start_w < 0)
+            start_w += dilationW;
+          
           real *ip = input_p + k * itime * iwidth * iheight
             + start_t * iwidth * iheight + start_h * iwidth + start_w;
           real *op = output_p + k * otime * owidth * oheight
@@ -67,9 +73,9 @@ static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
             {
               for (x = 0; x < kernel_w; x++)
               {
-                if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
+                if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth))
                 {
-                  real val = *(ip + z * iwidth * iheight + y * iwidth + x);
+                  real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW);
                   if (val > maxval)
                   {
                     maxval = val;
@@ -97,7 +103,7 @@ static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
   }
 }
 
-void THNN_(VolumetricMaxPooling_updateOutput)(
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
@@ -111,6 +117,9 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
           int pT,
           int pW,
           int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH,
           bool ceilMode)
 {
   long nslices;
@@ -156,17 +165,21 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
   iwidth  = input->size[dimw];
   if (ceilMode)
   {
-    otime   = (int)(ceil((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(ceil((float)(iwidth  - kW + 2 * pW) / dW) + 1);
+    otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(ceil((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
   }
   else
   {
-    otime   = (int)(floor((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(floor((float)(iwidth  - kW + 2 * pW) / dW) + 1);
+    otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+    oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+    owidth  = (int)(floor((float)(iwidth  - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
   }
 
+  if (otime < 1 || owidth < 1 || oheight < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
   if (pT || pW || pH)
   {
     // ensure that the last pooling starts inside the image
@@ -192,7 +205,7 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
     output_data = THTensor_(data)(output);
     indices_data = THTensor_(data)(indices);
 
-    THNN_(VolumetricMaxPooling_updateOutput_frame)(
+    THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
       input_data, output_data,
       indices_data,
       nslices,
@@ -200,7 +213,8 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
       otime, owidth, oheight,
       kT, kW, kH,
       dT, dW, dH,
-      pT, pW, pH
+      pT, pW, pH,
+      dilationT, dilationW, dilationH
     );
   }
   else /* batch mode */
@@ -223,7 +237,7 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
 #pragma omp parallel for private(p)
     for (p=0; p < nBatch; p++)
     {
-      THNN_(VolumetricMaxPooling_updateOutput_frame)(
+      THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
         input_data   + p * istride,
         output_data  + p * ostride,
         indices_data + p * ostride,
@@ -232,7 +246,8 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
         otime, owidth, oheight,
         kT, kW, kH,
         dT, dW, dH,
-        pT, pW, pH
+        pT, pW, pH,
+        dilationT, dilationW, dilationH
       );
     }
   }
@@ -241,7 +256,7 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
   THTensor_(free)(input);
 }
 
-static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
           real *gradInput_p,
           real *gradOutput_p,
           real *indz_p,
@@ -257,7 +272,10 @@ static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
           int dH,
           int pT,
           int pW,
-          int pH)
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
 {
   long k;
 #pragma omp parallel for private(k)
@@ -277,9 +295,9 @@ static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
         {
           /* retrieve position of max */
           real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
-          long maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT;
-          long maxi  = ((unsigned char*)(indzp))[1] + i * dH - pH;
-          long maxj  = ((unsigned char*)(indzp))[2] + j * dW - pW;
+          long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
+          long maxi  = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
+          long maxj  = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
 
           /* update gradient */
           gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
@@ -290,7 +308,7 @@ static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
   }
 }
 
-void THNN_(VolumetricMaxPooling_updateGradInput)(
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
@@ -301,7 +319,10 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
           int dH,
           int pT,
           int pW,
-          int pH)
+          int pH,
+          int dilationT,
+          int dilationW,
+          int dilationH)
 {
   int nslices;
   int itime;
@@ -351,14 +372,15 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
   /* backprop */
   if (input->nDimension == 4) /* non-batch mode*/
   {
-    THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+    THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
       gradInput_data, gradOutput_data,
       indices_data,
       nslices,
       itime, iwidth, iheight,
       otime, owidth, oheight,
       dT, dW, dH,
-      pT, pW, pH
+      pT, pW, pH,
+      dilationT, dilationW, dilationH
     );
   }
   else /* batch mode */
@@ -372,7 +394,7 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
 #pragma omp parallel for private(p)
     for (p = 0; p < nBatch; p++)
     {
-      THNN_(VolumetricMaxPooling_updateGradInput_frame)(
+      THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
         gradInput_data + p * istride,
         gradOutput_data + p * ostride,
         indices_data + p * ostride,
@@ -380,7 +402,8 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
         itime, iwidth, iheight,
         otime, owidth, oheight,
         dT, dW, dH,
-        pT, pW, pH
+        pT, pW, pH,
+        dilationT, dilationW, dilationH
       );
     }
   }
diff --git a/lib/THNN/generic/VolumetricMaxPooling.c b/lib/THNN/generic/VolumetricMaxPooling.c
index 053c02c..dc376e6 100644
--- a/lib/THNN/generic/VolumetricMaxPooling.c
+++ b/lib/THNN/generic/VolumetricMaxPooling.c
@@ -2,101 +2,6 @@
 #define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
 #else
 
-static void THNN_(VolumetricMaxPooling_updateOutput_frame)(
-          real *input_p,
-          real *output_p,
-          real *indz_p,
-          long nslices,
-          long itime,
-          long iwidth,
-          long iheight,
-          long otime,
-          long owidth,
-          long oheight,
-          int kT,
-          int kW,
-          int kH,
-          int dT,
-          int dW,
-          int dH,
-          int pT,
-          int pW,
-          int pH)
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    /* loop over output */
-    long i, j, ti;
-    for (ti = 0; ti < otime; ti++)
-    {
-      for (i = 0; i < oheight; i++)
-      {
-        for (j = 0; j < owidth; j++)
-        {
-          /* local pointers */
-
-          long start_t = ti * dT - pT;
-          long start_h = i * dH - pH;
-          long start_w = j * dW - pW;
-
-          long kernel_t = fminf(kT, kT + start_t);
-          long kernel_h = fminf(kH, kH + start_h);
-          long kernel_w = fminf(kW, kW + start_w);
-
-          start_t = fmaxf(start_t, 0);
-          start_h = fmaxf(start_h, 0);
-          start_w = fmaxf(start_w, 0);
-
-          real *ip = input_p + k * itime * iwidth * iheight
-            + start_t * iwidth * iheight + start_h * iwidth + start_w;
-          real *op = output_p + k * otime * owidth * oheight
-            + ti * owidth * oheight + i * owidth + j;
-          real *indzp = indz_p + k * otime * owidth * oheight
-            + ti * owidth * oheight + i * owidth + j;
-
-          /* compute local max: */
-          real maxval = -THInf;
-          int x,y,z;
-          int mx, my, mz;
-
-          for (z = 0; z < kernel_t; z++)
-          {
-            for (y = 0; y < kernel_h; y++)
-            {
-              for (x = 0; x < kernel_w; x++)
-              {
-                if ((start_t + z < itime) && (start_h + y < iheight) && (start_w + x < iwidth))
-                {
-                  real val = *(ip + z * iwidth * iheight + y * iwidth + x);
-                  if (val > maxval)
-                  {
-                    maxval = val;
-                    // Store indices w.r.t the kernel dimension
-                    mz = z + (kT - kernel_t);
-                    my = y + (kH - kernel_h);
-                    mx = x + (kW - kernel_w);
-                  }
-                }
-              }
-            }
-          }
-
-          // set max values
-          ((unsigned char*)(indzp))[0] = mz;
-          ((unsigned char*)(indzp))[1] = my;
-          ((unsigned char*)(indzp))[2] = mx;
-          ((unsigned char*)(indzp))[3] = 0;
-
-          /* set output to local max */
-          *op = maxval;
-        }
-      }
-    }
-  }
-}
-
 void THNN_(VolumetricMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -113,181 +18,10 @@ void THNN_(VolumetricMaxPooling_updateOutput)(
           int pH,
           bool ceilMode)
 {
-  long nslices;
-  long itime;
-  long iheight;
-  long iwidth;
-  long otime;
-  long oheight;
-  long owidth;
-  real *input_data;
-  real *output_data;
-  real *indices_data;
-
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
-    "4D or 5D (batch-mode) tensor expected"
-  );
-
-  int dimN = 0;
-  int dimt = 1;
-  int dimh = 2;
-  int dimw = 3;
-
-  if (input->nDimension == 5)
-  {
-    dimN++;
-    dimt++;
-    dimh++;
-    dimw++;
-  }
-
-  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH && input->size[dimt] >= kT, 2,
-    "input image smaller than kernel size"
-  );
-
-  THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
-    "pad should be smaller than half of kernel size"
-  );
-
-  /* sizes */
-  nslices = input->size[dimN];
-  itime   = input->size[dimt];
-  iheight = input->size[dimh];
-  iwidth  = input->size[dimw];
-  if (ceilMode)
-  {
-    otime   = (int)(ceil((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(ceil((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(ceil((float)(iwidth  - kW + 2 * pW) / dW) + 1);
-  }
-  else
-  {
-    otime   = (int)(floor((float)(itime   - kT + 2 * pT) / dT) + 1);
-    oheight = (int)(floor((float)(iheight - kH + 2 * pH) / dH) + 1);
-    owidth  = (int)(floor((float)(iwidth  - kW + 2 * pW) / dW) + 1);
-  }
-
-  if (pT || pW || pH)
-  {
-    // ensure that the last pooling starts inside the image
-    if ((otime - 1)*dT >= itime + pT)
-      --otime;
-    if ((oheight - 1)*dH >= iheight + pH)
-      --oheight;
-    if ((owidth  - 1)*dW >= iwidth  + pW)
-      --owidth;
-  }
-
-  /* get contiguous input */
-  input = THTensor_(newContiguous)(input);
-
-  if (input->nDimension == 4) /* non-batch mode */
-  {
-    /* resize output */
-    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
-    /* indices will contain ti,i,j uchar locations packed into float/double */
-    THTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-    THNN_(VolumetricMaxPooling_updateOutput_frame)(
-      input_data, output_data,
-      indices_data,
-      nslices,
-      itime, iwidth, iheight,
-      otime, owidth, oheight,
-      kT, kW, kH,
-      dT, dW, dH,
-      pT, pW, pH
-    );
-  }
-  else /* batch mode */
-  {
-    long p;
-    long nBatch = input->size[0];
-
-    long istride = nslices * itime * iwidth * iheight;
-    long ostride = nslices * otime * owidth * oheight;
-
-    /* resize output */
-    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
-    /* indices will contain ti,i,j locations for each output point */
-    THTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
-
-    input_data = THTensor_(data)(input);
-    output_data = THTensor_(data)(output);
-    indices_data = THTensor_(data)(indices);
-
-#pragma omp parallel for private(p)
-    for (p=0; p < nBatch; p++)
-    {
-      THNN_(VolumetricMaxPooling_updateOutput_frame)(
-        input_data   + p * istride,
-        output_data  + p * ostride,
-        indices_data + p * ostride,
-        nslices,
-        itime, iwidth, iheight,
-        otime, owidth, oheight,
-        kT, kW, kH,
-        dT, dW, dH,
-        pT, pW, pH
-      );
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(input);
-}
-
-static void THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-          real *gradInput_p,
-          real *gradOutput_p,
-          real *indz_p,
-          long nslices,
-          long itime,
-          long iwidth,
-          long iheight,
-          long otime,
-          long owidth,
-          long oheight,
-          int dT,
-          int dW,
-          int dH,
-          int pT,
-          int pW,
-          int pH)
-{
-  long k;
-#pragma omp parallel for private(k)
-  for (k = 0; k < nslices; k++)
-  {
-    real *gradInput_p_k  = gradInput_p  + k * itime * iwidth * iheight;
-    real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
-    real *indz_p_k = indz_p + k * otime * owidth * oheight;
-
-    /* calculate max points */
-    long ti, i, j;
-    for (ti = 0; ti < otime; ti++)
-    {
-      for (i = 0; i < oheight; i++)
-      {
-        for (j = 0; j < owidth; j++)
-        {
-          /* retrieve position of max */
-          real * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
-          long maxti = ((unsigned char*)(indzp))[0] + ti * dT - pT;
-          long maxi  = ((unsigned char*)(indzp))[1] + i * dH - pH;
-          long maxj  = ((unsigned char*)(indzp))[2] + j * dW - pW;
-
-          /* update gradient */
-          gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
-            gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
-        }
-      }
-    }
-  }
+  THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          state, input, output, indices,
+          kT, kW, kH, dT, dW, dH,
+          pT, pW, pH, 1, 1, 1, ceilMode);
 }
 
 void THNN_(VolumetricMaxPooling_updateGradInput)(
@@ -303,90 +37,9 @@ void THNN_(VolumetricMaxPooling_updateGradInput)(
           int pW,
           int pH)
 {
-  int nslices;
-  int itime;
-  int iheight;
-  int iwidth;
-  int otime;
-  int oheight;
-  int owidth;
-  real *gradInput_data;
-  real *gradOutput_data;
-  real *indices_data;
-
-  int dimN = 0;
-  int dimt = 1;
-  int dimh = 2;
-  int dimw = 3;
-
-  /* get contiguous gradOutput */
-  gradOutput = THTensor_(newContiguous)(gradOutput);
-
-  /* resize */
-  THTensor_(resizeAs)(gradInput, input);
-  THTensor_(zero)(gradInput);
-
-  if (input->nDimension == 5)
-  {
-    dimN++;
-    dimt++;
-    dimh++;
-    dimw++;
-  }
-
-  /* sizes */
-  nslices = input->size[dimN];
-  itime = input->size[dimt];
-  iheight = input->size[dimh];
-  iwidth = input->size[dimw];
-  otime = gradOutput->size[dimt];
-  oheight = gradOutput->size[dimh];
-  owidth = gradOutput->size[dimw];
-
-  /* get raw pointers */
-  gradInput_data = THTensor_(data)(gradInput);
-  gradOutput_data = THTensor_(data)(gradOutput);
-  indices_data = THTensor_(data)(indices);
-
-  /* backprop */
-  if (input->nDimension == 4) /* non-batch mode*/
-  {
-    THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-      gradInput_data, gradOutput_data,
-      indices_data,
-      nslices,
-      itime, iwidth, iheight,
-      otime, owidth, oheight,
-      dT, dW, dH,
-      pT, pW, pH
-    );
-  }
-  else /* batch mode */
-  {
-    long p;
-    long nBatch = input->size[0];
-
-    long istride = nslices * itime * iwidth * iheight;
-    long ostride = nslices * otime * owidth * oheight;
-
-#pragma omp parallel for private(p)
-    for (p = 0; p < nBatch; p++)
-    {
-      THNN_(VolumetricMaxPooling_updateGradInput_frame)(
-        gradInput_data + p * istride,
-        gradOutput_data + p * ostride,
-        indices_data + p * ostride,
-        nslices,
-        itime, iwidth, iheight,
-        otime, owidth, oheight,
-        dT, dW, dH,
-        pT, pW, pH
-      );
-    }
-  }
-
-  /* cleanup */
-  THTensor_(free)(gradOutput);
+  THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+          state, input, gradOutput, gradInput, indices,
+          dT, dW, dH, pT, pW, pH, 1, 1, 1);
 }
 
 #endif
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index 739706c..b4218cb 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -10,6 +10,9 @@
 #include "generic/AbsCriterion.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/BCECriterion.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/ClassNLLCriterion.c"
 #include "THGenerateFloatTypes.h"
 
@@ -139,6 +142,9 @@
 #include "generic/SpatialMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/SpatialDilatedMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/SpatialMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 
@@ -169,6 +175,9 @@
 #include "generic/VolumetricMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/VolumetricDilatedMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 
diff --git a/test.lua b/test.lua
index e288e25..9098b46 100644
--- a/test.lua
+++ b/test.lua
@@ -125,8 +125,9 @@ function nntest.CMul()
    local ini = math.random(3,5)
    local inj = math.random(3,5)
    local ink = math.random(3,5)
+   local inl = math.random(3,5)
    local input = torch.Tensor(ini,inj,ink):zero()
-   local module = nn.CMul(ini, inj, ink)
+   local module = nn.CMul(1, ini, inj, ink, 1)
 
    -- 1D
    local err = jac.testJacobian(module,input)
@@ -144,8 +145,7 @@ function nntest.CMul()
    end
 
    -- 2D
-   local nframe = math.random(50,70)
-   local nframe = 5
+   local nframe = math.random(3,14)
    local input = torch.randn(nframe, ini,inj,ink)
    local output = module:forward(input)
    local output2 = torch.cmul(input, module.weight:view(1,ini,inj,ink):expandAs(input))
@@ -168,6 +168,27 @@ function nntest.CMul()
    mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'CMul accGradParameters 2D err')
    mytester:assert(module.weight:isSameSizeAs(module.gradWeight), 'CMul gradWeight size err')
 
+   -- Expansion
+   input = torch.randn(nframe, ini,inj,ink,inl)
+   output = module:forward(input)
+   output2 = torch.cmul(input, module.weight:expandAs(input))
+   mytester:assertTensorEq(output2, output, 0.000001, 'CMul forward expand err')
+
+   module:zeroGradParameters()
+   gradWeight:zero()
+   gradInput = module:backward(input, output)
+   gradInput2 = gradInput:clone():zero()
+   gradInput2:addcmul(1, module.weight:expandAs(output), output)
+   mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'CMul updateGradInput expansion err')
+   mytester:assert(gradInput:isSameSizeAs(input), 'CMul gradInput expand size err')
+
+   for i=1,nframe do
+      -- 4 is the [non-batch] singleton dim
+      gradWeight:add(torch.cmul(input[i], output[i]):sum(4))
+   end
+   mytester:assertTensorEq(gradWeight:sum(5), module.gradWeight, 0.000001, 'CMul accGradParameters expand err')
+   mytester:assert(module.weight:isSameSizeAs(module.gradWeight), 'CMul accGradParameters expand size err')
+
    input:zero()
 
    local err = jac.testJacobian(module,input)
@@ -360,6 +381,19 @@ function nntest.HardTanh()
    local ferr, berr = jac.testIO(module, input)
    mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+   -- test inclusive bounds -- HardTanh(1, inf) should behave like Threshold(1)
+   local input = torch.Tensor({1})
+   local gradOutput = torch.Tensor({1})
+   local gradOutputClone = gradOutput:clone()
+   local module = nn.HardTanh(1, math.huge, true)
+   local tanhGradInput = module:backward(input, gradOutput)
+
+   local input = input:clone()
+   local gradOutput = gradOutputClone
+   local module  = nn.Threshold(1, 0, true)
+   local threshGradInput = module:backward(input, gradOutput)
+   mytester:assertTensorEq(tanhGradInput, threshGradInput, 0.000001, 'HardTanh gradInput')
 end
 
 function nntest.Clamp()
@@ -2296,6 +2330,21 @@ function nntest.SpatialConvolution()
    module.gradBias = torch.Tensor(module.nOutputPlane):zero()
    module:reset()
    jacTests(module)
+
+   local output = module:forward(input):clone()
+   local gradOutput = output:clone():normal()
+   local gradInput = module:backward(input, gradOutput):clone()
+   local bigWeight = module.weight.new(module.weight:nElement() * 4):fill(0/0) -- fill with nans
+   local newWeight = bigWeight:narrow(1, module.weight:nElement() * 3, module.weight:nElement())
+   newWeight = newWeight:viewAs(module.weight):copy(module.weight)
+   module.weight = newWeight
+   local newOutput = module:forward(input)
+   local newGradInput = module:backward(input, gradOutput)
+   mytester:asserteq((newOutput - output):abs():max(), 0,
+      torch.typename(module) .. ' forward failure case in a getParameters setting ')
+   mytester:asserteq((newGradInput - gradInput):abs():max(), 0,
+      torch.typename(module) .. ' backward failure case in a getParameters setting ')
+
 end
 
 function nntest.SpatialConvolutionMM()
@@ -3679,6 +3728,48 @@ function nntest.TemporalConvolution()
    mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward)')
 end
 
+function nntest.TemporalDynamicKMaxPooling()
+   local features = math.random(5,10)
+   local seqLen = math.random(6,9)
+   local minK = math.random(3,6)
+   local factor = math.random(1,100)*0.01
+   local nBatchFrame = math.random(2,4)
+   local module = nn.TemporalDynamicKMaxPooling(minK, factor)
+
+   -- 1D
+   local input = torch.Tensor(seqLen, features)
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- 2D
+   local input = torch.Tensor(nBatchFrame, seqLen, features)
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+   mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+   -- 2D matches 1D
+   local output = module:forward(input):clone()
+   local outputGrad = torch.randn(output:size())
+   local inputGrad = module:backward(input, outputGrad):clone()
+
+   local input1D = input:select(1, 2)
+   local output1D = module:forward(input1D)
+   local outputGrad1D = outputGrad:select(1, 2)
+   local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward)')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward)')
+
+
+end
+
 function nntest.TemporalSubSampling()
    local from = math.random(1,5)
    local ki = math.random(1,6)
@@ -4145,6 +4236,55 @@ function nntest.VolumetricMaxPooling()
    mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
 end
 
+function nntest.VolumetricDilatedMaxPooling()
+   for _,ceil_mode in pairs({true,false}) do
+      local from = math.random(2,3)
+      local kt = math.random(3,4)
+      local ki = math.random(3,4)
+      local kj = math.random(3,4)
+      local st = math.random(2,3)
+      local si = math.random(2,3)
+      local sj = math.random(2,3)
+      local outt = math.random(3,4)
+      local outi = math.random(3,4)
+      local outj = math.random(3,4)
+      local padT = math.min(math.random(0,1),math.floor(kt/2))
+      local padW = math.min(math.random(0,1),math.floor(ki/2))
+      local padH =  math.min(math.random(0,1),math.floor(kj/2))
+      local dilationT = math.random(1,3)
+      local dilationW = math.random(1,3)
+      local dilationH = math.random(1,3)
+      local int = (outt-1)*st+(dilationT*(kt-1)+1)-2*padT
+      local ini = (outi-1)*si+(dilationW*(ki-1)+1)-2*padW
+      local inj = (outj-1)*sj+(dilationH*(kj-1)+1)-2*padH
+
+      local ceil_string = ceil_mode and 'ceil' or 'floor'
+      local module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+      local input = torch.rand(from,int,inj,ini)
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+      -- batch
+      local nbatch = math.random(2,5)
+      input = torch.rand(nbatch,from,int,inj,ini)
+      module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+      if ceil_mode then module:ceil() else module:floor() end
+
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+      mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+  end
+end
+
 function nntest.VolumetricMaxUnpooling()
    local from = math.random(2,3)
    local kt = math.random(3,4)
@@ -4766,7 +4906,7 @@ function nntest.LookupTable()
    for _, normType in ipairs{1, 2, math.random()} do
       local module = nn.LookupTable(totalIndex, entry_size, 0, maxNorm, normType)
       local oriW = module.weight:clone()
-      output = module:updateOutput(input)
+      local output = module:updateOutput(input)
       -- check output is of small norm
       for j = 1,output:size(1) do
          local norm = torch.norm(output:select(1, j), normType)
@@ -4925,6 +5065,36 @@ function nntest.Copy()
    mytester:assert(torch.type(output) == 'torch.FloatTensor', 'copy forward type err')
 end
 
+function nntest.CMaxTable()
+   local input1 = torch.Tensor{{1,3},{2,4}}
+   local input2 = torch.Tensor{{4,2},{3,1}}
+   local input = {input1, input2}
+   local module = nn.CMaxTable()
+   local err1 = torch.add(module:forward(input), -1, torch.Tensor{{4,3},{3,4}})
+   mytester:assertalmosteq(err1:abs():max(), 0, 1e-15, "CMaxTable forward call")
+   local gradOutputs = torch.Tensor{5,6,7,8}
+   local gradInputs = module:backward(input, gradOutputs)
+   local err2 = torch.add(gradInputs[1], -1, torch.Tensor{{0,6},{0,8}})
+   local err3 = torch.add(gradInputs[2], -1, torch.Tensor{{5,0},{7,0}})
+   mytester:assertalmosteq(err2:abs():max(), 0, 1e-15, "CMaxTable backward call")
+   mytester:assertalmosteq(err3:abs():max(), 0, 1e-15, "CMaxTable backward call")
+end
+
+function nntest.CMinTable()
+   local input1 = torch.Tensor{{1,3},{2,4}}
+   local input2 = torch.Tensor{{4,2},{3,1}}
+   local input = {input1, input2}
+   local module = nn.CMinTable()
+   local err1 = torch.add(module:forward(input), -1, torch.Tensor{{1,2},{2,1}})
+   mytester:assertalmosteq(err1:abs():max(), 0, 1e-15, "CMinTable forward call")
+   local gradOutputs = torch.Tensor{5,6,7,8}
+   local gradInputs = module:backward(input, gradOutputs)
+   local err2 = torch.add(gradInputs[1], -1, torch.Tensor{{5,0},{7,0}})
+   local err3 = torch.add(gradInputs[2], -1, torch.Tensor{{0,6},{0,8}})
+   mytester:assertalmosteq(err2:abs():max(), 0, 1e-15, "CMinTable backward call")
+   mytester:assertalmosteq(err3:abs():max(), 0, 1e-15, "CMinTable backward call")
+end
+
 function nntest.JoinTable()
    local tensor = torch.rand(3,4,5)
    local input = {tensor, tensor}
@@ -5451,6 +5621,7 @@ function nntest.Concat()
       module.weight:fill(1)
       module.bias:fill(0)
    end
+   mytester:asserteq(m:size(), num_modules)
 
    local output = m:forward(input)
    local output2 = input:sum(2):expand(4, 5):repeatTensor(num_modules, 1)
@@ -5561,6 +5732,53 @@ function nntest.ConcatTable()
    mytester:assert(go2 == 1, "ConcatTable table variable length")
 end
 
+function nntest.MapTable()
+   local map = nn.MapTable(nn.Linear(10,5))
+   local lin = map:get(1):clone()
+
+   -- ParallelTable with clones as reference
+   local parallel = nn.ParallelTable()
+   parallel:add(lin)
+   parallel:add(lin:clone('weight','bias'))
+   parallel:add(lin:clone('weight','bias'))
+
+   local input = {torch.rand(10), torch.rand(10), torch.rand(10)}
+   local gradOutput = {torch.ones(5), torch.ones(5), torch.ones(5)}
+
+   local outputM = map:forward(input)
+   local outputP = parallel:forward(input)
+   mytester:assertTensorEq(outputM[1], outputP[1])
+   mytester:assertTensorEq(outputM[2], outputP[2])
+   mytester:assertTensorEq(outputM[3], outputP[3])
+   mytester:assert(map:size() == #input)
+
+   map:zeroGradParameters()
+   parallel:zeroGradParameters()
+   local gradInputM = map:backward(input, gradOutput)
+   local gradInputP = parallel:backward(input, gradOutput)
+   mytester:assertTensorEq(gradInputM[1], gradInputP[1])
+   mytester:assertTensorEq(gradInputM[2], gradInputP[2])
+   mytester:assertTensorEq(gradInputM[3], gradInputP[3])
+
+   map:updateParameters(1)
+   parallel:updateParameters(1)
+   mytester:assertTensorEq(map:get(1).weight, parallel:get(1).weight, 0.00001)
+
+   local output = map:forward({input[1], input[2], input[3], input[3]})
+   mytester:assert(#output == 4)
+   local output = map:forward({input[1], input[2]})
+   mytester:assert(#output == 2)
+
+   map:resize(10)
+   mytester:assert(map:size() == 10)
+   map:resize(4)
+   mytester:assert(map:size() == 4)
+   mytester:assert(torch.pointer(map:get(4).weight:storage())
+      == torch.pointer(map:get(1).weight:storage()))
+   map:clearState()
+   mytester:assert(map:size() == 1)
+end
+
 function nntest.FlattenTable()
    -- Create a nested table.  Obviously we can't even stochastically test
    -- the space of all possible nested tables (it's infinite), but here is a
@@ -6306,9 +6524,9 @@ function nntest.addSingletonDimension()
    local resultArg = torch.Tensor()
    local resultR = nn.utils.addSingletonDimension(resultArg, tensor, dim)
    mytester:eq(resultArg:size():totable(), resultSize,
-               'wrong content for random singleton dimention '..
+               'wrong content for random singleton dimension '..
                'when the result is passed as argument')
-   mytester:eq(resultArg, result, 'wrong content for random singleton dimention '..
+   mytester:eq(resultArg, result, 'wrong content for random singleton dimension '..
                'when the result is passed as argument')
 
    mytester:eq(resultR == resultArg, true,
@@ -6360,7 +6578,7 @@ function nntest.VolumetricReplicationPadding()
       local padLeft = math.random(-3,3)
       local padRight = math.random(-3,3)
       local padTop = math.random(-3,3)
-      local padBotom = math.random(-3,3)
+      local padBottom = math.random(-3,3)
       local padFront = math.random(3,3)
       local padBack = math.random(3,3)
       local jac = nn.Jacobian

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git


