[lua-torch-nn] 01/06: New upstream version 0~20170308-g1d38cba

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue Mar 14 11:22:22 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository lua-torch-nn.

commit 47b0eea7ef94f5855e0e095ebd0411c569f45b6e
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Tue Mar 14 10:59:55 2017 +0000

    New upstream version 0~20170308-g1d38cba
---
 AddConstant.lua                                   |  87 +++---
 Bottle.lua                                        |  12 +-
 Contiguous.lua                                    |   2 +
 Jacobian.lua                                      |   8 +-
 LayerNormalization.lua                            |  27 ++
 MapTable.lua                                      |   6 +
 Maxout.lua                                        |  13 +
 MixtureTable.lua                                  |  59 ++--
 MultiLabelSoftMarginCriterion.lua                 |  86 ++++--
 SpatialAdaptiveAveragePooling.lua                 |  35 +++
 THNN.lua                                          |   4 +-
 VolumetricConvolution.lua                         |  27 +-
 VolumetricFractionalMaxPooling.lua                | 175 +++++++++++
 VolumetricFullConvolution.lua                     |  12 +-
 WeightNorm.lua                                    | 165 +++++++++++
 doc/containers.md                                 |  63 ++--
 doc/convolution.md                                |  75 ++++-
 doc/image/lena.jpg                                | Bin 0 -> 39706 bytes
 doc/training.md                                   |   4 +-
 init.lua                                          |   5 +
 lib/THNN/generic/ELU.c                            |   8 +-
 lib/THNN/generic/HardShrink.c                     |   6 +-
 lib/THNN/generic/HardTanh.c                       |  15 +-
 lib/THNN/generic/LeakyReLU.c                      |   6 +-
 lib/THNN/generic/Linear.c                         |   3 +-
 lib/THNN/generic/LookupTable.c                    |   9 +-
 lib/THNN/generic/MarginCriterion.c                |  10 +-
 lib/THNN/generic/MultiMarginCriterion.c           |   6 +-
 lib/THNN/generic/PReLU.c                          |   3 +-
 lib/THNN/generic/RReLU.c                          |  18 +-
 lib/THNN/generic/SoftPlus.c                       |  14 +-
 lib/THNN/generic/SoftShrink.c                     |   8 +-
 lib/THNN/generic/SparseLinear.c                   |  18 +-
 lib/THNN/generic/SpatialAdaptiveAveragePooling.c  | 258 ++++++++++++++++
 lib/THNN/generic/SpatialConvolutionLocal.c        |  64 ++--
 lib/THNN/generic/SpatialConvolutionMM.c           |   7 +-
 lib/THNN/generic/SpatialConvolutionMap.c          |  14 +-
 lib/THNN/generic/SpatialDilatedConvolution.c      |   3 +-
 lib/THNN/generic/SpatialFullConvolution.c         |   7 +-
 lib/THNN/generic/SpatialFullConvolutionMap.c      |  14 +-
 lib/THNN/generic/SpatialSubSampling.c             |  11 +-
 lib/THNN/generic/Sqrt.c                           |   7 +-
 lib/THNN/generic/THNN.h                           | 160 +++++-----
 lib/THNN/generic/TemporalConvolution.c            |  45 +--
 lib/THNN/generic/TemporalRowConvolution.c         |  11 +-
 lib/THNN/generic/TemporalSubSampling.c            |   7 +-
 lib/THNN/generic/Threshold.c                      |  12 +-
 lib/THNN/generic/VolumetricConvolution.c          |  68 +++--
 lib/THNN/generic/VolumetricConvolutionMM.c        |  45 +--
 lib/THNN/generic/VolumetricDilatedConvolution.c   |   3 +-
 lib/THNN/generic/VolumetricFractionalMaxPooling.c | 279 ++++++++++++++++++
 lib/THNN/generic/VolumetricFullConvolution.c      |  43 +--
 lib/THNN/generic/unfold.c                         |  22 +-
 lib/THNN/init.c                                   |   6 +
 test.lua                                          | 343 +++++++++++++++++++---
 55 files changed, 1961 insertions(+), 457 deletions(-)

diff --git a/AddConstant.lua b/AddConstant.lua
index 5848462..8a0223a 100644
--- a/AddConstant.lua
+++ b/AddConstant.lua
@@ -1,37 +1,50 @@
-local AddConstant, parent = torch.class('nn.AddConstant', 'nn.Module')
-
-function AddConstant:__init(constant_scalar,ip)
-  parent.__init(self)
-  assert(type(constant_scalar) == 'number', 'input is not scalar!')
-  self.constant_scalar = constant_scalar
-  
-  -- default for inplace is false
-   self.inplace = ip or false
-   if (ip and type(ip) ~= 'boolean') then
-      error('in-place flag must be boolean')
-   end
-end
-
-function AddConstant:updateOutput(input)
-  if self.inplace then
-    input:add(self.constant_scalar)
-    self.output:set(input)
-  else
-    self.output:resizeAs(input)
-    self.output:copy(input)
-    self.output:add(self.constant_scalar)
-  end
-  return self.output
-end 
-
-function AddConstant:updateGradInput(input, gradOutput)
-  if self.inplace then
-    self.gradInput:set(gradOutput)
-    -- restore previous input value
-    input:add(-self.constant_scalar)
-  else
-    self.gradInput:resizeAs(gradOutput)
-    self.gradInput:copy(gradOutput)
-  end
-  return self.gradInput
-end
+local AddConstant, parent = torch.class('nn.AddConstant', 'nn.Module')
+
+function AddConstant:__init(constant_scalar,ip)
+   parent.__init(self)
+   self.constant_scalar = constant_scalar
+
+  -- default for inplace is false
+   self.inplace = ip or false
+   if (ip and type(ip) ~= 'boolean') then
+      error('in-place flag must be boolean')
+   end
+end
+
+function AddConstant:updateOutput(input)
+   assert(type(self.constant_scalar) == 'number' or
+      (torch.isTensor(self.constant_scalar) and input:nDimension() <= 2 and
+      input:size(input:nDimension()) == self.constant_scalar:size(1)),
+      'input is not scalar or doesn\'t match with the dimension of constant!')
+   local tmp
+   if torch.isTensor(self.constant_scalar) and input:nDimension() == 2 then
+      local nOutput = self.constant_scalar:size(1)
+      tmp = self.constant_scalar.new()
+      tmp:resize(1,nOutput)
+      tmp:copy(self.constant_scalar)
+      tmp = tmp:expand(input:size(1),nOutput)
+   else
+      tmp = self.constant_scalar
+   end
+   if self.inplace then
+      input:add(tmp)
+      self.output:set(input)
+   else
+      self.output:resizeAs(input)
+      self.output:copy(input)
+      self.output:add(tmp)
+   end
+   return self.output
+end
+
+function AddConstant:updateGradInput(input, gradOutput)
+   if self.inplace then
+      self.gradInput:set(gradOutput)
+      -- restore previous input value
+      input:add(-self.constant_scalar)
+   else
+      self.gradInput:resizeAs(gradOutput)
+      self.gradInput:copy(gradOutput)
+   end
+   return self.gradInput
+end
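
For reference, a minimal usage sketch of the tensor-constant support that the rewritten
AddConstant above accepts; this sketch is not part of the patch and the sizes and values
are illustrative only.

```lua
local nn = require 'nn'

-- scalar constant, as before
local addScalar = nn.AddConstant(2.5)
print(addScalar:forward(torch.Tensor{1, 2, 3}))        -- 3.5  4.5  5.5

-- per-feature tensor constant: the input may be 1D or 2D (batch x features),
-- and its last dimension must match the constant's length (here 3)
local addVector = nn.AddConstant(torch.Tensor{1, 10, 100})
print(addVector:forward(torch.Tensor{{0, 0, 0},
                                     {1, 1, 1}}))      -- each row offset by {1, 10, 100}
```
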
diff --git a/Bottle.lua b/Bottle.lua
index 6934bff..683935c 100644
--- a/Bottle.lua
+++ b/Bottle.lua
@@ -49,9 +49,17 @@ function Bottle:updateGradInput(input, gradOutput)
       local input_ = input:view(unpack(self.inShape:totable()))
       local gradOutput_ = gradOutput:view(unpack(self.outShape:totable()))
       self.modules[1]:updateGradInput(input_, gradOutput_)
-      self.gradInput:set(self.modules[1].gradInput:viewAs(input))
+      if self.modules[1].gradInput then
+         self.gradInput:set(self.modules[1].gradInput:viewAs(input))
+      else
+         self.gradInput = nil
+      end
    else
-      self.gradInput:set(self.modules[1]:updateGradInput(input))
+      if self.modules[1].gradInput then
+         self.gradInput:set(self.modules[1]:updateGradInput(input))
+      else
+         self.gradInput = nil
+      end
    end
    return self.gradInput
 end
diff --git a/Contiguous.lua b/Contiguous.lua
old mode 100644
new mode 100755
index 2f07e92..f9974ce
--- a/Contiguous.lua
+++ b/Contiguous.lua
@@ -2,6 +2,7 @@ local Contiguous, parent = torch.class('nn.Contiguous', 'nn.Module')
 
 function Contiguous:updateOutput(input)
    if not input:isContiguous() then
+      if self.output:storage() == input:storage() then self.output:set() end
       self.output:resizeAs(input):copy(input)
    else
       self.output:set(input)
@@ -11,6 +12,7 @@ end
 
 function Contiguous:updateGradInput(input, gradOutput)
    if not gradOutput:isContiguous() then
+      if self.gradInput:storage() == gradOutput:storage() then self.gradInput:set() end
       self.gradInput:resizeAs(gradOutput):copy(gradOutput)
    else
       self.gradInput:set(gradOutput)
diff --git a/Jacobian.lua b/Jacobian.lua
index 64187c3..4f728b1 100644
--- a/Jacobian.lua
+++ b/Jacobian.lua
@@ -293,10 +293,12 @@ function nn.Jacobian.testIO(module,input, minval, maxval)
    minval = minval or -2
    maxval = maxval or 2
    local inrange = maxval - minval
+   local inputclone = input:clone()
 
    -- run module
    module:forward(input)
    local go = module.output:clone():copy(torch.rand(module.output:nElement()):mul(inrange):add(minval))
+   local goclone = go:clone()
    module:zeroGradParameters()
    module:updateGradInput(input,go)
    module:accGradParameters(input,go)
@@ -313,10 +315,10 @@ function nn.Jacobian.testIO(module,input, minval, maxval)
    f:close()
    -- read module
    local m = torch.DiskFile(filename):binary():readObject()
-   m:forward(input)
+   m:forward(inputclone)
    m:zeroGradParameters()
-   m:updateGradInput(input,go)
-   m:accGradParameters(input,go)
+   m:updateGradInput(inputclone,goclone)
+   m:accGradParameters(inputclone,goclone)
    -- cleanup
    os.remove(filename)
 
diff --git a/LayerNormalization.lua b/LayerNormalization.lua
new file mode 100644
index 0000000..722d7c8
--- /dev/null
+++ b/LayerNormalization.lua
@@ -0,0 +1,27 @@
+-- Reference: https://arxiv.org/pdf/1607.06450.pdf (Section 3)
+
+local LayerNormalization, parent = torch.class('nn.LayerNormalization', 'nn.Sequential')
+function LayerNormalization:__init(nOutput, bias, eps, affine)
+   parent.__init(self)
+   eps = eps or 1e-10
+   affine = (affine == nil) and true or affine
+   bias = bias or 0
+
+   self:add(nn.ConcatTable()
+               :add(nn.Identity())
+               :add(nn.Sequential()
+                       :add(nn.Mean(1, 1))
+                       :add(nn.Replicate(nOutput,1,1))))
+      :add(nn.CSubTable())
+      :add(nn.Normalize(2, eps))
+      :add(nn.MulConstant(torch.sqrt(nOutput)))
+
+   if affine then
+      local biasTransform = nn.Add(nOutput, false)
+      biasTransform.bias:fill(bias)
+      local gainTransform = nn.CMul(nOutput)
+      gainTransform.weight:fill(1.)
+      self:add(gainTransform)
+      self:add(biasTransform)
+   end
+end
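
A minimal usage sketch of the new nn.LayerNormalization module; it is not part of the
patch and the layer and batch sizes below are made up.

```lua
local nn = require 'nn'

local nOutput = 5
local ln = nn.LayerNormalization(nOutput)   -- defaults: bias = 0, eps = 1e-10, affine = true
local x = torch.randn(3, nOutput)           -- a batch of 3 feature vectors
local y = ln:forward(x)

-- each row is centered and rescaled; with the initial gain (1) and bias (0),
-- the per-row mean stays close to zero
print(y:mean(2))
```
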
diff --git a/MapTable.lua b/MapTable.lua
index 79b967d..90b439c 100644
--- a/MapTable.lua
+++ b/MapTable.lua
@@ -20,6 +20,9 @@ end
 function MapTable:resize(n)
    self:_extend(n)
    for i = n + 1, #self.modules do
+      -- It's not clear why this clearState call is necessary, but it fixes
+      -- https://github.com/torch/nn/issues/1141 .
+      self.modules[i]:clearState()
       self.modules[i] = nil
    end
 end
@@ -79,6 +82,9 @@ end
 
 function MapTable:clearState()
    for i = 2, #self.modules do
+      -- It's not clear why this clearState call is necessary, but it fixes
+      -- https://github.com/torch/nn/issues/1141 .
+      self.modules[i]:clearState()
       self.modules[i] = nil
    end
    parent.clearState(self)
diff --git a/Maxout.lua b/Maxout.lua
new file mode 100644
index 0000000..a797a9f
--- /dev/null
+++ b/Maxout.lua
@@ -0,0 +1,13 @@
+-- Reference: http://jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+
+local Maxout, parent = torch.class('nn.Maxout', 'nn.Sequential')
+
+function Maxout:__init(inputSize, outputSize, maxoutNumber, preprocess)
+   parent.__init(self)
+   self:add(nn.Linear(inputSize, outputSize * maxoutNumber))
+   self:add(nn.View(maxoutNumber, outputSize):setNumInputDims(1))
+   if preprocess then
+      self:add(preprocess)
+   end
+   self:add(nn.Max(1, 2))
+end
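
A short sketch of the new nn.Maxout container (sizes are illustrative): each output unit
is the maximum over `maxoutNumber` linear pieces.

```lua
local nn = require 'nn'

local inputSize, outputSize, pieces = 10, 4, 3
local m = nn.Maxout(inputSize, outputSize, pieces)

local y = m:forward(torch.randn(8, inputSize))   -- a batch of 8 inputs
print(y:size())                                  -- 8 x 4: max over the 3 linear pieces
```
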
diff --git a/MixtureTable.lua b/MixtureTable.lua
index 17c307e..e48f791 100644
--- a/MixtureTable.lua
+++ b/MixtureTable.lua
@@ -34,16 +34,13 @@ function MixtureTable:updateOutput(input)
          error"Should be one gater output per expert"
       end
       local expertInput = expertInputs[1]
-      if self.batchSize ~= batchSize then
-         self.size:resize(expertInput:dim()+1):fill(1)
-         if self.dimG > 1 then 
-            self.size[1] = gaterInput:size(1)
-         end
-         self.size[self.dim] = gaterInput:size(self.dimG)
-         self.output:resizeAs(expertInput)
-         self.backwardSetup = false
-         self.batchSize = batchSize
+      self.size:resize(expertInput:dim()+1):fill(1)
+      if self.dimG > 1 then 
+         self.size[1] = gaterInput:size(1)
       end
+      self.size[self.dim] = gaterInput:size(self.dimG)
+      self.output:resizeAs(expertInput)
+      self.batchSize = batchSize
       self._gaterView:view(gaterInput, self.size)
       self.output:zero()
       -- multiply accumulate gater outputs by their commensurate expert
@@ -53,16 +50,13 @@ function MixtureTable:updateOutput(input)
       end
    else
       -- expertInputs is a Tensor :
-      if self.batchSize ~= batchSize then
-         self.size:resize(expertInputs:dim()):fill(1)
-         if self.dimG > 1 then
-            self.size[1] = gaterInput:size(1)
-         end
-         self.size[self.dim] = gaterInput:size(self.dimG)
-         self.output:resizeAs(expertInputs:select(self.dim, 1))
-         self.batchSize = batchSize
-         self.backwardSetup = false
+      self.size:resize(expertInputs:dim()):fill(1)
+      if self.dimG > 1 then
+         self.size[1] = gaterInput:size(1)
       end
+      self.size[self.dim] = gaterInput:size(self.dimG)
+      self.output:resizeAs(expertInputs:select(self.dim, 1))
+      self.batchSize = batchSize
       self._gaterView:view(gaterInput, self.size)
       self._expert:cmul(self._gaterView:expandAs(expertInputs), expertInputs)
       self.output:sum(self._expert, self.dim)
@@ -83,14 +77,18 @@ function MixtureTable:updateGradInput(input, gradOutput)
    self._expert2 = self._expert2 or input[1].new()
       
    if self.table then
-      if not self.backwardSetup then
-         for i,expertInput in ipairs(expertInputs) do
-            local expertGradInput = expertGradInputs[i] or expertInput:clone()
-            expertGradInput:resizeAs(expertInput)
-            expertGradInputs[i] = expertGradInput
+      for i,expertInput in ipairs(expertInputs) do
+         local expertGradInput = expertGradInputs[i] or expertInput:clone()
+         expertGradInput:resizeAs(expertInput)
+         expertGradInputs[i] = expertGradInput
+      end
+      gaterGradInput:resizeAs(gaterInput)
+      
+      -- Clear invalid gradients
+      if #expertGradInputs > #expertInputs then 
+         for i=#expertInputs+1, #expertGradInputs do
+            expertGradInputs[i] = nil
          end
-         gaterGradInput:resizeAs(gaterInput)
-         self.backwardSetup = true
       end
       
       -- like CMulTable, but with broadcasting
@@ -114,13 +112,10 @@ function MixtureTable:updateGradInput(input, gradOutput)
          expertGradInput:cmul(gate, gradOutput)     
       end
    else
-      if not self.backwardSetup then
-         self.size2:resize(expertInputs:dim())
-         self.size2:copy(expertInputs:size())
-         self.size2[self.dim] = 1
-         gaterGradInput:resizeAs(gaterInput)
-         self.backwardSetup = true
-      end
+      self.size2:resize(expertInputs:dim())
+      self.size2:copy(expertInputs:size())
+      self.size2[self.dim] = 1
+      gaterGradInput:resizeAs(gaterInput)
       
       -- gater updateGradInput
       self._expertView:view(gradOutput, self.size2)
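
The MixtureTable changes above recompute the output and gradInput shapes on every call
instead of caching them per batch size. A small illustrative forward pass (sizes made up):

```lua
local nn = require 'nn'

local experts = 2
local mixture = nn.MixtureTable()

-- gater output: batch x experts; expert outputs: a table of batch x features tensors
local gater = torch.Tensor(3, experts):fill(0.5)
print(mixture:forward{gater, {torch.randn(3, 4), torch.randn(3, 4)}}:size())   -- 3 x 4

-- shapes are recomputed each call, so changing the batch size between calls is handled
print(mixture:forward{torch.Tensor(5, experts):fill(0.5),
                      {torch.randn(5, 4), torch.randn(5, 4)}}:size())          -- 5 x 4
```
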
diff --git a/MultiLabelSoftMarginCriterion.lua b/MultiLabelSoftMarginCriterion.lua
index a73ef38..9d471d4 100644
--- a/MultiLabelSoftMarginCriterion.lua
+++ b/MultiLabelSoftMarginCriterion.lua
@@ -8,37 +8,79 @@
 -- and with weights:
 -- l(x,y) = - sum_i weights[i] (y[i] * log(p[i]) + (1 - y[i]) * log (1 - p[i]))
 --
---
+-- This uses the stable form of the loss and gradients.
 --]]
 
 
-local MultiLabelSoftMarginCriterion, parent =
-torch.class('nn.MultiLabelSoftMarginCriterion', 'nn.Criterion')
+local MultiLabelSoftMarginCriterion, parent = torch.class('nn.MultiLabelSoftMarginCriterion', 'nn.Criterion')
 
 
-function MultiLabelSoftMarginCriterion:__init(weights)
-    parent.__init(self)
-    self.lsm = nn.Sigmoid()
-    self.nll = nn.BCECriterion(weights)
+function MultiLabelSoftMarginCriterion:__init(weights, sizeAverage)
+   parent.__init(self)
+   if sizeAverage ~= nil then
+      self.sizeAverage = sizeAverage
+   else
+      self.sizeAverage = true
+   end
+   if weights ~= nil then
+      assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+      self.weights = weights
+   end
+   self.sigmoid = nn.Sigmoid()
 end
 
 function MultiLabelSoftMarginCriterion:updateOutput(input, target)
-    input = input:nElement() == 1 and input or input:squeeze()
-    target = target:nElement() == 1 and target or target:squeeze()
-    self.lsm:updateOutput(input)
-    self.nll:updateOutput(self.lsm.output, target)
-    self.output = self.nll.output
-    return self.output
+   local weights = self.weights
+   if weights ~= nil and target:dim() ~= 1 then
+      weights = self.weights:view(1, target:size(2)):expandAs(target)
+   end
+
+   local x = input:view(input:nElement())
+   local t = target:view(target:nElement())
+
+   self.sigmoid:updateOutput(x)
+
+   self._buffer1 = self._buffer1 or input.new()
+   self._buffer2 = self._buffer2 or input.new()
+
+   self._buffer1:ge(x, 0) -- indicator
+
+   -- log(1 + exp(x - cmul(x, indicator):mul(2)))
+   self._buffer2:cmul(x, self._buffer1):mul(-2):add(x):exp():add(1):log()
+   -- cmul(x, t - indicator)
+   self._buffer1:mul(-1):add(t):cmul(x)
+   -- log(1 + exp(x - cmul(x, indicator):mul(2))) - cmul(x, t - indicator)
+   self._buffer2:add(-1, self._buffer1)
+
+   if weights ~= nil then
+      self._buffer2:cmul(weights)
+   end
+
+   self.output = self._buffer2:sum()
+
+   if self.sizeAverage then
+      self.output = self.output / input:nElement()
+   end
+
+   return self.output
 end
 
 function MultiLabelSoftMarginCriterion:updateGradInput(input, target)
-    local size = input:size()
-    input = input:nElement() ==1 and input or input:squeeze()
-    target = target:nElement() == 1 and target or target:squeeze()
-    self.nll:updateGradInput(self.lsm.output, target)
-    self.lsm:updateGradInput(input, self.nll.gradInput)
-    self.gradInput:view(self.lsm.gradInput, size)
-    return self.gradInput
-end
+   local weights = self.weights
+   if weights ~= nil and target:dim() ~= 1 then
+      weights = self.weights:view(1, target:size(2)):expandAs(target)
+   end
+
+   self.gradInput:resizeAs(input):copy(self.sigmoid.output)
+   self.gradInput:add(-1, target)
 
- return nn.MultiLabelSoftMarginCriterion
+   if weights ~= nil then
+      self.gradInput:cmul(weights)
+   end
+
+   if self.sizeAverage then
+      self.gradInput:div(target:nElement())
+   end
+
+   return self.gradInput
+end
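
Assuming the rewrite is numerically equivalent to the previous Sigmoid + BCECriterion
composition (just computed in the stable form described in the header comment), a quick
comparison along these lines can serve as a sanity check; it is not part of the patch and
the tensor sizes are arbitrary.

```lua
local nn = require 'nn'

local input  = torch.randn(4, 6)
local target = torch.Tensor(4, 6):bernoulli()   -- multi-label 0/1 targets

local crit = nn.MultiLabelSoftMarginCriterion()
local sig  = nn.Sigmoid()
local bce  = nn.BCECriterion()

print(crit:forward(input, target))              -- stable formulation
print(bce:forward(sig:forward(input), target))  -- old composition, should agree closely
```
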
diff --git a/SpatialAdaptiveAveragePooling.lua b/SpatialAdaptiveAveragePooling.lua
new file mode 100644
index 0000000..2e22358
--- /dev/null
+++ b/SpatialAdaptiveAveragePooling.lua
@@ -0,0 +1,35 @@
+local SpatialAdaptiveAveragePooling, parent = torch.class('nn.SpatialAdaptiveAveragePooling', 'nn.Module')
+
+function SpatialAdaptiveAveragePooling:__init(W, H)
+   parent.__init(self)
+
+   self.W = W
+   self.H = H
+end
+
+function SpatialAdaptiveAveragePooling:updateOutput(input)
+   input.THNN.SpatialAdaptiveAveragePooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      self.W, self.H
+   )
+   return self.output
+end
+
+function SpatialAdaptiveAveragePooling:updateGradInput(input, gradOutput)
+   input.THNN.SpatialAdaptiveAveragePooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata()
+   )
+   return self.gradInput
+end
+
+-- for backward compat
+function SpatialAdaptiveAveragePooling:empty()
+   self:clearState()
+end
+
+function SpatialAdaptiveAveragePooling:clearState()
+   return parent.clearState(self)
+end
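
A minimal usage sketch of the new adaptive average pooling module (the input size is
chosen arbitrarily); the output spatial size is fixed regardless of the input size.

```lua
local nn = require 'nn'

local pool = nn.SpatialAdaptiveAveragePooling(7, 7)   -- W = 7, H = 7
local x = torch.randn(3, 32, 45)                      -- nInputPlane x height x width
print(pool:forward(x):size())                         -- 3 x 7 x 7, for any input height/width
```
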
diff --git a/THNN.lua b/THNN.lua
index 9100239..0848e9e 100644
--- a/THNN.lua
+++ b/THNN.lua
@@ -55,7 +55,7 @@ local replacements =
 {
    {
       ['TYPE'] = 'Double',
-      ['real'] = 'double',
+      ['accreal'] = 'double',
       ['THTensor'] = 'THDoubleTensor',
       ['THIndexTensor'] = 'THLongTensor',
       ['THIntegerTensor'] = 'THIntTensor',
@@ -64,7 +64,7 @@ local replacements =
    },
    {
       ['TYPE'] = 'Float',
-      ['real'] = 'float',
+      ['accreal'] = 'double',
       ['THTensor'] = 'THFloatTensor',
       ['THIndexTensor'] = 'THLongTensor',
       ['THIntegerTensor'] = 'THIntTensor',
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 24dbfe3..329609a 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -1,3 +1,4 @@
+local THNN = require 'nn.THNN'
 local VolumetricConvolution, parent = torch.class('nn.VolumetricConvolution', 'nn.Module')
 
 function VolumetricConvolution:__init(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH)
@@ -36,15 +37,25 @@ function VolumetricConvolution:reset(stdv)
       self.weight:apply(function()
          return torch.uniform(-stdv, stdv)
       end)
-      self.bias:apply(function()
-         return torch.uniform(-stdv, stdv)
-      end)
+      if self.bias then
+         self.bias:apply(function()
+            return torch.uniform(-stdv, stdv)
+         end)
+      end
    else
       self.weight:uniform(-stdv, stdv)
-      self.bias:uniform(-stdv, stdv)
+      if self.bias then
+         self.bias:uniform(-stdv, stdv)
+      end
    end
 end
 
+function VolumetricConvolution:noBias()
+   self.bias = nil
+   self.gradBias = nil
+   return self
+end
+
 function VolumetricConvolution:updateOutput(input)
    self.finput = self.finput or input.new()
    self.fgradInput = self.fgradInput or input.new()
@@ -53,7 +64,7 @@ function VolumetricConvolution:updateOutput(input)
         input:cdata(),
         self.output:cdata(),
         self.weight:cdata(),
-        self.bias:cdata(),
+        THNN.optionalTensor(self.bias),
         self.finput:cdata(),
         self.fgradInput:cdata(),
         self.dT, self.dW, self.dH,
@@ -64,7 +75,7 @@ function VolumetricConvolution:updateOutput(input)
          input:cdata(),
          self.output:cdata(),
          self.weight:cdata(),
-         self.bias:cdata(),
+         THNN.optionalTensor(self.bias),
          self.finput:cdata(),
          self.kT, self.kW, self.kH,
          self.dT, self.dW, self.dH,
@@ -110,7 +121,7 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
          input:cdata(),
          gradOutput:cdata(),
          self.gradWeight:cdata(),
-         self.gradBias:cdata(),
+         THNN.optionalTensor(self.gradBias),
          self.finput:cdata(),
          self.fgradInput:cdata(),
          self.dT, self.dW, self.dH,
@@ -122,7 +133,7 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
          input:cdata(),
          gradOutput:cdata(),
          self.gradWeight:cdata(),
-         self.gradBias:cdata(),
+         THNN.optionalTensor(self.gradBias),
          self.finput:cdata(),
          self.kT, self.kW, self.kH,
          self.dT, self.dW, self.dH,
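
A hedged sketch of the new `:noBias()` option on VolumetricConvolution (plane counts,
kernel and stride values below are illustrative).

```lua
local nn = require 'nn'

local conv = nn.VolumetricConvolution(3, 8, 3, 3, 3, 1, 1, 1):noBias()
assert(conv.bias == nil and conv.gradBias == nil)

local x = torch.randn(3, 8, 16, 16)   -- nInputPlane x time x height x width
print(conv:forward(x):size())         -- 8 x 6 x 14 x 14
```
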
diff --git a/VolumetricFractionalMaxPooling.lua b/VolumetricFractionalMaxPooling.lua
new file mode 100644
index 0000000..f5ff58c
--- /dev/null
+++ b/VolumetricFractionalMaxPooling.lua
@@ -0,0 +1,175 @@
+local VolumetricFractionalMaxPooling, parent =
+   torch.class('nn.VolumetricFractionalMaxPooling', 'nn.Module')
+
+-- Usage:
+-- nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, outT, outW, outH)
+--   the output should be the exact size (outT x outH x outW)
+-- nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, ratioT, ratioW, ratioH)
+--   the output should be the size (floor(inT x ratioT) x floor(inH x ratioH) x floor(inW x ratioW))
+--   ratios are numbers between (0, 1) exclusive
+function VolumetricFractionalMaxPooling:__init(poolSizeT, poolSizeW, poolSizeH, arg1, arg2, arg3)
+   parent.__init(self)
+   assert(poolSizeT >= 2)
+   assert(poolSizeW >= 2)
+   assert(poolSizeH >= 2)
+
+   -- Pool size (how wide the pooling for each output unit is)
+   self.poolSizeT = poolSizeT
+   self.poolSizeW = poolSizeW
+   self.poolSizeH = poolSizeH
+
+   -- Random samples are drawn for all
+   -- batch * plane * (time, height, width; i.e., 3) points. This determines
+   -- the 3d "pseudorandom" overlapping pooling regions for each
+   -- (batch element x input plane). A new set of random samples is
+   -- drawn every updateOutput call, unless we disable it via
+   -- :fixPoolingRegions().
+   self.randomSamples = nil
+
+   -- Flag to disable re-generation of random samples for producing
+   -- a new pooling. For testing purposes
+   self.newRandomPool = false
+
+   if arg1 >= 1 and arg2 >= 1 and arg3 >= 1 then
+      -- Desired output size: the input tensor will determine the reduction
+      -- ratio
+      self.outT = arg1
+      self.outW = arg2
+      self.outH = arg3
+   else
+      -- Reduction ratio specified per each input
+      -- This is the reduction ratio that we use
+      self.ratioT = arg1
+      self.ratioW = arg2
+      self.ratioH = arg3
+
+      -- The reduction ratio must be between 0 and 1
+      assert(self.ratioT > 0 and self.ratioT < 1)
+      assert(self.ratioW > 0 and self.ratioW < 1)
+      assert(self.ratioH > 0 and self.ratioH < 1)
+   end
+end
+
+function VolumetricFractionalMaxPooling:getBufferSize_(input)
+   local batchSize = 0
+   local planeSize = 0
+
+   if input:nDimension() == 4 then
+      batchSize = 1
+      planeSize = input:size(1)
+   elseif input:nDimension() == 5 then
+      batchSize = input:size(1)
+      planeSize = input:size(2)
+   else
+      error('input must be dim 4 or 5')
+   end
+
+   return torch.LongStorage({batchSize, planeSize, 3})
+end
+
+function VolumetricFractionalMaxPooling:initSampleBuffer_(input)
+   local sampleBufferSize = self:getBufferSize_(input)
+
+   if self.randomSamples == nil then
+      self.randomSamples = input.new():resize(sampleBufferSize):uniform()
+   elseif (self.randomSamples:size(1) ~= sampleBufferSize[1] or
+           self.randomSamples:size(2) ~= sampleBufferSize[2]) then
+      self.randomSamples:resize(sampleBufferSize):uniform()
+   else
+      if not self.newRandomPool then
+         -- Create new pooling windows, since this is a subsequent call
+         self.randomSamples:uniform()
+      end
+   end
+end
+
+function VolumetricFractionalMaxPooling:getOutputSizes_(input)
+   local outT = self.outT
+   local outW = self.outW
+   local outH = self.outH
+   if self.ratioW ~= nil and self.ratioH ~= nil then
+      if input:nDimension() == 5 then
+         outT = math.floor(input:size(5) * self.ratioT)
+         outW = math.floor(input:size(4) * self.ratioW)
+         outH = math.floor(input:size(3) * self.ratioH)
+      elseif input:nDimension() == 4 then
+         outT = math.floor(input:size(4) * self.ratioT)
+         outW = math.floor(input:size(3) * self.ratioW)
+         outH = math.floor(input:size(2) * self.ratioH)
+      else
+         error('input must be dim 4 or 5')
+      end
+
+      -- Neither can be smaller than 1
+      assert(outT > 0, 'reduction ratio or input time too small')
+      assert(outW > 0, 'reduction ratio or input width too small')
+      assert(outH > 0, 'reduction ratio or input height too small')
+   else
+      assert(outT ~= nil and outW ~= nil and outH ~= nil)
+   end
+
+   return outT, outW, outH
+end
+
+-- Call this to turn off regeneration of random pooling regions each
+-- updateOutput call.
+function VolumetricFractionalMaxPooling:fixPoolingRegions(val)
+   if val == nil then
+      val = true
+   end
+
+   self.newRandomPool = val
+   return self
+end
+
+function VolumetricFractionalMaxPooling:updateOutput(input)
+   self.indices = self.indices or torch.LongTensor()
+   if torch.typename(input):find('torch%.Cuda.*Tensor') then
+      self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+   else
+      self.indices = self.indices:long()
+   end
+   self:initSampleBuffer_(input)
+   local outT, outW, outH = self:getOutputSizes_(input)
+
+   input.THNN.VolumetricFractionalMaxPooling_updateOutput(
+      input:cdata(),
+      self.output:cdata(),
+      outT, outW, outH, self.poolSizeT, self.poolSizeW, self.poolSizeH,
+      self.indices:cdata(), self.randomSamples:cdata())
+   return self.output
+end
+
+function VolumetricFractionalMaxPooling:updateGradInput(input, gradOutput)
+   assert(self.randomSamples ~= nil,
+          'must call updateOutput/forward first')
+
+   local outT, outW, outH = self:getOutputSizes_(input)
+
+   input.THNN.VolumetricFractionalMaxPooling_updateGradInput(
+      input:cdata(),
+      gradOutput:cdata(),
+      self.gradInput:cdata(),
+      outT, outW, outH, self.poolSizeT, self.poolSizeW, self.poolSizeH,
+      self.indices:cdata())
+   return self.gradInput
+end
+
+-- backward compat
+function VolumetricFractionalMaxPooling:empty()
+   self:clearState()
+end
+
+function VolumetricFractionalMaxPooling:clearState()
+   self.indices = nil
+   self.randomSamples = nil
+   return parent.clearState(self)
+end
+
+function VolumetricFractionalMaxPooling:__tostring__()
+   return string.format('%s(%dx%dx%d, %d,%d,%d)', torch.type(self),
+                        self.outT and self.outT or self.ratioT,
+                        self.outW and self.outW or self.ratioW,
+                        self.outH and self.outH or self.ratioH,
+                        self.poolSizeT, self.poolSizeW, self.poolSizeH)
+end
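
A usage sketch of the new 3D fractional max-pooling module, exercising both constructors
described in the header comment; all sizes below are illustrative.

```lua
local nn = require 'nn'

local x = torch.randn(4, 16, 16, 16)   -- nInputPlane x height x width x time

-- exact output size
local pool = nn.VolumetricFractionalMaxPooling(2, 2, 2, 8, 8, 8)
print(pool:forward(x):size())          -- 4 x 8 x 8 x 8

-- reduction ratios: the output is floor(input size * ratio) in each pooled dimension
local pool2 = nn.VolumetricFractionalMaxPooling(2, 2, 2, 0.5, 0.5, 0.5)
print(pool2:forward(x):size())         -- 4 x 8 x 8 x 8 for this input
```
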
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 58eaa1d..0ce2340 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -1,3 +1,4 @@
+local THNN = require 'nn.THNN'
 local VolumetricFullConvolution, parent = torch.class('nn.VolumetricFullConvolution','nn.Module')
 
 function VolumetricFullConvolution:__init(nInputPlane, nOutputPlane,
@@ -77,6 +78,13 @@ function VolumetricFullConvolution:backCompatibility()
    self.adjT = self.adjT or 0
 end
 
+
+function VolumetricFullConvolution:noBias()
+   self.bias = nil
+   self.gradBias = nil
+   return self
+end
+
 function VolumetricFullConvolution:updateOutput(input)
    self:backCompatibility()
 
@@ -101,7 +109,7 @@ function VolumetricFullConvolution:updateOutput(input)
       inputTensor:cdata(),
       self.output:cdata(),
       self.weight:cdata(),
-      self.bias:cdata(),
+      THNN.optionalTensor(self.bias),
       self.finput:cdata(),
       self.fgradInput:cdata(),
       self.dT, self.dW, self.dH,
@@ -185,7 +193,7 @@ function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
       inputTensor:cdata(),
       gradOutput:cdata(),
       self.gradWeight:cdata(),
-      self.gradBias:cdata(),
+      THNN.optionalTensor(self.gradBias),
       self.finput:cdata(),
       self.fgradInput:cdata(),
       self.dT, self.dW, self.dH,
diff --git a/WeightNorm.lua b/WeightNorm.lua
new file mode 100644
index 0000000..ad832b7
--- /dev/null
+++ b/WeightNorm.lua
@@ -0,0 +1,165 @@
+-- Weight Normalization
+-- https://arxiv.org/pdf/1602.07868v3.pdf
+local WeightNorm, parent = torch.class("nn.WeightNorm", "nn.Container")
+
+function WeightNorm:__init(module, outputDim)
+    -- this container will apply Weight Normalization to any module it wraps
+    -- it accepts parameter ``outputDim`` that represents the dimension of the output of the weight
+    -- if outputDim is not 1, the container will transpose the weight
+    -- if the weight is not 2D, the container will view the weight into a 2D shape
+    -- that is nOut x (nIn x kw x dw x ...)
+
+    parent.__init(self)
+    assert(module.weight)
+
+    if module.bias then
+        self.bias = module.bias
+        self.gradBias = module.gradBias
+    end
+    self.gradWeight = module.gradWeight
+    self.weight = module.weight
+
+    self.outputDim = outputDim or 1
+
+    -- track the non-output weight dimensions
+    self.otherDims = 1
+    for i = 1, self.weight:dim() do
+        if i ~= self.outputDim then
+            self.otherDims = self.otherDims * self.weight:size(i)
+        end
+    end
+
+    -- view size for weight norm 2D calculations
+    self.viewIn = torch.LongStorage({self.weight:size(self.outputDim), self.otherDims})
+
+    -- view size back to original weight
+    self.viewOut = self.weight:size()
+
+    -- bubble outputDim size up to the front
+    for i = self.outputDim - 1, 1, -1 do
+        self.viewOut[i], self.viewOut[i + 1] = self.viewOut[i + 1], self.viewOut[i]
+    end
+
+    -- weight is reparametrized to decouple the length from the direction
+    -- such that w = g * ( v / ||v|| )
+    self.v = torch.Tensor(self.viewIn[1], self.viewIn[2])
+    self.g = torch.Tensor(self.viewIn[1])
+
+    self._norm = torch.Tensor(self.viewIn[1])
+    self._scale = torch.Tensor(self.viewIn[1])
+
+    -- gradient of g
+    self.gradG = torch.Tensor(self.viewIn[1]):zero()
+    -- gradient of v
+    self.gradV = torch.Tensor(self.viewIn)
+
+    self.modules[1] = module
+    self:resetInit()
+end
+
+function WeightNorm:permuteIn(inpt)
+    local ans = inpt
+    for i = self.outputDim - 1, 1, -1 do
+        ans = ans:transpose(i, i+1)
+    end
+    return ans
+end
+
+function WeightNorm:permuteOut(inpt)
+    local ans = inpt
+    for i = 1, self.outputDim - 1 do
+        ans = ans:transpose(i, i+1)
+    end
+    return ans
+end
+
+function WeightNorm:resetInit(inputSize, outputSize)
+    self.v:normal(0, math.sqrt(2/self.viewIn[2]))
+    self.g:norm(self.v, 2, 2)
+    if self.bias then
+        self.bias:zero()
+    end
+end
+
+function WeightNorm:updateOutput(input)
+    -- view to 2D when weight norm container operates
+    self.gradV:copy(self:permuteIn(self.weight))
+    self.gradV = self.gradV:view(self.viewIn)
+
+    -- ||w||
+    self._norm:norm(self.v, 2, 2):pow(2):add(10e-5):sqrt()
+    -- g * w / ||w||
+    self.gradV:copy(self.v)
+    self._scale:copy(self.g):cdiv(self._norm)
+    self.gradV:cmul(self._scale:view(self.viewIn[1], 1)
+                                :expand(self.viewIn[1], self.viewIn[2]))
+
+    -- otherwise maintain size of original module weight
+    self.gradV = self.gradV:view(self.viewOut)
+
+    self.weight:copy(self:permuteOut(self.gradV))
+    self.output:set(self.modules[1]:updateOutput(input))
+    return self.output
+end
+
+function WeightNorm:accGradParameters(input, gradOutput, scale)
+    scale = scale or 1
+    self.modules[1]:accGradParameters(input, gradOutput, scale)
+
+    self.weight:copy(self:permuteIn(self.weight))
+    self.gradV:copy(self:permuteIn(self.gradWeight))
+    self.weight = self.weight:view(self.viewIn)
+
+    local norm = self._norm:view(self.viewIn[1], 1):expand(self.viewIn[1], self.viewIn[2])
+    local scale = self._scale:view(self.viewIn[1], 1):expand(self.viewIn[1], self.viewIn[2])
+
+    -- dL / dw * (w / ||w||)
+    self.weight:copy(self.gradV)
+    self.weight:cmul(self.v):cdiv(norm)
+    self.gradG:sum(self.weight, 2)
+
+    -- dL / dw * g / ||w||
+    self.gradV:cmul(scale)
+
+    -- dL / dg * (w * g / ||w||^2)
+    self.weight:copy(self.v):cmul(scale):cdiv(norm)
+    self.weight:cmul(self.gradG:view(self.viewIn[1], 1)
+                            :expand(self.viewIn[1], self.viewIn[2]))
+
+    -- dL / dv update
+    self.gradV:add(-1, self.weight)
+
+    self.gradV = self.gradV:view(self.viewOut)
+    self.weight = self.weight:view(self.viewOut)
+    self.gradWeight:copy(self:permuteOut(self.gradV))
+end
+
+function WeightNorm:updateGradInput(input, gradOutput)
+    self.gradInput:set(self.modules[1]:updateGradInput(input, gradOutput))
+    return self.gradInput
+end
+
+function WeightNorm:zeroGradParameters()
+    self.modules[1]:zeroGradParameters()
+    self.gradV:zero()
+    self.gradG:zero()
+end
+
+function WeightNorm:updateParameters(lr)
+    self.modules[1]:updateParameters(lr)
+    self.g:add(-lr, self.gradG)
+    self.v:add(-lr, self.gradV)
+end
+
+function WeightNorm:parameters()
+    if self.bias then
+        return {self.v, self.g, self.bias}, {self.gradV, self.gradG, self.gradBias}
+    else
+        return {self.v, self.g}, {self.gradV, self.gradG}
+    end
+end
+
+function WeightNorm:__tostring__()
+    local str = 'nn.WeightNorm [' .. tostring(self.modules[1]) .. ']'
+    return str
+end
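
A short sketch of wrapping an existing layer in the new nn.WeightNorm container (the
wrapped layer and its sizes are arbitrary); the trainable parameters become the direction
`v` and the gain `g`, plus the wrapped module's bias if it has one.

```lua
local nn = require 'nn'

local wn = nn.WeightNorm(nn.Linear(10, 5))   -- outputDim defaults to 1
local y = wn:forward(torch.randn(2, 10))

local params, gradParams = wn:parameters()
print(#params)                               -- 3: v, g and the Linear bias
```
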
diff --git a/doc/containers.md b/doc/containers.md
index 98264fa..bff4fab 100644
--- a/doc/containers.md
+++ b/doc/containers.md
@@ -8,14 +8,14 @@ Complex neural networks are easily built using container classes:
     * [Concat](#nn.Concat) : concatenates in one layer several modules along dimension `dim` ;
       * [DepthConcat](#nn.DepthConcat) : like Concat, but adds zero-padding when non-`dim` sizes don't match;
     * [Bottle](#nn.Bottle) : allows any dimensionality input be forwarded through a module ;
- 
+
 See also the [Table Containers](#nn.TableContainers) for manipulating tables of [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md).
 
 <a name="nn.Container"></a>
 ## Container ##
 
 This is an abstract [Module](module.md#nn.Module) class which declares methods defined in all containers.
-It reimplements many of the Module methods such that calls are propagated to the 
+It reimplements many of the Module methods such that calls are propagated to the
 contained modules. For example, a call to [zeroGradParameters](module.md#nn.Module.zeroGradParameters)
 will be propagated to all contained modules.
 
@@ -37,7 +37,7 @@ Returns the number of contained modules.
 Sequential provides a means to plug layers together
 in a feed-forward fully connected manner.
 
-E.g. 
+E.g.
 creating a one hidden-layer multi-layer perceptron is thus just as easy as:
 ```lua
 mlp = nn.Sequential()
@@ -104,17 +104,17 @@ nn.Sequential {
 
 `module` = `Parallel(inputDimension,outputDimension)`
 
-Creates a container module that applies its `ith` child module to the  `ith` slice of the input Tensor by using [select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-selectdim-index) 
+Creates a container module that applies its `ith` child module to the  `ith` slice of the input Tensor by using [select](https://github.com/torch/torch7/blob/master/doc/tensor.md#tensor-selectdim-index)
 on dimension `inputDimension`. It concatenates the results of its contained modules together along dimension `outputDimension`.
 
 Example:
 ```lua
 mlp = nn.Parallel(2,1);   -- Parallel container will associate a module to each slice of dimension 2
                            -- (column space), and concatenate the outputs over the 1st dimension.
-                           
+
 mlp:add(nn.Linear(10,3)); -- Linear module (input 10, output 3), applied on 1st slice of dimension 2
 mlp:add(nn.Linear(10,2))  -- Linear module (input 10, output 2), applied on 2nd slice of dimension 2
- 
+
                                   -- After going through the Linear module the outputs are
                                   -- concatenated along the unique dimension, to form 1D Tensor
 > mlp:forward(torch.randn(10,2)) -- of size 5.
@@ -131,8 +131,8 @@ A more complicated example:
 
 mlp = nn.Sequential();
 c = nn.Parallel(1,2)     -- Parallel container will associate a module to each slice of dimension 1
-                         -- (row space), and concatenate the outputs over the 2nd dimension.           
-                         
+                         -- (row space), and concatenate the outputs over the 2nd dimension.
+
 for i=1,10 do            -- Add 10 Linear+Reshape modules in parallel (input = 3, output = 2x1)
  local t=nn.Sequential()
  t:add(nn.Linear(3,2))   -- Linear module (input = 3, output = 2)
@@ -165,7 +165,7 @@ for i = 1, 10000 do     -- Train for a few iterations
  local err = criterion:forward(pred,y)
  local gradCriterion = criterion:backward(pred,y);
  mlp:zeroGradParameters();
- mlp:backward(x, gradCriterion); 
+ mlp:backward(x, gradCriterion);
  mlp:updateParameters(0.01);
  print(err)
 end
@@ -209,16 +209,16 @@ module = nn.DepthConcat(dim)
 DepthConcat concatenates the output of one layer of "parallel" modules along the
 provided dimension `dim`: they take the same inputs, and their output is
 concatenated. For dimensions other than `dim` having different sizes,
-the smaller tensors are copied in the center of the output tensor, 
+the smaller tensors are copied in the center of the output tensor,
 effectively padding the borders with zeros.
 
-The module is particularly useful for concatenating the output of [Convolutions](convolution.md) 
-along the depth dimension (i.e. `nOutputFrame`). 
-This is used to implement the *DepthConcat* layer 
+The module is particularly useful for concatenating the output of [Convolutions](convolution.md)
+along the depth dimension (i.e. `nOutputFrame`).
+This is used to implement the *DepthConcat* layer
 of the [Going deeper with convolutions](http://arxiv.org/pdf/1409.4842v1.pdf) article.
-The normal [Concat](#nn.Concat) Module can't be used since the spatial 
-dimensions (height and width) of the output Tensors requiring concatenation 
-may have different values. To deal with this, the output uses the largest 
+The normal [Concat](#nn.Concat) Module can't be used since the spatial
+dimensions (height and width) of the output Tensors requiring concatenation
+may have different values. To deal with this, the output uses the largest
 spatial dimensions and adds zero-padding around the smaller Tensors.
 ```lua
 inputSize = 3
@@ -231,7 +231,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 3, 3))
 mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
 
 > print(mlp:forward(input))
-(1,.,.) = 
+(1,.,.) =
  -0.2874  0.6255  1.1122  0.4768  0.9863 -0.2201 -0.1516
   0.2779  0.9295  1.1944  0.4457  1.1470  0.9693  0.1654
  -0.5769 -0.4730  0.3283  0.6729  1.3574 -0.6610  0.0265
@@ -240,7 +240,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
   0.4147  0.5062  0.6251  0.4374  0.3252  0.3478  0.0046
   0.7845 -0.0902  0.3499  0.0342  1.0706 -0.0605  0.5525
 
-(2,.,.) = 
+(2,.,.) =
  -0.7351 -0.9327 -0.3092 -1.3395 -0.4596 -0.6377 -0.5097
  -0.2406 -0.2617 -0.3400 -0.4339 -0.3648  0.1539 -0.2961
  -0.7124 -1.2228 -0.2632  0.1690  0.4836 -0.9469 -0.7003
@@ -249,7 +249,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
  -0.3086 -0.0298 -0.2031  0.1026 -0.5785 -0.3275 -0.1630
   0.0596 -0.6097  0.1443 -0.8603 -0.2774 -0.4506 -0.5367
 
-(3,.,.) = 
+(3,.,.) =
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
   0.0000 -0.7326  0.3544  0.1821  0.4796  1.0164  0.0000
   0.0000 -0.9195 -0.0567 -0.1947  0.0169  0.1924  0.0000
@@ -258,7 +258,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
   0.0000 -0.1911  0.2912  0.5092  0.2955  0.7171  0.0000
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
 
-(4,.,.) = 
+(4,.,.) =
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
   0.0000 -0.8263  0.3646  0.6750  0.2062  0.2785  0.0000
   0.0000 -0.7572  0.0432 -0.0821  0.4871  1.9506  0.0000
@@ -267,7 +267,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
   0.0000  0.2570  0.4694 -0.1262  0.5602  0.0821  0.0000
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
 
-(5,.,.) = 
+(5,.,.) =
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
   0.0000  0.3158  0.4389 -0.0485 -0.2179  0.0000  0.0000
   0.0000  0.1966  0.6185 -0.9563 -0.3365  0.0000  0.0000
@@ -276,7 +276,7 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
 
-(6,.,.) = 
+(6,.,.) =
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
   0.0000  1.1148  0.2324 -0.1093  0.5024  0.0000  0.0000
   0.0000 -0.2624 -0.5863  0.3444  0.3506  0.0000  0.0000
@@ -286,11 +286,11 @@ mlp:add(nn.SpatialConvolutionMM(inputSize, outputSize, 4, 4))
   0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
 [torch.DoubleTensor of dimension 6x7x7]
 ```
-Note how the last 2 of 6 filter maps have 1 column of zero-padding 
-on the left and top, as well as 2 on the right and bottom. 
+Note how the last 2 of 6 filter maps have 1 column of zero-padding
+on the left and top, as well as 2 on the right and bottom.
 This is inevitable when the component
-module output tensors non-`dim` sizes aren't all odd or even. 
-Such that in order to keep the mappings aligned, one need 
+module output tensors non-`dim` sizes aren't all odd or even.
+Such that in order to keep the mappings aligned, one need
 only ensure that these be all odd (or even).
 
 <a name="nn.Bottle"></a>
@@ -323,6 +323,17 @@ mlp = nn.Bottle(nn.Linear(10, 2))
 [torch.LongStorage of size 4]
 ```
 
+<a name="nn.WeightNorm"></a>
+## Weight Normalization ##
+
+```lua
+module = nn.WeightNorm(module)
+```
+
+WeightNorm implements the reparametrization presented in [Weight Normalization](https://arxiv.org/pdf/1602.07868v3.pdf), which decouples the length of neural network weight vectors from their direction. The weight vector `w` is instead determined by the parameters `g` and `v` such that `w = g * v / ||v||`, where `||v||` is the Euclidean norm of `v`. This container can wrap nn layers with weights.
+
+It accepts a parameter `outputDim` that represents the output dimension of the weight of the wrapped module and defaults to 1. If `outputDim` is not 1, the container will transpose the weight appropriately. If the module weight is not 2D, the container will view the weight into an appropriate 2D shape based on the `outputDim` specified by the user.
+
 <a name="nn.TableContainers"></a>
 ## Table Containers ##
 While the above containers are used for manipulating input [Tensors](https://github.com/torch/torch7/blob/master/doc/tensor.md), table containers are used for manipulating tables :
diff --git a/doc/convolution.md b/doc/convolution.md
index d87a749..3dde128 100644
--- a/doc/convolution.md
+++ b/doc/convolution.md
@@ -21,6 +21,7 @@ A convolution is an integral that expresses the amount of overlap of one functio
     * [SpatialFractionalMaxPooling](#nn.SpatialFractionalMaxPooling) : a 2D fractional max-pooling operation over an input image ;
     * [SpatialAveragePooling](#nn.SpatialAveragePooling) : a 2D average-pooling operation over an input image ;
     * [SpatialAdaptiveMaxPooling](#nn.SpatialAdaptiveMaxPooling) : a 2D max-pooling operation which adapts its parameters dynamically such that the output is of fixed size ;
+    * [SpatialAdaptiveAveragePooling](#nn.SpatialAdaptiveAveragePooling) : a 2D average-pooling operation which adapts its parameters dynamically such that the output is of fixed size ;
     * [SpatialMaxUnpooling](#nn.SpatialMaxUnpooling) : a 2D max-unpooling operation ;
     * [SpatialLPPooling](#nn.SpatialLPPooling) : computes the `p` norm in a convolutional manner on a set of input images ;
     * [SpatialConvolutionMap](#nn.SpatialConvolutionMap) : a 2D convolution that uses a generic connection table ;
@@ -31,14 +32,15 @@ A convolution is an integral that expresses the amount of overlap of one functio
     * [SpatialCrossMapLRN](#nn.SpatialCrossMapLRN) : a spatial local response normalization between feature maps ;
     * [SpatialBatchNormalization](#nn.SpatialBatchNormalization): mean/std normalization over the mini-batch inputs and pixels, with an optional affine transform that follows
 a kernel for computing the weighted average in a neighborhood ;
-    * [SpatialUpsamplingNearest](#nn.SpatialUpSamplingNearest): A simple nearest neighbor upsampler applied to every channel of the feature map.
-    * [SpatialUpsamplingBilinear](#nn.SpatialUpSamplingNearest): A simple bilinear upsampler applied to every channel of the feature map.
+    * [SpatialUpSamplingNearest](#nn.SpatialUpSamplingNearest): A simple nearest neighbor upsampler applied to every channel of the feature map.
+    * [SpatialUpSamplingBilinear](#nn.SpatialUpSamplingBilinear): A simple bilinear upsampler applied to every channel of the feature map.
   * [Volumetric Modules](#nn.VolumetricModules) apply to inputs with three-dimensional relationships (e.g. videos) :
     * [VolumetricConvolution](#nn.VolumetricConvolution) : a 3D convolution over an input video (a sequence of images) ;
     * [VolumetricFullConvolution](#nn.VolumetricFullConvolution) : a 3D full convolution over an input video (a sequence of images) ;
     * [VolumetricDilatedConvolution](#nn.VolumetricDilatedConvolution) : a 3D dilated convolution over an input image ;
     * [VolumetricMaxPooling](#nn.VolumetricMaxPooling) : a 3D max-pooling operation over an input video.
     * [VolumetricDilatedMaxPooling](#nn.VolumetricDilatedMaxPooling) : a 3D dilated max-pooling operation over an input video ;
+    * [VolumetricFractionalMaxPooling](#nn.VolumetricFractionalMaxPooling) : a 3D fractional max-pooling operation over an input image ;
     * [VolumetricAveragePooling](#nn.VolumetricAveragePooling) : a 3D average-pooling operation over an input video.
     * [VolumetricMaxUnpooling](#nn.VolumetricMaxUnpooling) : a 3D max-unpooling operation.
     * [VolumetricReplicationPadding](#nn.VolumetricReplicationPadding) : Pads a volumetric feature map with the value at the edge of the input borders. ;
@@ -538,8 +540,8 @@ The parameters are the following:
 If the input image is a 3D tensor `nInputPlane x height x width`, the output image size
 will be `nOutputPlane x oheight x owidth` where
 ```lua
-owidth  = floor(width + 2 * padW - dilationW * (kW-1) - 1) / dW + 1
-oheight = floor(height + 2 * padH - dilationH * (kH-1) - 1) / dH + 1
+owidth  = floor((width + 2 * padW - dilationW * (kW-1) - 1) / dW) + 1
+oheight = floor((height + 2 * padH - dilationH * (kH-1) - 1) / dH) + 1
 ```
 
 Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
@@ -722,6 +724,19 @@ y_i_start = floor((i   /oheight) * iheight)
 y_i_end   = ceil(((i+1)/oheight) * iheight)
 ```
 
+<a name="nn.SpatialAdaptiveAveragePooling"></a>
+### SpatialAdaptiveAveragePooling ###
+
+```lua
+module = nn.SpatialAdaptiveAveragePooling(W, H)
+```
+
+Applies a 2D average-pooling operation over an image such that the output is
+of size `WxH`, for any input size. The number of output features is equal
+to the number of input planes.
+
+The pooling region algorithm is the same as that in [SpatialAdaptiveMaxPooling](#nn.SpatialAdaptiveMaxPooling).
+
 <a name="nn.SpatialMaxUnpooling"></a>
 ### SpatialMaxUnpooling ###
 
@@ -940,7 +955,7 @@ The learning of gamma and beta is optional.
 
    In training time, this layer keeps a running estimate of it's computed mean and std.
    The running sum is kept with a default momentup of 0.1 (unless over-ridden)
-   In test time, this running mean/std is used to normalize.
+   In test time, this running mean/std is used to normalize. (**Note that the running mean/std will not be saved if one only checkpoints a model's parameters. In order to correctly use the calculated running mean/std, one needs to checkpoint the model itself (call [clearState()](https://github.com/torch/nn/blob/master/doc/module.md#clearstate) first to save space).**)
 
 
 
@@ -989,7 +1004,7 @@ The parameters are the following:
 
 Note that depending of the size of your kernel, several (of the last)
 columns or rows of the input image might be lost. It is up to the user to
-add proper padding in images.
+add proper padding in images. This layer can be used without a bias by calling `module:noBias()`.
 
 If the input image is a 4D tensor `nInputPlane x time x height x width`, the output image size
 will be `nOutputPlane x otime x oheight x owidth` where
@@ -1015,7 +1030,7 @@ Applies a 3D full convolution over an input image composed of several input plan
 `forward(input)` is expected to be a 4D or 5D tensor. Note that instead of setting `adjT`, `adjW` and `adjH`, VolumetricFullConvolution also accepts a table input with two tensors: `{convInput, sizeTensor}` where `convInput` is the standard input on which the full convolution is applied, and the size of `sizeTensor` is used to set the size of the output. Using the two-input version of forward
 will ignore the `adjT`, `adjW` and `adjH` values used to construct the module.
 
-This can be used as 3D deconvolution, or 3D upsampling. So that the 3D FCN can be easly implemented.
+This can be used as a 3D deconvolution, or for 3D upsampling, so that a 3D FCN can be easily implemented. This layer can be used without a bias by calling `module:noBias()`.
 
 The parameters are the following:
 * `nInputPlane`: The number of expected input planes in the image given into `forward()`.
@@ -1070,9 +1085,9 @@ The parameters are the following:
 If the input image is a 4D tensor `nInputPlane x depth x height x width`, the output image size
 will be `nOutputPlane x odepth x oheight x owidth` where
 ```lua
-odepth  = floor(depth + 2 * padT - dilationT * (kT-1) + 1) / dT + 1
-owidth  = floor(width + 2 * padW - dilationW * (kW-1) + 1) / dW + 1
-oheight = floor(height + 2 * padH - dilationH * (kH-1) + 1) / dH + 1
+odepth  = floor((depth + 2 * padT - dilationT * (kT-1) + 1) / dT) + 1
+owidth  = floor((width + 2 * padW - dilationW * (kW-1) + 1) / dW) + 1
+oheight = floor((height + 2 * padH - dilationH * (kH-1) + 1) / dH) + 1
 ```
 
 Further information about the dilated convolution can be found in the following paper: [Multi-Scale Context Aggregation by Dilated Convolutions](http://arxiv.org/abs/1511.07122).
@@ -1112,6 +1127,46 @@ oheight = op((height - (dilationH * (kH - 1) + 1) + 2*padH) / dH + 1)
 `op` is a rounding operator. By default, it is `floor`. It can be changed
 by calling `:ceil()` or `:floor()` methods.
 
+<a name="nn.VolumetricFractionalMaxPooling"></a>
+### VolumetricFractionalMaxPooling ###
+
+```lua
+module = nn.VolumetricFractionalMaxPooling(kT, kW, kH, outT, outW, outH)
+--   the output should be the exact size (outH x outW x outT)
+OR
+module = nn.VolumetricFractionalMaxPooling(kT, kW, kH, ratioT, ratioW, ratioH)
+--   the output should be the size (floor(inH x ratioH) x floor(inW x ratioW) x floor(inT x ratioT))
+--   ratios are numbers between (0, 1) exclusive
+```
+
+Applies a 3D fractional max-pooling operation in "pseudorandom" mode, analogous to [SpatialFractionalMaxPooling](#nn.SpatialFractionalMaxPooling).
+
+The max-pooling operation is applied in `kTxkWxkH` regions by a stochastic step size determined by the target output size.
+The number of output features is equal to the number of input planes.
+
+There are two constructors available.
+
+Constructor 1:
+```lua
+module = nn.VolumetricFractionalMaxPooling(kT, kW, kH, outT, outW, outH)
+```
+
+Constructor 2:
+```lua
+module = nn.VolumetricFractionalMaxPooling(kT, kW, kH, ratioT, ratioW, ratioH)
+```
+If the input image is a 4D tensor `nInputPlane x height x width x time`, the output
+image size will be `nOutputPlane x oheight x owidth x otime`
+
+where
+
+```lua
+otime  = floor(time * ratioT)
+owidth  = floor(width * ratioW)
+oheight = floor(height * ratioH)
+```
+The ratios are numbers between 0 and 1, exclusive.
+
 <a name="nn.VolumetricAveragePooling"></a>
 ### VolumetricAveragePooling ###
 
diff --git a/doc/image/lena.jpg b/doc/image/lena.jpg
new file mode 100644
index 0000000..d4a8c36
Binary files /dev/null and b/doc/image/lena.jpg differ
diff --git a/doc/training.md b/doc/training.md
index 165b2d5..c2155c0 100644
--- a/doc/training.md
+++ b/doc/training.md
@@ -3,8 +3,8 @@
 
 Training a neural network is easy with a [simple `for` loop](#nn.DoItYourself).  Typically however we would
 use the `optim` optimizer, which implements some cool functionalities, like Nesterov momentum,
-[adagrad](https://github.com/torch/optim/blob/master/doc/index.md#x-adagradopfunc-x-config-state) and
-[adam](https://github.com/torch/optim/blob/master/doc/index.md#x-adamopfunc-x-config-state).
+[adagrad](https://github.com/torch/optim/blob/master/doc/algos.md#optim.adagrad) and
+[adam](https://github.com/torch/optim/blob/master/doc/algos.md#optim.adam).
 
 We will demonstrate using a for-loop first, to show the low-level view of what happens in training.  [StochasticGradient](#nn.StochasticGradient), a simple class
 which does the job for you, is provided as standard.  Finally, [`optim`](https://github.com/torch/optim) is a powerful module,
diff --git a/init.lua b/init.lua
index 66ef8f5..97ec910 100644
--- a/init.lua
+++ b/init.lua
@@ -16,6 +16,7 @@ require('nn.Parallel')
 require('nn.Sequential')
 require('nn.DepthConcat')
 require('nn.Bottle')
+require('nn.WeightNorm')
 
 require('nn.Linear')
 require('nn.Bilinear')
@@ -32,6 +33,7 @@ require('nn.Unsqueeze')
 require('nn.Replicate')
 require('nn.Transpose')
 require('nn.BatchNormalization')
+require('nn.LayerNormalization')
 require('nn.Padding')
 require('nn.GradientReversal')
 require('nn.MaskedSelect')
@@ -86,6 +88,7 @@ require('nn.Sqrt')
 require('nn.HardShrink')
 require('nn.SoftShrink')
 require('nn.Threshold')
+require('nn.Maxout')
 require('nn.ReLU')
 require('nn.ReLU6')
 require('nn.PReLU')
@@ -113,6 +116,7 @@ require('nn.SpatialFractionalMaxPooling')
 require('nn.SpatialLPPooling')
 require('nn.SpatialAveragePooling')
 require('nn.SpatialAdaptiveMaxPooling')
+require('nn.SpatialAdaptiveAveragePooling')
 require('nn.TemporalConvolution')
 require('nn.TemporalSubSampling')
 require('nn.TemporalMaxPooling')
@@ -134,6 +138,7 @@ require('nn.VolumetricFullConvolution')
 require('nn.VolumetricDilatedConvolution')
 require('nn.VolumetricMaxPooling')
 require('nn.VolumetricDilatedMaxPooling')
+require('nn.VolumetricFractionalMaxPooling')
 require('nn.VolumetricMaxUnpooling')
 require('nn.VolumetricAveragePooling')
 require('nn.VolumetricBatchNormalization')
diff --git a/lib/THNN/generic/ELU.c b/lib/THNN/generic/ELU.c
index 784a203..ddcfb97 100644
--- a/lib/THNN/generic/ELU.c
+++ b/lib/THNN/generic/ELU.c
@@ -6,9 +6,10 @@ void THNN_(ELU_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real alpha,
+          accreal alpha_,
           bool inplace)
-{  
+{
+  real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
   if(inplace) {
     TH_TENSOR_APPLY(real, input,
       if(*input_data <= 0) {
@@ -30,9 +31,10 @@ void THNN_(ELU_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output,
-          real alpha,
+          accreal alpha_,
           bool inplace)
 {
+  real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   if(inplace) {
     TH_TENSOR_APPLY2(real, gradOutput, real, output,
diff --git a/lib/THNN/generic/HardShrink.c b/lib/THNN/generic/HardShrink.c
index 50d272c..aaae85b 100644
--- a/lib/THNN/generic/HardShrink.c
+++ b/lib/THNN/generic/HardShrink.c
@@ -6,8 +6,9 @@ void THNN_(HardShrink_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real lambda)
+          accreal lambda_)
 {
+  real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
   THTensor_(resizeAs)(output, input);
 
   TH_TENSOR_APPLY2(real, output, real, input,
@@ -25,8 +26,9 @@ void THNN_(HardShrink_updateGradInput)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real lambda)
+          accreal lambda_)
 {
+  real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   THTensor_(resizeAs)(gradInput, input);
   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
diff --git a/lib/THNN/generic/HardTanh.c b/lib/THNN/generic/HardTanh.c
index 57ef1be..b38a946 100644
--- a/lib/THNN/generic/HardTanh.c
+++ b/lib/THNN/generic/HardTanh.c
@@ -6,15 +6,17 @@ void THNN_(HardTanh_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real min_val,
-          real max_val,
+          accreal min_val_,
+          accreal max_val_,
           bool inplace)
 {
+  real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+  real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
   if (inplace)
     THTensor_(set)(output, input);
   else
     THTensor_(resizeAs)(output, input);
-  
+
   if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
   {
     if (inplace)
@@ -68,10 +70,13 @@ void THNN_(HardTanh_updateGradInput)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real min_val,
-          real max_val,
+          accreal min_val_,
+          accreal max_val_,
           bool inplace)
 {
+  real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+  real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+
   THNN_CHECK_NELEMENT(input, gradOutput);
   if (inplace)
     THTensor_(set)(gradInput, gradOutput);
diff --git a/lib/THNN/generic/LeakyReLU.c b/lib/THNN/generic/LeakyReLU.c
index a4d9677..074047d 100644
--- a/lib/THNN/generic/LeakyReLU.c
+++ b/lib/THNN/generic/LeakyReLU.c
@@ -6,9 +6,10 @@ void THNN_(LeakyReLU_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real negval,
+          accreal negval_,
           bool inplace)
 {
+  real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
   if (inplace)
   {
     TH_TENSOR_APPLY(real, input,
@@ -31,9 +32,10 @@ void THNN_(LeakyReLU_updateGradInput)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real negval,
+          accreal negval_,
           bool inplace)
 {
+  real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   if (inplace)
   {
diff --git a/lib/THNN/generic/Linear.c b/lib/THNN/generic/Linear.c
index 933bc4b..faef421 100644
--- a/lib/THNN/generic/Linear.c
+++ b/lib/THNN/generic/Linear.c
@@ -87,8 +87,9 @@ void THNN_(Linear_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *addBuffer,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   long dim = THTensor_(nDimension)(input);
   if (dim == 1) {
     THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input);
diff --git a/lib/THNN/generic/LookupTable.c b/lib/THNN/generic/LookupTable.c
index b460f38..46bc2c3 100644
--- a/lib/THNN/generic/LookupTable.c
+++ b/lib/THNN/generic/LookupTable.c
@@ -32,8 +32,9 @@ void THNN_(LookupTable_accGradParameters)(
           THIndexTensor *indices,
           bool scaleGradByFreq,
           int paddingValue,
-          real scale)
+          accreal ascale)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
   ptrdiff_t i;
   THInteger_t *count_data = NULL;
 
@@ -163,9 +164,11 @@ void THNN_(LookupTable_renorm)(
           THNNState *state,
           THIndexTensor *idx,
           THTensor *weight,
-          real maxNorm,
-          real normType)
+          accreal maxNorm_,
+          accreal normType_)
 {
+  real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
+  real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
   if (!THTensor_(isContiguous)(weight))
     THError("weight must be contiguous");
   if (!THIndexTensor_(isContiguous)(idx))
diff --git a/lib/THNN/generic/MarginCriterion.c b/lib/THNN/generic/MarginCriterion.c
index 1675860..d6d9b60 100644
--- a/lib/THNN/generic/MarginCriterion.c
+++ b/lib/THNN/generic/MarginCriterion.c
@@ -8,10 +8,11 @@ void THNN_(MarginCriterion_updateOutput)(
           THTensor *target,
           THTensor *output,
           bool sizeAverage,
-          real margin)
+          accreal margin_)
 {
+  real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
   THNN_CHECK_NELEMENT(input, target);
-  THNN_CHECK_DIM_SIZE(output, 1, 0, 1);  
+  THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
   real sum = 0;
 
   TH_TENSOR_APPLY2(real, input, real, target,
@@ -31,9 +32,10 @@ void THNN_(MarginCriterion_updateGradInput)(
           THTensor *target,
           THTensor *gradInput,
           bool sizeAverage,
-          real margin)
+          accreal margin_)
 {
-  THNN_CHECK_NELEMENT(input, target);  
+  real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+  THNN_CHECK_NELEMENT(input, target);
   real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
 
   THTensor_(resizeAs)(gradInput, input);
diff --git a/lib/THNN/generic/MultiMarginCriterion.c b/lib/THNN/generic/MultiMarginCriterion.c
index af83e89..2f8f8ff 100644
--- a/lib/THNN/generic/MultiMarginCriterion.c
+++ b/lib/THNN/generic/MultiMarginCriterion.c
@@ -11,8 +11,9 @@ void THNN_(MultiMarginCriterion_updateOutput)(
           bool sizeAverage,
           int p,
           THTensor *weights,
-          real margin)
+          accreal margin_)
 {
+  real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
   real *input_data, *weights_data;
   THIndex_t *target_data;
   long nframe, dim;
@@ -90,8 +91,9 @@ void THNN_(MultiMarginCriterion_updateGradInput)(
           bool sizeAverage,
           int p,
           THTensor *weights,
-          real margin)
+          accreal margin_)
 {
+  real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
   real *input_data;
   real *gradInput_data;
   THIndex_t *target_data;
diff --git a/lib/THNN/generic/PReLU.c b/lib/THNN/generic/PReLU.c
index 3d2ebfc..174f514 100644
--- a/lib/THNN/generic/PReLU.c
+++ b/lib/THNN/generic/PReLU.c
@@ -165,8 +165,9 @@ void THNN_(PReLU_accGradParameters)(
           THTensor *gradWeightBuf,
           THTensor *gradWeightBuf2,
           THIndex_t nOutputPlane,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   real *gradWeight_data = THTensor_(data)(gradWeight);
 
diff --git a/lib/THNN/generic/RReLU.c b/lib/THNN/generic/RReLU.c
index cdb9dca..8fd46d3 100644
--- a/lib/THNN/generic/RReLU.c
+++ b/lib/THNN/generic/RReLU.c
@@ -7,12 +7,14 @@ void THNN_(RReLU_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *noise,
-          real lower,
-          real upper,
+          accreal lower_,
+          accreal upper_,
           bool train,
           bool inplace,
           THGenerator *generator)
 {
+  real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+  real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
   if (train)
   {
     // get default random generator
@@ -72,7 +74,7 @@ void THNN_(RReLU_updateOutput)(
         *output_data = *input_data * r;
       );
     }
-  }  
+  }
 }
 
 void THNN_(RReLU_updateGradInput)(
@@ -81,11 +83,13 @@ void THNN_(RReLU_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *noise,
-          real lower,
-          real upper,
+          accreal lower_,
+          accreal upper_,
           bool train,
           bool inplace)
 {
+  real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+  real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
   {
@@ -99,10 +103,10 @@ void THNN_(RReLU_updateGradInput)(
     {
       THTensor_(resizeAs)(gradInput, input);
       THTensor_(cmul)(gradInput, gradOutput, noise);
-    }    
+    }
   }
   else
-  { 
+  {
     // use constant factor for negative input values
     const real negSlope = (lower + upper) / 2;
     if (inplace)
diff --git a/lib/THNN/generic/SoftPlus.c b/lib/THNN/generic/SoftPlus.c
index 7305238..6491e66 100644
--- a/lib/THNN/generic/SoftPlus.c
+++ b/lib/THNN/generic/SoftPlus.c
@@ -6,9 +6,11 @@ void THNN_(SoftPlus_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real beta,
-          real threshold)
+          accreal beta_,
+          accreal threshold_)
 {
+  real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
   THTensor_(resizeAs)(output, input);
 
   // f(x) = 1/beta * log(1 + exp(beta * x))
@@ -23,12 +25,14 @@ void THNN_(SoftPlus_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output,
-          real beta,
-          real threshold)
+          accreal beta_,
+          accreal threshold_)
 {
+  real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   THTensor_(resizeAs)(gradInput, output);
-  
+
   // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
   // SINCE
   // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
diff --git a/lib/THNN/generic/SoftShrink.c b/lib/THNN/generic/SoftShrink.c
index 28dcce0..e779508 100644
--- a/lib/THNN/generic/SoftShrink.c
+++ b/lib/THNN/generic/SoftShrink.c
@@ -6,10 +6,11 @@ void THNN_(SoftShrink_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real lambda)
+          accreal lambda_)
 {
+  real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
   THTensor_(resizeAs)(output, input);
-  
+
   TH_TENSOR_APPLY2(real, output, real, input,
     if ((*input_data) > lambda)
      *output_data = *input_data - lambda;
@@ -25,8 +26,9 @@ void THNN_(SoftShrink_updateGradInput)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real lambda)
+          accreal lambda_)
 {
+  real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   THTensor_(resizeAs)(gradInput, input);
   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
diff --git a/lib/THNN/generic/SparseLinear.c b/lib/THNN/generic/SparseLinear.c
index 807280e..0c52541 100644
--- a/lib/THNN/generic/SparseLinear.c
+++ b/lib/THNN/generic/SparseLinear.c
@@ -167,9 +167,11 @@ void THNN_(SparseLinear_accGradParameters)(
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
-          real weightDecay,
-          real scale)
+          accreal weightDecay_,
+          accreal scale_)
 {
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   long h, i, col, hp0, hp1;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);
@@ -243,9 +245,11 @@ void THNN_(SparseLinear_legacyAccGradParameters)(
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
-          real weightDecay,
-          real scale)
+          accreal weightDecay_,
+          accreal scale_)
 {
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   long h, i;
   long outDim = THTensor_(size)(weight, 0);
   long inDim = THTensor_(size)(weight, 1);
@@ -308,8 +312,9 @@ void THNN_(SparseLinear_updateParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
-          real learningRate)
+          accreal learningRate_)
 {
+  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
   long h, i;
   long outDim = weight->size[0];
   long inDim = weight->size[1];
@@ -381,8 +386,9 @@ void THNN_(SparseLinear_legacyUpdateParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
-          real learningRate)
+          accreal learningRate_)
 {
+  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
   long h, i;
   long outDim = weight->size[0];
   long inDim = weight->size[1];
diff --git a/lib/THNN/generic/SpatialAdaptiveAveragePooling.c b/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
new file mode 100644
index 0000000..3675b42
--- /dev/null
+++ b/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
@@ -0,0 +1,258 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c"
+#else
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c)  (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
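+// For output index a out of b bins over c input elements, START_IND/END_IND
+// give the half-open input range [start, end) covered by that bin; the bins
+// tile the whole input and adjacent bins may overlap by one element when c is
+// not a multiple of b.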
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(
+          real *input_p,
+          real *output_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight,
+          long stridew,
+          long strideh,
+          long strided)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    /* loop over output */
+    long i, j;
+    for(i = 0; i < oheight; i++)
+    {
+      int y_start = START_IND(i, oheight, iheight);
+      int y_end   = END_IND(i, oheight, iheight);
+      int kH = y_end-y_start;
+
+      for(j = 0; j < owidth; j++)
+      {
+
+        int x_start = START_IND(j, owidth, iwidth);
+        int x_end   = END_IND(j, owidth, iwidth);
+        int kW = x_end-x_start;
+
+        /* local pointers */
+        real *ip = input_p   + k*strided + y_start*strideh + x_start*stridew;
+        real *op = output_p  + k*owidth*oheight + i*owidth + j;
+
+        /* compute local average: */
+        real sum = 0;
+        int x,y;
+        for(y = 0; y < kH; y++)
+        {
+          for(x = 0; x < kW; x++)
+          {
+            real val = *(ip + y*strideh + x*stridew);
+            sum += val;
+          }
+        }
+
+        /* set output to local average */
+        *op = sum / kW / kH;
+      }
+    }
+  }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int owidth,
+          int oheight)
+{
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  long nslices;
+  long iheight;
+  long iwidth;
+
+  long istride_d;
+  long istride_h;
+  long istride_w;
+  long istride_b;
+
+  real *input_data;
+  real *output_data;
+
+
+  THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+		"3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+  if (input->nDimension == 4)
+  {
+    istride_b = input->stride[0];
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  /* strides */
+  istride_d = input->stride[dimh-1];
+  istride_h = input->stride[dimh];
+  istride_w = input->stride[dimw];
+
+  /* resize output */
+  if (input->nDimension == 3)
+  {
+    THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+    THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data,
+                                                      nslices,
+                                                      iwidth, iheight,
+                                                      owidth, oheight,
+                                                      istride_w,istride_h,
+                                                      istride_d);
+  }
+  else
+  {
+    long p;
+
+    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+                                                        nslices,
+                                                        iwidth, iheight,
+                                                        owidth, oheight,
+                                                        istride_w,istride_h,
+                                                        istride_d);
+    }
+  }
+}
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(
+          real *gradInput_p,
+          real *gradOutput_p,
+          long nslices,
+          long iwidth,
+          long iheight,
+          long owidth,
+          long oheight)
+{
+  long k;
+#pragma omp parallel for private(k)
+  for (k = 0; k < nslices; k++)
+  {
+    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+
+    /* calculate average */
+    long i, j;
+    for(i = 0; i < oheight; i++)
+    {
+      int y_start = START_IND(i, oheight, iheight);
+      int y_end   = END_IND(i, oheight, iheight);
+      int kH = y_end-y_start;
+
+      for(j = 0; j < owidth; j++)
+      {
+
+        int x_start = START_IND(j, owidth, iwidth);
+        int x_end   = END_IND(j, owidth, iwidth);
+        int kW = x_end-x_start;
+
+        int x,y;
+        for(y = y_start; y < y_end; y++)
+        {
+          for(x = x_start; x < x_end; x++)
+          {
+            /* update gradient */
+            gradInput_p_k[y*iwidth + x] += gradOutput_p_k[i*owidth + j] / kW / kH;
+          }
+        }
+      }
+    }
+  }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput)
+{
+  int dimw = 2;
+  int dimh = 1;
+  long nbatch = 1;
+  int nslices;
+  int iheight;
+  int iwidth;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 4) {
+    nbatch = input->size[0];
+    dimw++;
+    dimh++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimh-1];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  /* backprop */
+  if (input->nDimension == 3)
+  {
+    THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+                                                         nslices,
+                                                         iwidth, iheight,
+                                                         owidth, oheight);
+  }
+  else
+  {
+    long p;
+#pragma omp parallel for private(p)
+    for (p = 0; p < nbatch; p++)
+    {
+      THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+                                                           nslices,
+                                                           iwidth, iheight,
+                                                           owidth, oheight);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
+
+#undef START_IND
+#undef END_IND
\ No newline at end of file
diff --git a/lib/THNN/generic/SpatialConvolutionLocal.c b/lib/THNN/generic/SpatialConvolutionLocal.c
index efba30e..06b57f3 100644
--- a/lib/THNN/generic/SpatialConvolutionLocal.c
+++ b/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -4,8 +4,8 @@
 
 static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
 	THTensor *input, THTensor *gradOutput,
-	THTensor *weight, THTensor *bias, 
-	int kH, int kW, int dH, 
+	THTensor *weight, THTensor *bias,
+	int kH, int kW, int dH,
 	int dW, int padH, int padW,
 	long inputHeight, long inputWidth,
 	long outputHeight, long outputWidth) {
@@ -39,7 +39,7 @@ static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
   }
 
   THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
-  
+
   if (gradOutput != NULL) {
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
@@ -56,8 +56,8 @@ static int THNN_(view_weight_local)(THTensor **_weight)
     long s1 = weight->size[0] * weight->size[1];
     long s2 = weight->size[2];
     long s3 = weight->size[3] * weight->size[4] * weight->size[5];
-    *_weight = THTensor_(newWithStorage3d)(weight->storage, 
-					   weight->storageOffset, 
+    *_weight = THTensor_(newWithStorage3d)(weight->storage,
+					   weight->storageOffset,
 					   s1, -1, s2, -1, s3, -1);
     return 1;
   }
@@ -75,8 +75,8 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
   long i;
   THTensor *output3d, *finput3d;
 
-  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, 
-		       nInputPlane, inputWidth, inputHeight, 
+  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+		       nInputPlane, inputWidth, inputHeight,
 		       outputWidth, outputHeight);
 
   THTensor_(copy)(output, bias);
@@ -86,7 +86,7 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
      outputHeight * outputWidth, 1,
      nOutputPlane, outputHeight * outputWidth,
      1, nOutputPlane * outputHeight * outputWidth);
-  
+
   finput3d = THTensor_(newWithStorage3d)
     (finput->storage, finput->storageOffset,
      outputHeight * outputWidth, 1,
@@ -94,10 +94,10 @@ static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
      1, kW * kH * nInputPlane * outputHeight * outputWidth);
 
   // weight:    oH*oW x nOutputPlane x nInputPlane*kH*kW
-  // finput3d:  oH*oW x nInputPlane*kH*kW x 1  
+  // finput3d:  oH*oW x nInputPlane*kH*kW x 1
   THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
   // output3d:  oH*oW x nOutputPlane x 1
-  
+
   THTensor_(free)(output3d);
   THTensor_(free)(finput3d);
 }
@@ -120,10 +120,10 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
 
   THNN_(SpatialConvolutionLocal_shapeCheck)
     (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
-     inputHeight, inputWidth, outputHeight, outputWidth);					    
+     inputHeight, inputWidth, outputHeight, outputWidth);
 
   input = THTensor_(newContiguous)(input);
-  
+
   long nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH);
   long nOutputPlane = THTensor_(size)(weight, 1);
 
@@ -174,7 +174,7 @@ void THNN_(SpatialConvolutionLocal_updateOutput)(
 static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
      (THTensor *gradInput, THTensor *gradOutput,
       THTensor *weight, THTensor *fgradInput,
-      int kW, int kH, int dW, int dH, int padW, int padH, 
+      int kW, int kH, int dW, int dH, int padW, int padH,
       long nInputPlane, long inputWidth, long inputHeight,
       long nOutputPlane, long outputWidth, long outputHeight)
 {
@@ -188,17 +188,17 @@ static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
                                              kW*kH*nInputPlane, outputHeight*outputWidth,
                                              1, kW*kH*nInputPlane*outputHeight*outputWidth);
   // weight:        oH*oW x nInputPlane*kH*kW x nOutputPlane
-  // gradOutput3d:  oH*oW x nOutputPlane x 1         
+  // gradOutput3d:  oH*oW x nOutputPlane x 1
   THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
-  // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1  
-  
+  // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1
+
   THTensor_(free)(gradOutput3d);
   THTensor_(free)(fgradInput3d);
-  
+
   THTensor_(zero)(gradInput);
-  
-  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, 
-		      nInputPlane, inputWidth, inputHeight, 
+
+  THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
+		      nInputPlane, inputWidth, inputHeight,
 		      outputWidth, outputHeight);
 
 }
@@ -235,8 +235,8 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
   if(input->nDimension == 3)
   {
     THNN_(SpatialConvolutionLocal_updateGradInput_frame)
-      (gradInput, gradOutput, weight, 
-       fgradInput, kW, kH, dW, dH, padW, padH, 
+      (gradInput, gradOutput, weight,
+       fgradInput, kW, kH, dW, dH, padW, padH,
        nInputPlane, inputWidth, inputHeight,
        nOutputPlane, outputWidth, outputHeight);
   }
@@ -253,8 +253,8 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
       THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
 
       THNN_(SpatialConvolutionLocal_updateGradInput_frame)
-	(gradInput_t, gradOutput_t, weight, fgradInput_t, 
-	 kW, kH, dW, dH, padW, padH, 
+	(gradInput_t, gradOutput_t, weight, fgradInput_t,
+	 kW, kH, dW, dH, padW, padH,
 	 nInputPlane, inputWidth, inputHeight,
 	 nOutputPlane, outputWidth, outputHeight);
 
@@ -275,12 +275,12 @@ void THNN_(SpatialConvolutionLocal_updateGradInput)(
 
 static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
      (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
-      THTensor *finput, real scale, 
-      int kW, int kH, int dW, int dH, int padW, int padH, 
+      THTensor *finput, real scale,
+      int kW, int kH, int dW, int dH, int padW, int padH,
       long nInputPlane, long inputWidth, long inputHeight,
       long nOutputPlane, long outputWidth, long outputHeight)
 {
-   
+
   THTensor *gradOutput3d, *finput3d;
   gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
                                              outputHeight*outputWidth, 1,
@@ -290,7 +290,7 @@ static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
                                          outputHeight*outputWidth, 1,
                                          1, kW*kH*nInputPlane*outputHeight*outputWidth,
                                          kW*kH*nInputPlane, outputHeight*outputWidth);
-  // gradOutput3d:  oH*oW x nOutputPlane x 1  
+  // gradOutput3d:  oH*oW x nOutputPlane x 1
   // finput3d:      oH*oW x 1 x kW*kH*nInputPlane
   THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
   // gradWeight:    oH*oW x nOutputPlane x kW*kH*nInputPlane
@@ -314,9 +314,9 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
     int padW, int padH,
     long inputWidth, long inputHeight,
     long outputWidth, long outputHeight,
-    real scale)
+    accreal scale_)
 {
-
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   int freeWeight = THNN_(view_weight_local)(&gradWeight);
 
   THNN_(SpatialConvolutionLocal_shapeCheck)
@@ -332,7 +332,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
   if(input->nDimension == 3)
   {
     THNN_(SpatialConvolutionLocal_accGradParameters_frame)
-      (gradOutput, gradWeight, gradBias, finput, scale, 
+      (gradOutput, gradWeight, gradBias, finput, scale,
        kW, kH, dW, dH, padW, padH,
        nInputPlane, inputWidth, inputHeight,
        nOutputPlane, outputWidth, outputHeight);
@@ -348,7 +348,7 @@ void THNN_(SpatialConvolutionLocal_accGradParameters)(
       THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
 
       THNN_(SpatialConvolutionLocal_accGradParameters_frame)
-	(gradOutput_t, gradWeight, gradBias, finput_t, scale, 
+	(gradOutput_t, gradWeight, gradBias, finput_t, scale,
 	 kW, kH, dW, dH, padW, padH,
 	 nInputPlane, inputWidth, inputHeight,
 	 nOutputPlane, outputWidth, outputHeight);
diff --git a/lib/THNN/generic/SpatialConvolutionMM.c b/lib/THNN/generic/SpatialConvolutionMM.c
index 83635c1..c9c22bc 100644
--- a/lib/THNN/generic/SpatialConvolutionMM.c
+++ b/lib/THNN/generic/SpatialConvolutionMM.c
@@ -4,7 +4,7 @@
 
 static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
 	THTensor *input, THTensor *gradOutput,
-	THTensor *weight, THTensor *bias, 
+	THTensor *weight, THTensor *bias,
 	int kH, int kW, int dH, int dW, int padH, int padW) {
 
   THArgCheck(kW > 0 && kH > 0, 9,
@@ -45,7 +45,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
 	    nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
 
   THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
-  
+
   if (gradOutput != NULL) {
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
@@ -336,8 +336,9 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
           int dH,
           int padW,
           int padH,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   int freeWeight = 0;
 
   if (gradWeight->nDimension == 4) {
diff --git a/lib/THNN/generic/SpatialConvolutionMap.c b/lib/THNN/generic/SpatialConvolutionMap.c
index 82886c2..750b212 100644
--- a/lib/THNN/generic/SpatialConvolutionMap.c
+++ b/lib/THNN/generic/SpatialConvolutionMap.c
@@ -175,10 +175,18 @@ void THNN_(SpatialConvolutionMap_updateGradInput)(
 }
 
 void THNN_(SpatialConvolutionMap_accGradParameters)(
-  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
-  THTensor *connTable, int nInputPlane, int nOutputPlane,
-  int dW, int dH, real scale)
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *connTable,
+          int nInputPlane,
+          int nOutputPlane,
+          int dW, int dH,
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THArgCheck(
     gradWeight != NULL && gradWeight->nDimension == 3
     && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
diff --git a/lib/THNN/generic/SpatialDilatedConvolution.c b/lib/THNN/generic/SpatialDilatedConvolution.c
index 8b18910..d345f7a 100644
--- a/lib/THNN/generic/SpatialDilatedConvolution.c
+++ b/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -289,8 +289,9 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
     int dW, int dH,
     int padW, int padH,
     int dilationW, int dilationH,
-    real scale)
+    accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_(SpatialDilatedConvolution_shapeCheck)
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
      dilationH, dilationW);
diff --git a/lib/THNN/generic/SpatialFullConvolution.c b/lib/THNN/generic/SpatialFullConvolution.c
index 4adcca6..e2a835d 100644
--- a/lib/THNN/generic/SpatialFullConvolution.c
+++ b/lib/THNN/generic/SpatialFullConvolution.c
@@ -59,7 +59,7 @@ static void THNN_(col2im)(const real* data_col, const int channels,
 
 static inline void THNN_(SpatialFullConvolution_shapeCheck)(
 	THTensor *input, THTensor *gradOutput,
-	THTensor *weight, THTensor *bias, 
+	THTensor *weight, THTensor *bias,
 	int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) {
 
   THArgCheck(kW > 0 && kH > 0, 9,
@@ -103,7 +103,7 @@ static inline void THNN_(SpatialFullConvolution_shapeCheck)(
 	    nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
 
   THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
-  
+
   if (gradOutput != NULL) {
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
     THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
@@ -342,8 +342,9 @@ void THNN_(SpatialFullConvolution_accGradParameters)(
     int dW, int dH,
     int padW, int padH,
     int adjW, int adjH,
-    real scale)
+    accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_(SpatialFullConvolution_shapeCheck)
     (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW);
 
diff --git a/lib/THNN/generic/SpatialFullConvolutionMap.c b/lib/THNN/generic/SpatialFullConvolutionMap.c
index 1bd3455..e98dea0 100644
--- a/lib/THNN/generic/SpatialFullConvolutionMap.c
+++ b/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -147,10 +147,18 @@ void THNN_(SpatialFullConvolutionMap_updateGradInput)(
 }
 
 void THNN_(SpatialFullConvolutionMap_accGradParameters)(
-  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
-  THTensor *connTable, int nInputPlane, int nOutputPlane,
-  int dW, int dH, real scale)
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *gradWeight,
+  THTensor *gradBias,
+  THTensor *connTable,
+  int nInputPlane,
+  int nOutputPlane,
+  int dW, int dH,
+  accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THArgCheck(
     gradWeight != NULL && gradWeight->nDimension == 3
     && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
diff --git a/lib/THNN/generic/SpatialSubSampling.c b/lib/THNN/generic/SpatialSubSampling.c
index 3674f2c..3f01540 100644
--- a/lib/THNN/generic/SpatialSubSampling.c
+++ b/lib/THNN/generic/SpatialSubSampling.c
@@ -40,7 +40,7 @@ void THNN_(SpatialSubSampling_updateOutput)(
     int kW, int kH,
     int dW, int dH)
 {
-  
+
   real *weight_data = THTensor_(data)(weight);
   real *bias_data = THTensor_(data)(bias);
   real *output_data;
@@ -76,11 +76,11 @@ void THNN_(SpatialSubSampling_updateOutput)(
     THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
   else
     THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
-  
+
   input = THTensor_(newContiguous)(input);
   input_data = THTensor_(data)(input);
   output_data = THTensor_(data)(output);
-  
+
 #pragma omp parallel for private(k)
   for(k = 0; k < nInputPlane; k++)
   {
@@ -97,7 +97,7 @@ void THNN_(SpatialSubSampling_updateOutput)(
       long i;
       for(i = 0; i < outputWidth*outputHeight; i++)
         ptr_output[i] = z;
-      
+
       for(yy = 0; yy < outputHeight; yy++)
       {
         for(xx = 0; xx < outputWidth; xx++)
@@ -214,8 +214,9 @@ void THNN_(SpatialSubSampling_accGradParameters)(
     THTensor *gradBias,
     int kW, int kH,
     int dW, int dH,
-    real scale)
+    accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);
 
   long nbatch = 1;
diff --git a/lib/THNN/generic/Sqrt.c b/lib/THNN/generic/Sqrt.c
index 24cd51a..174884e 100644
--- a/lib/THNN/generic/Sqrt.c
+++ b/lib/THNN/generic/Sqrt.c
@@ -6,8 +6,9 @@ void THNN_(Sqrt_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real eps)
+          accreal eps_)
 {
+  real eps = TH_CONVERT_ACCREAL_TO_REAL(eps_);
   THTensor_(resizeAs)(output, input);
   THTensor_(sqrt)(output, input);
 }
@@ -22,8 +23,8 @@ void THNN_(Sqrt_updateGradInput)(
   THNN_CHECK_SHAPE(output, gradOutput);
   THTensor_(resizeAs)(gradInput, input);
 
-  if (output->nDimension == 1 || 
-      !THTensor_(isContiguous)(output) || 
+  if (output->nDimension == 1 ||
+      !THTensor_(isContiguous)(output) ||
       !THTensor_(isContiguous)(gradOutput) ||
       !THTensor_(isContiguous)(gradInput))
   {
diff --git a/lib/THNN/generic/THNN.h b/lib/THNN/generic/THNN.h
index 4420962..9515abb 100644
--- a/lib/THNN/generic/THNN.h
+++ b/lib/THNN/generic/THNN.h
@@ -78,7 +78,7 @@ TH_API void THNN_(ELU_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *output,            // [OUT] ELU output
-          real alpha,                  // an ELU parameter (as in paper)
+          accreal alpha,               // an ELU parameter (as in paper)
           bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
 TH_API void THNN_(ELU_updateGradInput)(
           THNNState *state,            // library's state
@@ -86,7 +86,7 @@ TH_API void THNN_(ELU_updateGradInput)(
           THTensor *gradOutput,        // gradient w.r.t. output
           THTensor *gradInput,         // [OUT] gradient w.r.t. input
           THTensor *output,            // output from a forward pass
-          real alpha,                  // an ELU parameter (as in paper)
+          accreal alpha,               // an ELU parameter (as in paper)
           bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
 
 TH_API void THNN_(DistKLDivCriterion_updateOutput)(
@@ -119,30 +119,30 @@ TH_API void THNN_(HardShrink_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *output,            // [OUT] output tensor
-          real lambda);                // HardShrink parameter
+          accreal lambda);             // HardShrink parameter
 TH_API void THNN_(HardShrink_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *gradOutput,        // gradient w.r.t. module's output
           THTensor *gradInput,         // [OUT] gradient w.r.t. input
-          real lambda);                // HardShrink parameter
+          accreal lambda);             // HardShrink parameter
 
 // HardTanh clamps the values to the interval [min_val; max_val].
 TH_API void THNN_(HardTanh_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *output,            // [OUT] output tensor
-          real min_val,                // lower threshold
-          real max_val,
-          bool inplace);               // upper threshold
+          accreal min_val,             // lower threshold
+          accreal max_val,             // upper threshold
+          bool inplace);
 TH_API void THNN_(HardTanh_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *gradOutput,        // gradient w.r.t. module's output
           THTensor *gradInput,         // [OUT] gradient w.r.t. the input
-          real min_val,                // lower threshold
-          real max_val,
-          bool inplace);               // upper threshold
+          accreal min_val,             // lower threshold
+          accreal max_val,             // upper threshold
+          bool inplace);
 
 TH_API void THNN_(L1Cost_updateOutput)(
           THNNState *state,            // library's state
@@ -158,14 +158,14 @@ TH_API void THNN_(LeakyReLU_updateOutput)(
           THNNState *state,            // library's state
           THTensor *input,             // [MODIFIED] input tensor
           THTensor *output,            // [OUT] output tensor
-          real negval,                 // negative part slope
+          accreal negval,              // negative part slope
           bool inplace);               // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
 TH_API void THNN_(LeakyReLU_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *gradOutput,        // [MODIFIED] gradient w.r.t. module's output
           THTensor *gradInput,         // [OUT] gradient w.r.t. the input
-          real negval,                 // negative part slope
+          accreal negval,              // negative part slope
           bool inplace);               // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
 
 TH_API void THNN_(LogSigmoid_updateOutput)(
@@ -201,14 +201,14 @@ TH_API void THNN_(LookupTable_accGradParameters)(
           THIndexTensor *indices,      // [OPTIONAL]
           bool scaleGradByFreq,
           int paddingValue,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(LookupTable_renorm)(
           THNNState *state,            // library's state
-          THIndexTensor *idx,          // vector that contains row indices (modified in function)
+          THIndexTensor *idx,          // vector containing row indices (modified in function)
           THTensor *weight,            // 2D tensor whose rows will be renormalized
-          real maxNorm,                // maximum norm
-          real normType);              // the norm type (e.g., normType=2, then it's 2-norm)
+          accreal maxNorm,             // maximum norm
+          accreal normType);           // the norm type (e.g., normType=2, then it's 2-norm)
 
 TH_API void THNN_(MarginCriterion_updateOutput)(
           THNNState *state,            // library's state
@@ -216,14 +216,15 @@ TH_API void THNN_(MarginCriterion_updateOutput)(
           THTensor *target,            // target tensor (should contain only 1s and -1s)
           THTensor *output,            // [OUT] a one-element tensor containing the loss
           bool sizeAverage,            // if true, the loss is normalized by **total number of elements**
-          real margin);                // a margin that is required for the loss to be 0
+          accreal margin);             // a margin that is required for the loss to be 0
+
 TH_API void THNN_(MarginCriterion_updateGradInput)(
           THNNState *state,            // library's state
           THTensor *input,             // input tensor
           THTensor *target,            // target tensor (should contain only 1s and -1s)
           THTensor *gradInput,         // [OUT] gradient w.r.t. module's input
           bool sizeAverage,            // if true, the gradient is normalized by **total number of elements**
-          real margin);                // a margin that is required for the loss to be 0
+          accreal margin);             // a margin that is required for the loss to be 0
 
 TH_API void THNN_(SoftMarginCriterion_updateOutput)(
           THNNState *state,
@@ -275,7 +276,7 @@ TH_API void THNN_(MultiMarginCriterion_updateOutput)(
           bool sizeAverage,
           int p,
           THTensor* weights,      // [OPTIONAL]
-          real margin);
+          accreal margin);
 TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
           THNNState *state,
           THTensor *input,
@@ -284,7 +285,7 @@ TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
           bool sizeAverage,
           int p,
           THTensor *weights,      // [OPTIONAL]
-          real margin);
+          accreal margin);
 
 TH_API void THNN_(PReLU_updateOutput)(
           THNNState *state,
@@ -309,7 +310,7 @@ TH_API void THNN_(PReLU_accGradParameters)(
           THTensor *gradWeightBuf,
           THTensor *gradWeightBuf2,
           THIndex_t nOutputPlane,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(Linear_updateOutput)(
           THNNState *state,
@@ -334,15 +335,15 @@ TH_API void THNN_(Linear_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *addBuffer,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(RReLU_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *noise,
-          real lower,
-          real upper,
+          accreal lower,
+          accreal upper,
           bool train,
           bool inplace,
           THGenerator *generator);
@@ -352,8 +353,8 @@ TH_API void THNN_(RReLU_updateGradInput)(
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *noise,
-          real lower,
-          real upper,
+          accreal lower,
+          accreal upper,
           bool train,
           bool inplace);
 
@@ -396,28 +397,28 @@ TH_API void THNN_(SoftPlus_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real beta,
-          real threshold);
+          accreal beta,
+          accreal threshold);
 TH_API void THNN_(SoftPlus_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
           THTensor *output,
-          real beta,
-          real threshold);
+          accreal beta,
+          accreal threshold);
 
 TH_API void THNN_(SoftShrink_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real lambda);
+          accreal lambda);
 TH_API void THNN_(SoftShrink_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real lambda);
+          accreal lambda);
 
 TH_API void THNN_(SparseLinear_updateOutput)(
           THNNState *state,
@@ -433,8 +434,8 @@ TH_API void THNN_(SparseLinear_accGradParameters)(
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
-          real weightDecay,
-          real scale);
+          accreal weightDecay,
+          accreal scale);
 TH_API void THNN_(SparseLinear_zeroGradParameters)(
           THNNState *state,
           THTensor *gradWeight,
@@ -447,7 +448,7 @@ TH_API void THNN_(SparseLinear_updateParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
-          real learningRate);
+          accreal learningRate);
 TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
           THNNState *state,
           THTensor *input,
@@ -462,8 +463,8 @@ TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
           THTensor *gradBias,
           THTensor *weight,
           THTensor *bias,
-          real weightDecay,
-          real scale);
+          accreal weightDecay,
+          accreal scale);
 TH_API void THNN_(SparseLinear_legacyZeroGradParameters)(
           THNNState *state,
           THTensor *gradWeight,
@@ -476,13 +477,13 @@ TH_API void THNN_(SparseLinear_legacyUpdateParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           THTensor *lastInput,
-          real learningRate);
+          accreal learningRate);
 
 TH_API void THNN_(Sqrt_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real eps);
+          accreal eps);
 TH_API void THNN_(Sqrt_updateGradInput)(
           THNNState *state,
           THTensor *input,
@@ -515,16 +516,16 @@ TH_API void THNN_(Threshold_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real threshold,
-          real val,
+          accreal threshold,
+          accreal val,
           bool inplace);
 TH_API void THNN_(Threshold_updateGradInput)(
           THNNState *state,
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real threshold,
-          real val,
+          accreal threshold,
+          accreal val,
           bool inplace);
 
 TH_API void THNN_(TemporalConvolution_updateOutput)(
@@ -550,7 +551,7 @@ TH_API void THNN_(TemporalConvolution_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           int kW, int dW,
-          real scale);
+          accreal scale);
 TH_API void THNN_(TemporalMaxPooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -586,7 +587,7 @@ TH_API void THNN_(TemporalSubSampling_accGradParameters)(
           THTensor *gradWeight,
           THTensor *gradBias,
           int kW, int dW,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(TemporalRowConvolution_updateOutput)(
           THNNState *state,
@@ -624,7 +625,7 @@ TH_API void THNN_(TemporalRowConvolution_accGradParameters)(
           int dW,
           int padW,
           bool featFirst,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(BatchNormalization_updateOutput)(
           THNNState *state,
@@ -686,7 +687,7 @@ TH_API void THNN_(SpatialConvolutionMap_accGradParameters)(
           int nInputPlane,        // number of input planes
           int nOutputPlane,       // number of output planes
           int dW, int dH,         // stride
-          real scale);            // scaling factor
+          accreal scale);         // scaling factor
 
 TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
           THNNState *state,
@@ -721,7 +722,7 @@ TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
           THNNState *state,
@@ -762,7 +763,7 @@ TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
           int padW, int padH,
           long inputWidth, long inputHeight,
           long outputWidth, long outputHeight,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
           THNNState *state,
@@ -777,6 +778,17 @@ TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
           THTensor *gradInput,
           THIndexTensor *indices);
 
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput);
+
 TH_API void THNN_(SpatialAveragePooling_updateOutput)(
           THNNState *state,
           THTensor *input,
@@ -849,7 +861,7 @@ TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
           int dW, int dH,
           int padW, int padH,
           int adjW, int adjH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)(
           THNNState *state,       // library state
@@ -882,7 +894,7 @@ TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
           int nInputPlane,        // number of input planes
           int nOutputPlane,       // number of output planes
           int dW, int dH,         // stride
-          real scale);            // scaling factor
+          accreal scale);         // scaling factor
 
 TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
           THNNState *state,
@@ -921,7 +933,7 @@ TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
           int dW, int dH,
           int padW, int padH,
           int dilationW, int dilationH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(SpatialMaxPooling_updateOutput)(
           THNNState *state,
@@ -1003,7 +1015,7 @@ TH_API void THNN_(SpatialSubSampling_accGradParameters)(
           THTensor *gradBias,
           int kW, int kH,
           int dW, int dH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
           THNNState *state,
@@ -1072,7 +1084,7 @@ TH_API void THNN_(VolumetricConvolution_updateOutput)(
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
+          THTensor *bias,           // [OPTIONAL]
           THTensor *finput,
           THTensor *fgradInput,
           int dT, int dW, int dH,
@@ -1091,19 +1103,19 @@ TH_API void THNN_(VolumetricConvolution_accGradParameters)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
-          THTensor *gradBias,
+          THTensor *gradBias,       // [OPTIONAL]
           THTensor *finput,
           THTensor *fgradInput,
           int dT, int dW, int dH,
           int pT, int pW, int pH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
+          THTensor *bias,           // [OPTIONAL]
           THTensor *finput,
           int kT, int kW, int kH,
           int dT, int dW, int dH,
@@ -1124,19 +1136,36 @@ TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
-          THTensor *gradBias,
+          THTensor *gradBias,       // [OPTIONAL]
           THTensor *finput,
           int kT, int kW, int kH,
           int dT, int dW, int dH,
           int pT, int pW, int pH,
-          real scale);
+          accreal scale);
+
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          int outputT, int outputW, int outputH,
+          int poolSizeT, int poolSizeW, int poolSizeH,
+          THIndexTensor *indices,
+          THTensor *randomSamples);
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          int outputT, int outputW, int outputH,
+          int poolSizeT, int poolSizeW, int poolSizeH,
+          THIndexTensor *indices);
 
 TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
           THNNState *state,         // library state
           THTensor *input,          // 4D or 5D (batch) tensor
           THTensor *output,         // [OUT] volumetric convolution output
           THTensor *weight,         // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
-          THTensor *bias,           // gradBias tensor (nOutputPlane)
+          THTensor *bias,           // [OPTIONAL] gradBias tensor (nOutputPlane)
           THTensor *finput,         // [OUT] internal columns buffer
           THTensor *fgradInput,     // [OUT] internal ones buffer
           int dT, int dW, int dH,   // stride of the convolution
@@ -1158,20 +1187,20 @@ TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
           THTensor *input,          // 4D or 5D (batch) tensor
           THTensor *gradOutput,     // gradient w.r.t. output
           THTensor *gradWeight,     // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
-          THTensor *gradBias,       // gradBias tensor (nOutputPlane)
+          THTensor *gradBias,       // [OPTIONAL] gradBias tensor (nOutputPlane)
           THTensor *finput,         // internal columns buffer
           THTensor *fgradInput,     // internal ones buffer
           int dT, int dW, int dH,   // stride
           int pT, int pW, int pH,   // padding
           int aT, int aW, int aH,   // extra output adjustment
-          real scale);              // scaling factor
+          accreal scale);           // scaling factor
 
 TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
           THTensor *weight,
-          THTensor *bias,
+          THTensor *bias,           // [OPTIONAL]
           THTensor *columns,
           THTensor *ones,
           int kT, int kW, int kH,
@@ -1196,14 +1225,14 @@ TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradWeight,
-          THTensor *gradBias,
+          THTensor *gradBias,       // [OPTIONAL]
           THTensor *columns,
           THTensor *ones,
           int kT, int kW, int kH,
           int dT, int dW, int dH,
           int padT, int padW, int padH,
           int dilationT, int dilationW, int dilationH,
-          real scale);
+          accreal scale);
 
 TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
           THNNState *state,
@@ -1311,5 +1340,4 @@ TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
           int pleft, int pright,
           int ptop, int pbottom,
           int pfront, int pback);
-
 #endif
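
Every accGradParameters entry point in this header now receives its scaling
factor as accreal rather than real, and every bias/gradBias argument gained an
[OPTIONAL] marker; the C implementations below narrow the scale back to real on
entry. A minimal standalone sketch of that narrowing idiom, using float/double
as stand-ins for the generated real/accreal types (the library itself uses the
TH_CONVERT_ACCREAL_TO_REAL macro rather than a plain cast):

    #include <stdio.h>

    typedef float  real;      /* stand-in: bound per type by THGenerateFloatTypes.h */
    typedef double accreal;   /* stand-in: the wider accumulation type */

    /* scale crosses the API as accreal and is narrowed on entry, mirroring
       what the updated accGradParameters implementations do */
    static void acc_bias_sketch(real *gradBias, const real *gradOutput,
                                long n, accreal scale_)
    {
      real scale = (real)scale_;   /* TH_CONVERT_ACCREAL_TO_REAL(scale_) */
      long k;
      for (k = 0; k < n; k++)
        *gradBias += scale * gradOutput[k];
    }

    int main(void)
    {
      real gradBias = 0.f;
      real gradOutput[4] = {1.f, 2.f, 3.f, 4.f};
      acc_bias_sketch(&gradBias, gradOutput, 4, 0.5);
      printf("gradBias = %f\n", gradBias);   /* 5.0 */
      return 0;
    }
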
diff --git a/lib/THNN/generic/TemporalConvolution.c b/lib/THNN/generic/TemporalConvolution.c
index 14297ad..a107da2 100644
--- a/lib/THNN/generic/TemporalConvolution.c
+++ b/lib/THNN/generic/TemporalConvolution.c
@@ -48,11 +48,11 @@ void THNN_(TemporalConvolution_updateOutput)(
   THTensor *outputWindow, *inputWindow;
   int nInputFrame, nOutputFrame;
   long k, i;
-  
+
   int dimS = 0; // sequence dimension
   int dimF = 1; // feature dimension
-  
-  if (input->nDimension == 3) 
+
+  if (input->nDimension == 3)
   {
     dimS = 1;
     dimF = 2;
@@ -93,7 +93,7 @@ void THNN_(TemporalConvolution_updateOutput)(
                               nFrame, inputFrameStride*input->size[1],
                               kW*input->size[1], 1);
 
-      THTensor_(setStorage2d)(outputWindow, output->storage, 
+      THTensor_(setStorage2d)(outputWindow, output->storage,
                               output->storageOffset + k*output->size[1],
                               nFrame, outputFrameStride*output->size[1],
                               output->size[1], 1);
@@ -108,18 +108,18 @@ void THNN_(TemporalConvolution_updateOutput)(
     THTensor *outputSample = THTensor_(new)();
     THTensor *inputSample = THTensor_(new)();
     int nBatchFrame = input->size[0];
-    
+
     THTensor_(resize3d)(output,
                         nBatchFrame,
                         nOutputFrame,
                         outputFrameSize);
-    
+
     for(i = 0; i < nBatchFrame; i++)
     {
       THTensor_(select)(outputSample, output, 0, i);
       THTensor_(select)(inputSample, input, 0, i);
       long nOutputSampleFrame = nOutputFrame;
-      
+
       /* bias first */
       for(k = 0; k < nOutputFrame; k++)
       {
@@ -140,7 +140,7 @@ void THNN_(TemporalConvolution_updateOutput)(
                                 nFrame, inputFrameStride*inputSample->size[1],
                                 kW*inputSample->size[1], 1);
 
-        THTensor_(setStorage2d)(outputWindow, outputSample->storage, 
+        THTensor_(setStorage2d)(outputWindow, outputSample->storage,
                                 outputSample->storageOffset + k*outputSample->size[1],
                                 nFrame, outputFrameStride*outputSample->size[1],
                                 outputSample->size[1], 1);
@@ -175,11 +175,11 @@ void THNN_(TemporalConvolution_updateGradInput)(
   THTensor *gradOutputWindow;
   THTensor *gradInputWindow;
   long k, i;
-  
+
   int dimS = 0; // sequence dimension
   int dimF = 1; // feature dimension
-  
-  if (gradOutput->nDimension == 3) 
+
+  if (gradOutput->nDimension == 3)
   {
     dimS = 1;
     dimF = 2;
@@ -227,13 +227,13 @@ void THNN_(TemporalConvolution_updateGradInput)(
     THTensor *gradOutputSample = THTensor_(new)();
     THTensor *gradInputSample = THTensor_(new)();
     int nBatchFrame = input->size[0];
-    
+
     for(i = 0; i < nBatchFrame; i++)
     {
       THTensor_(select)(gradOutputSample, gradOutput, 0, i);
       THTensor_(select)(gradInputSample, gradInput, 0, i);
       int nOutputSampleFrame = nOutputFrame;
-      
+
       /* ouch */
       for(k = 0; nOutputSampleFrame > 0; k++)
       {
@@ -274,19 +274,20 @@ void THNN_(TemporalConvolution_accGradParameters)(
           THTensor *gradBias,
           int kW,
           int dW,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   long nInputFrame;
   long nOutputFrame;
 
   THTensor *gradOutputWindow;
   THTensor *inputWindow;
   long k, i;
-  
+
   int dimS = 0; // sequence dimension
   int dimF = 1; // feature dimension
-  
-  if (gradOutput->nDimension == 3) 
+
+  if (gradOutput->nDimension == 3)
   {
     dimS = 1;
     dimF = 2;
@@ -301,7 +302,7 @@ void THNN_(TemporalConvolution_accGradParameters)(
   gradOutput = THTensor_(newContiguous)(gradOutput);
   gradOutputWindow = THTensor_(new)();
   inputWindow = THTensor_(new)();
-  
+
   if (input->nDimension == 2)
   {
     /* bias first */
@@ -324,7 +325,7 @@ void THNN_(TemporalConvolution_accGradParameters)(
                               nFrame, inputFrameStride*input->size[1],
                               kW*input->size[1], 1);
 
-      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, 
+      THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
                               gradOutput->storageOffset + k*gradOutput->size[1],
                               nFrame, outputFrameStride*gradOutput->size[1],
                               gradOutput->size[1], 1);
@@ -339,13 +340,13 @@ void THNN_(TemporalConvolution_accGradParameters)(
     THTensor *gradOutputSample = THTensor_(new)();
     THTensor *inputSample = THTensor_(new)();
     int nBatchFrame = input->size[0];
-    
+
     for(i = 0; i < nBatchFrame; i++)
     {
       THTensor_(select)(gradOutputSample, gradOutput, 0, i);
       THTensor_(select)(inputSample, input, 0, i);
       int nOutputSampleFrame = nOutputFrame;
-      
+
       /* bias first */
       for(k = 0; k < nOutputFrame; k++)
       {
@@ -366,7 +367,7 @@ void THNN_(TemporalConvolution_accGradParameters)(
                                 nFrame, inputFrameStride*inputSample->size[1],
                                 kW*inputSample->size[1], 1);
 
-        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, 
+        THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
                                 gradOutputSample->storageOffset + k*gradOutputSample->size[1],
                                 nFrame, outputFrameStride*gradOutputSample->size[1],
                                 gradOutputSample->size[1], 1);
diff --git a/lib/THNN/generic/TemporalRowConvolution.c b/lib/THNN/generic/TemporalRowConvolution.c
index 9e62939..b1cd173 100644
--- a/lib/THNN/generic/TemporalRowConvolution.c
+++ b/lib/THNN/generic/TemporalRowConvolution.c
@@ -81,11 +81,13 @@ static void THNN_(unfolded_acc_row)(
 
 			ix = (long long)(kw);
 			if (dW == 1) {
-				THVector_(add)(dst + (size_t)(ix), src, 1, nOutputFrame);
+			  real *dst_slice = dst + (size_t)(ix);
+			  THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
 			} else {
 				for (x = 0; x < nOutputFrame; x++) {
-					THVector_(add)(dst + (size_t)(ix + x * dW),
-					               src + (size_t)(x), 1, 1);
+				  real *dst_slice = dst + (size_t)(ix + x * dW);
+				  THVector_(cadd)(dst_slice, dst_slice,
+						  src + (size_t)(x), 1, 1);
 				}
 			}
 		}
@@ -410,8 +412,9 @@ void THNN_(TemporalRowConvolution_accGradParameters)(
 	int dW,
 	int padW,
 	bool featFirst,
-	real scale) {
+	accreal scale_) {
 
+    real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
 	int ndim = input->nDimension;
 
 	THTensor *tinput, *tgradOutput;
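
Here, and again in the VolumetricConvolutionMM.c and unfold.c hunks further
down, the old THVector_(add) accumulation calls are rewritten as THVector_(cadd)
with the destination slice passed as both the result and the first operand,
i.e. an explicit in-place accumulate. A standalone sketch of that pattern with
plain floats, assuming the (res, a, b, scale, n) argument order visible in the
call sites above:

    #include <stdio.h>

    /* res[i] = a[i] + c * b[i]; passing the destination as both res and a
       (as the updated call sites do) turns it into dst += c * src */
    static void vector_cadd(float *res, const float *a, const float *b,
                            float c, long n)
    {
      long i;
      for (i = 0; i < n; i++)
        res[i] = a[i] + c * b[i];
    }

    int main(void)
    {
      float dst[3] = {1.f, 1.f, 1.f};
      float src[3] = {2.f, 3.f, 4.f};
      vector_cadd(dst, dst, src, 1.f, 3);             /* dst += src */
      printf("%g %g %g\n", dst[0], dst[1], dst[2]);   /* 3 4 5 */
      return 0;
    }
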
diff --git a/lib/THNN/generic/TemporalSubSampling.c b/lib/THNN/generic/TemporalSubSampling.c
index bfc7d30..8728d14 100644
--- a/lib/THNN/generic/TemporalSubSampling.c
+++ b/lib/THNN/generic/TemporalSubSampling.c
@@ -51,7 +51,7 @@ void THNN_(TemporalSubSampling_updateOutput)(
   THTensor *outputFrame, *inputWindow;
   int nInputFrame, nOutputFrame;
   long k;
-  
+
   THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
 
   outputFrame = THTensor_(new)();
@@ -63,7 +63,7 @@ void THNN_(TemporalSubSampling_updateOutput)(
   THTensor_(resize2d)(output,
                       nOutputFrame,
                       inputFrameSize);
-  
+
   for(k = 0; k < nOutputFrame; k++)
   {
     THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
@@ -124,8 +124,9 @@ void THNN_(TemporalSubSampling_accGradParameters)(
           THTensor *gradBias,
           int kW,
           int dW,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THTensor *gradOutputFrame;
   THTensor *inputWindow, *buffer;
   long k;
diff --git a/lib/THNN/generic/Threshold.c b/lib/THNN/generic/Threshold.c
index dd2a698..949c7a0 100644
--- a/lib/THNN/generic/Threshold.c
+++ b/lib/THNN/generic/Threshold.c
@@ -6,10 +6,12 @@ void THNN_(Threshold_updateOutput)(
           THNNState *state,
           THTensor *input,
           THTensor *output,
-          real threshold,
-          real val,
+          accreal threshold_,
+          accreal val_,
           bool inplace)
 {
+  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+  real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
   if (inplace)
   {
     TH_TENSOR_APPLY(real, input,
@@ -32,10 +34,12 @@ void THNN_(Threshold_updateGradInput)(
           THTensor *input,
           THTensor *gradOutput,
           THTensor *gradInput,
-          real threshold,
-          real val,
+          accreal threshold_,
+          accreal val_,
           bool inplace)
 {
+  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+  real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
   THNN_CHECK_NELEMENT(input, gradOutput);
   if (inplace)
   {
diff --git a/lib/THNN/generic/VolumetricConvolution.c b/lib/THNN/generic/VolumetricConvolution.c
index 4fd8ac3..be1aa82 100644
--- a/lib/THNN/generic/VolumetricConvolution.c
+++ b/lib/THNN/generic/VolumetricConvolution.c
@@ -50,10 +50,14 @@ void THNN_(VolumetricConvolution_updateOutput)(
     THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
 
     /* add bias */
-    for (i = 0; i < bias->size[0]; i++)
-    {
-      THTensor_(select)(outn, output, 0, i);
-      THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+    if (bias) {
+      for (i = 0; i < bias->size[0]; i++)
+      {
+        THTensor_(select)(outn, output, 0, i);
+        THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+      }
+    } else {
+      THTensor_(zero)(output);
     }
 
     /* do convolutions */
@@ -73,10 +77,14 @@ void THNN_(VolumetricConvolution_updateOutput)(
       THTensor_(select)(outb, output, 0, j);
 
       /* add bias */
-      for (i = 0; i < bias->size[0]; i++)
-      {
-        THTensor_(select)(outn, outb, 0, i);
-        THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+      if (bias) {
+        for (i = 0; i < bias->size[0]; i++)
+        {
+          THTensor_(select)(outn, outb, 0, i);
+          THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+        }
+      } else {
+        THTensor_(zero)(outb);
       }
 
       /* do convolutions */
@@ -170,8 +178,9 @@ void THNN_(VolumetricConvolution_accGradParameters)(
           int pT,
           int pW,
           int pH,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THArgCheck(pT != 0 || pW != 0 || pH != 0, 9, "padding not supported by CPU backend");   // sharing signature with CUDA version
 
   THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
@@ -179,10 +188,11 @@ void THNN_(VolumetricConvolution_accGradParameters)(
 		"expected for gradWeight, but got: %s");
 
   int nOutputPlane = (int)gradWeight->size[0];
-
-  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
-    "gradBias tensor has wrong size"
-  );
+  if (gradBias) {
+    THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+      "gradBias tensor has wrong size"
+    );
+  }
 
   long k;
   real *gradBias_data;
@@ -200,14 +210,16 @@ void THNN_(VolumetricConvolution_accGradParameters)(
   if (gradOutput->nDimension == 4) /* non-batch mode */
   {
     /* gradient to bias */
-    gradBias_data = THTensor_(data)(gradBias);
-    gradOutSlice = THTensor_(new)();
-    for (k = 0; k < nOutputPlane; k++)
-    {
-      THTensor_(select)(gradOutSlice, gradOutput, 0, k);
-      gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+    if (gradBias) {
+      gradBias_data = THTensor_(data)(gradBias);
+      gradOutSlice = THTensor_(new)();
+      for (k = 0; k < nOutputPlane; k++)
+      {
+        THTensor_(select)(gradOutSlice, gradOutput, 0, k);
+        gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+      }
+      THTensor_(free)(gradOutSlice);
     }
-    THTensor_(free)(gradOutSlice);
 
     /* gradient to kernels */
     THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
@@ -226,14 +238,16 @@ void THNN_(VolumetricConvolution_accGradParameters)(
       THTensor_(select)(goutb, gradOutput, 0, j);
 
       /* gradient to bias */
-      gradBias_data = THTensor_(data)(gradBias);
-      gradOutSlice = THTensor_(new)();
-      for (k = 0; k < nOutputPlane; k++)
-      {
-        THTensor_(select)(gradOutSlice, goutb, 0, k);
-        gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+      if (gradBias) {
+        gradBias_data = THTensor_(data)(gradBias);
+        gradOutSlice = THTensor_(new)();
+        for (k = 0; k < nOutputPlane; k++)
+        {
+          THTensor_(select)(gradOutSlice, goutb, 0, k);
+          gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+        }
+        THTensor_(free)(gradOutSlice);
       }
-      THTensor_(free)(gradOutSlice);
 
       /* gradient to kernels */
       THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
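
Both the batch and non-batch branches added above follow the same convention
for a missing bias: when bias is NULL (which is what nn's :noBias() produces at
the Lua level), the output is zeroed instead of being pre-filled plane by plane
with bias values, and the gradBias accumulation is skipped entirely. A
plain-array sketch of the pre-fill branch, assuming contiguous row-major planes
of planeSize elements each:

    #include <string.h>

    /* fill each output plane with its bias value, or zero the whole buffer
       when the module was built without a bias */
    static void prefill_output(float *output, const float *bias,
                               long nOutputPlane, long planeSize)
    {
      if (bias) {
        long i, j;
        for (i = 0; i < nOutputPlane; i++)
          for (j = 0; j < planeSize; j++)
            output[i * planeSize + j] = bias[i];
      } else {
        memset(output, 0, sizeof(float) * (size_t)(nOutputPlane * planeSize));
      }
    }

    int main(void)
    {
      float out[6];
      float bias[2] = {0.5f, -1.f};
      prefill_output(out, bias, 2, 3);   /* planes filled with 0.5 and -1 */
      prefill_output(out, NULL, 2, 3);   /* no bias: everything zeroed */
      return 0;
    }
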
diff --git a/lib/THNN/generic/VolumetricConvolutionMM.c b/lib/THNN/generic/VolumetricConvolutionMM.c
index 4085e2b..4aaaa95 100644
--- a/lib/THNN/generic/VolumetricConvolutionMM.c
+++ b/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -152,7 +152,8 @@ static void THNN_(unfolded_acc_vol)(
                   }
                   else
                   {
-                    THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+                    real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+                    THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
                   }
                 }
               }
@@ -169,7 +170,8 @@ static void THNN_(unfolded_acc_vol)(
                 for(x = 0; x < outputWidth; x++)
                 {
                   ix = x*dW + kw;
-                  THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+                  real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+                  THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
                 }
               }
             }
@@ -300,13 +302,17 @@ static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
     outputDepth*outputHeight*outputWidth, -1
   );
 
-  for (i = 0; i < nOutputPlane; i++)
-  {
-    THVector_(fill)(
-      output->storage->data+output->storageOffset+output->stride[0]*i,
-      THTensor_(get1d)(bias, i),
-      outputDepth*outputHeight*outputWidth
-    );
+  if (bias) {
+      for (i = 0; i < nOutputPlane; i++)
+      {
+        THVector_(fill)(
+          output->storage->data+output->storageOffset+output->stride[0]*i,
+          THTensor_(get1d)(bias, i),
+          outputDepth*outputHeight*outputWidth
+        );
+      }
+  } else {
+    THTensor_(zero)(output);
   }
 
   THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
@@ -551,15 +557,17 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
   THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
   THTensor_(transpose)(finput, finput, 0, 1);
 
-  for (i = 0; i < gradBias->size[0]; i++)
-  {
-    long k;
-    real sum = 0;
-    real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
-    for (k = 0; k < gradOutput2d->size[1]; k++)
-      sum += data[k];
+  if (gradBias) {
+    for (i = 0; i < gradBias->size[0]; i++)
+    {
+      long k;
+      real sum = 0;
+      real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+      for (k = 0; k < gradOutput2d->size[1]; k++)
+        sum += data[k];
 
-    (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+      (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+    }
   }
 
   THTensor_(free)(gradOutput2d);
@@ -575,8 +583,9 @@ void THNN_(VolumetricConvolutionMM_accGradParameters)(
           int kT, int kW, int kH,
           int dT, int dW, int dH,
           int pT, int pW, int pH,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   int freeWeight;
   int nOutputPlane = (int)gradWeight->size[0];
 
diff --git a/lib/THNN/generic/VolumetricDilatedConvolution.c b/lib/THNN/generic/VolumetricDilatedConvolution.c
index d2d5c88..e31ff2b 100644
--- a/lib/THNN/generic/VolumetricDilatedConvolution.c
+++ b/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -299,8 +299,9 @@ void THNN_(VolumetricDilatedConvolution_accGradParameters)(
           int dT, int dW, int dH,
           int padT, int padW, int padH,
           int dilationT, int dilationW, int dilationH,
-          real scale)
+          accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   THNN_(VolumetricDilatedConvolution_shapeCheck)(
         input, gradOutput, gradWeight, gradBias,
         kT, kH, kW, dT, dH, dW, padT, padH, padW,
diff --git a/lib/THNN/generic/VolumetricFractionalMaxPooling.c b/lib/THNN/generic/VolumetricFractionalMaxPooling.c
new file mode 100644
index 0000000..236986b
--- /dev/null
+++ b/lib/THNN/generic/VolumetricFractionalMaxPooling.c
@@ -0,0 +1,279 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c"
+#else
+
+static long* THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+  real sample,
+  long inputSize,
+  long outputSize,
+  int poolSize) {
+  real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+  long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
+
+  long i;
+  for (i = 0; i < outputSize - 1; ++i) {
+    sequence[i] =
+      (long) ((i + sample) * alpha) - (long) (sample * alpha);
+  }
+  sequence[outputSize - 1] = inputSize - poolSize;
+
+  return sequence;
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+  real* input,
+  real* output,
+  THIndex_t* indices,
+  real* randomSamples,
+  long numPlanes,
+  long inputT, long inputW, long inputH,
+  long outputT, long outputW, long outputH,
+  int poolSizeT, int poolSizeW, int poolSizeH) {
+  long plane;
+#pragma omp parallel for private(plane)
+  for (plane = 0; plane < numPlanes; ++plane) {
+    /* each plane contains 3 random samples, one for T, one for W, and one for H */
+    real* randomSamplesForPlane = randomSamples + plane * 3;
+
+    /* Generate interval sequence */
+    long* sequenceT =
+      THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+        randomSamplesForPlane[0], inputT, outputT, poolSizeT);
+    long* sequenceW =
+      THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+        randomSamplesForPlane[1], inputW, outputW, poolSizeW);
+    long* sequenceH =
+      THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+        randomSamplesForPlane[2], inputH, outputH, poolSizeH);
+
+    /* loop over output */
+    long h, w, t;
+
+    real* inputForPlane = input + plane * inputT * inputW * inputH;
+    real* outputForPlane = output + plane * outputT * outputW * outputH;
+    THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+    for (h = 0; h < outputH; ++h) {
+      long inputHStart = sequenceH[h];
+
+      for (w = 0; w < outputW; ++w) {
+        long inputWStart = sequenceW[w];
+
+        for (t = 0; t < outputT; ++t) {
+          long inputTStart = sequenceT[t];
+
+          real maxVal = -THInf;
+          long maxIndex = -1;
+
+          long h2, w2, t2;
+          for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+            for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+              for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) {
+                THAssert(h2 >= 0 && h2 < inputH);
+                THAssert(w2 >= 0 && w2 < inputW);
+                THAssert(t2 >= 0 && t2 < inputT);
+
+                long planeIndex = h2 * inputW * inputT + w2 * inputT + t2;
+                real val = inputForPlane[planeIndex];
+                if (val > maxVal) {
+                  maxVal = val;
+                  maxIndex = planeIndex;
+                }
+              }
+            }
+          }
+
+          THAssert(maxVal != -THInf);
+          THAssert(maxIndex != -1);
+
+          outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal;
+          /* +1 to lua index */
+          indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE;
+        }
+      }
+    }
+
+    THFree(sequenceT);
+    THFree(sequenceW);
+    THFree(sequenceH);
+  }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    int outputT, int outputW, int outputH,
+    int poolSizeT, int poolSizeW, int poolSizeH,
+    THIndexTensor *indices,
+    THTensor *randomSamples) {
+
+  long numBatch = 1;
+  int planeDim = 0;
+  int heightDim = 1;
+  int widthDim = 2;
+  int timeDim = 3;
+
+  long numInputDims = THTensor_(nDimension)(input);
+  THNN_ARGCHECK(numInputDims == 4 || numInputDims == 5, 2, input,
+		"4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  if (numInputDims == 5) {
+    numBatch = THTensor_(size)(input, 0);
+    planeDim++;
+    heightDim++;
+    widthDim++;
+    timeDim++;
+  }
+
+  /* sizes */
+  long numPlanes = THTensor_(size)(input, planeDim);
+  long inputH = THTensor_(size)(input, heightDim);
+  long inputW = THTensor_(size)(input, widthDim);
+  long inputT = THTensor_(size)(input, timeDim);
+
+  THArgCheck(outputH + poolSizeH - 1 < inputH, 9,
+             "poolSizeH (%d) too large relative to input height (%d)",
+	     poolSizeH, inputH);
+  THArgCheck(outputW + poolSizeW - 1 < inputW, 8,
+             "poolSizeW (%d) too large relative to input width (%d)",
+	     poolSizeW, inputW);
+  THArgCheck(outputT + poolSizeT - 1 < inputT, 7,
+             "poolSizeT (%d) too large relative to input time (%d)",
+	     poolSizeT, inputT);
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  if (numInputDims == 4) {
+    /* resize output */
+    THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT);
+    /* indices will contain the locations for each output point */
+    THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT);
+
+    THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+      THTensor_(data)(input),
+      THTensor_(data)(output),
+      THIndexTensor_(data)(indices),
+      THTensor_(data)(randomSamples),
+      numPlanes, inputT, inputW, inputH,
+      outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+  } else {
+    THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT);
+    /* indices will contain the locations for each output point */
+    THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT);
+
+    long batch;
+#pragma omp parallel for private(batch)
+    for (batch = 0; batch < numBatch; ++batch) {
+      THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+        THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT,
+        THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT,
+        THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+        THTensor_(data)(randomSamples) + batch * numPlanes * 3,
+        numPlanes, inputT, inputW, inputH,
+        outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+  real* gradInput,
+  real* gradOutput,
+  THIndex_t* indices,
+  long numPlanes,
+  long inputT, long inputW, long inputH,
+  long outputT, long outputW, long outputH) {
+  long plane;
+#pragma omp parallel for private(plane)
+  for (plane = 0; plane < numPlanes; plane++) {
+    real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH;
+    real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH;
+    THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+    long h, w, t;
+    for (h = 0; h < outputH; ++h) {
+      for (w = 0; w < outputW; ++w) {
+        for (t = 0; t < outputT; ++t) {
+          long outputIndex = h * outputW * outputT + w * outputT + t;
+          long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+          THAssert(index >= 0 && index < inputT * inputW * inputH);
+
+          gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+        }
+      }
+    }
+  }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    int outputT, int outputW, int outputH,
+    int poolSizeT, int poolSizeW, int poolSizeH,
+    THIndexTensor *indices) {
+
+  long numBatch = 1;
+  int planeDim = 0;
+  int heightDim = 1;
+  int widthDim = 2;
+  int timeDim = 3;
+
+  long numInputDims = THTensor_(nDimension)(input);
+  if (numInputDims == 5) {
+    numBatch = THTensor_(size)(input, 0);
+    planeDim = 1;
+    heightDim++;
+    widthDim++;
+    timeDim++;
+  }
+
+  /* sizes */
+  long numPlanes = THTensor_(size)(input, planeDim);
+  long inputH = THTensor_(size)(input, heightDim);
+  long inputW = THTensor_(size)(input, widthDim);
+  long inputT = THTensor_(size)(input, timeDim);
+
+  THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3,
+             "gradOutput time unexpected");
+  THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+             "gradOutput width unexpected");
+  THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+             "gradOutput height unexpected");
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  /* backprop */
+  if (numInputDims == 4) {
+    THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+      THTensor_(data)(gradInput),
+      THTensor_(data)(gradOutput),
+      THIndexTensor_(data)(indices),
+      numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+  } else {
+    long batch;
+#pragma omp parallel for private(batch)
+    for (batch = 0; batch < numBatch; ++batch) {
+      THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+        THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT,
+        THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT,
+        THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+        numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
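
The core of the new file is the interval generator: for every plane, each of
the three pooled dimensions turns one uniform random sample into a
non-decreasing sequence of pooling-window start offsets, with the final window
pinned to inputSize - poolSize so the last output cell always covers the end of
the input. A standalone sketch of that computation, using plain C types in
place of the generated real/THIndex_t types but the same arithmetic as above:

    #include <stdio.h>
    #include <stdlib.h>

    /* start offsets of outputSize pooling windows of width poolSize over an
       input of length inputSize, driven by one sample in [0, 1) */
    static long *generate_intervals(double sample, long inputSize,
                                    long outputSize, int poolSize)
    {
      double alpha = (double)(inputSize - poolSize) / (double)(outputSize - 1);
      long *sequence = malloc(sizeof(long) * outputSize);
      long i;
      for (i = 0; i < outputSize - 1; ++i)
        sequence[i] = (long)((i + sample) * alpha) - (long)(sample * alpha);
      sequence[outputSize - 1] = inputSize - poolSize;   /* pin the last window */
      return sequence;
    }

    int main(void)
    {
      long i, *seq = generate_intervals(0.3, 16, 7, 2);
      for (i = 0; i < 7; i++)
        printf("%ld ", seq[i]);   /* e.g. 0 3 5 7 10 12 14 */
      printf("\n");
      free(seq);
      return 0;
    }
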
diff --git a/lib/THNN/generic/VolumetricFullConvolution.c b/lib/THNN/generic/VolumetricFullConvolution.c
index b6ef1cd..62d0d74 100644
--- a/lib/THNN/generic/VolumetricFullConvolution.c
+++ b/lib/THNN/generic/VolumetricFullConvolution.c
@@ -255,15 +255,17 @@ void THNN_(VolumetricFullConvolution_updateOutput)(
     const long k_ = 1;
 
     // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THBlas_(gemm)(
-      't', 'n',
-      n_, m_, k_,
-      1,
-      THTensor_(data)(ones), k_,
-      THTensor_(data)(bias), k_,
-      1,
-      THTensor_(data)(output_n), n_
-    );
+	if (bias) {
+      THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        1,
+        THTensor_(data)(output_n), n_
+      );
+    }
   }
 
   // Free
@@ -402,8 +404,9 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
   int dT, int dW, int dH,   // stride
   int pT, int pW, int pH,   // padding
   int aT, int aW, int aH,   // extra output adjustment
-  real scale)
+  accreal scale_)
 {
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
   // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor
   THNN_(VolumetricFullConvolution_shapeCheck)(
         input, gradOutput, gradWeight, gradBias,
@@ -498,15 +501,17 @@ void THNN_(VolumetricFullConvolution_accGradParameters)(
     const long k_ = outputDepth * outputHeight * outputWidth;
 
     // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    THBlas_(gemv)(
-      't',
-      k_, m_,
-      scale,
-      THTensor_(data)(gradOutput_n), k_,
-      THTensor_(data)(ones), 1,
-      1,
-      THTensor_(data)(gradBias), 1
-    );
+    if (gradBias) {
+      THBlas_(gemv)(
+        't',
+        k_, m_,
+        scale,
+        THTensor_(data)(gradOutput_n), k_,
+        THTensor_(data)(ones), 1,
+        1,
+        THTensor_(data)(gradBias), 1
+      );
+    }
   }
 
   // Free
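
Both guarded BLAS calls above use a vector of ones as one operand: the forward
gemm broadcasts the bias across every output location, and the gradBias gemv
sums each k_-long output plane of gradOutput_n and accumulates scale times that
sum into gradBias. A plain-loop sketch of the gradBias reduction, assuming the
m_ x k_ row-major layout implied by the call (m_ output planes, k_ elements per
plane):

    #include <stdio.h>

    /* gradBias[m] += scale * sum_k gradOutput[m*k_ + k]; the same reduction
       the gemv('t', ...) call performs against the ones vector */
    static void acc_grad_bias(float *gradBias, const float *gradOutput,
                              long m_, long k_, float scale)
    {
      long m, k;
      for (m = 0; m < m_; m++) {
        float sum = 0.f;
        for (k = 0; k < k_; k++)
          sum += gradOutput[m * k_ + k];
        gradBias[m] += scale * sum;
      }
    }

    int main(void)
    {
      float gradBias[2] = {0.f, 0.f};
      float gradOutput[6] = {1.f, 1.f, 1.f, 2.f, 2.f, 2.f};
      acc_grad_bias(gradBias, gradOutput, 2, 3, 0.5f);
      printf("%g %g\n", gradBias[0], gradBias[1]);   /* 1.5 3 */
      return 0;
    }
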
diff --git a/lib/THNN/generic/unfold.c b/lib/THNN/generic/unfold.c
index 25146c0..e718320 100644
--- a/lib/THNN/generic/unfold.c
+++ b/lib/THNN/generic/unfold.c
@@ -52,14 +52,17 @@ void THNN_(unfolded_acc)(
                  ix = (long long)(0 - padW + kw);
                  lpad = fmaxf(0,(int)(padW-kw));
                  rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
-                 THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+                 real *dst_slice = dst+(size_t)(iy*inputWidth+ix+lpad);
+                 THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
               }
               else{
                 for (x=0; x<outputWidth; x++){
                    ix = (long long)(x*dW - padW + kw);
                    if (ix < 0 || ix >= inputWidth){
-                   }else
-                     THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1);
+                   }else{
+                     real *dst_slice = dst+(size_t)(iy*inputWidth+ix);
+                     THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+x), 1, 1);
+                   }
                 }
               }
             }
@@ -68,11 +71,14 @@ void THNN_(unfolded_acc)(
           for(y = 0; y < outputHeight; y++) {
             iy = (long long)(y*dH + kh);
             ix = (long long)(0 + kw);
-            if (dW == 1 )
-               THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
-            else{
-              for(x = 0; x < outputWidth; x++)
-                THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1);
+            if (dW == 1 ) {
+               real *dst_slice = dst+(size_t)(iy*inputWidth+ix);
+               THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
+            }else{
+              for(x = 0; x < outputWidth; x++) {
+                real *dst_slice = dst+(size_t)(iy*inputWidth+ix+x*dW);
+                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)(y*outputWidth+x), 1, 1);
+              }
             }
           }
         }
diff --git a/lib/THNN/init.c b/lib/THNN/init.c
index 990775d..8aae97b 100644
--- a/lib/THNN/init.c
+++ b/lib/THNN/init.c
@@ -200,6 +200,9 @@
 #include "generic/SpatialAdaptiveMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/SpatialAdaptiveAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/SpatialAveragePooling.c"
 #include "THGenerateFloatTypes.h"
 
@@ -245,6 +248,9 @@
 #include "generic/VolumetricDilatedMaxPooling.c"
 #include "THGenerateFloatTypes.h"
 
+#include "generic/VolumetricFractionalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
 #include "generic/VolumetricMaxUnpooling.c"
 #include "THGenerateFloatTypes.h"
 
diff --git a/test.lua b/test.lua
old mode 100644
new mode 100755
index e5c92ab..ab1183d
--- a/test.lua
+++ b/test.lua
@@ -37,7 +37,6 @@ for test_name, component in pairs(tostringTestModules) do
     end
 end
 
-
 function nntest.Add()
    local inj_vals = {math.random(3,5), 1}  -- Also test the inj = 1 spatial case
    local ini = math.random(3,5)
@@ -121,6 +120,55 @@ function nntest.Bottle()
    mytester:eq(gradOutput1, gradOutput2, 0.0001, 'Bottle gradOutput not the same as Module')
 end
 
+function nntest.WeightNorm()
+   local input = torch.rand(10, 5)
+
+   -- temporal convolution
+   local model = nn.WeightNorm(nn.TemporalConvolution(5, 20, 2, 1))
+   local err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.bias, model.gradBias)
+   mytester:assert(err < precision, 'Temporal Convolution bias')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.g, model.gradG)
+   mytester:assert(err < precision, 'Temporal Convolution g')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.v, model.gradV)
+   mytester:assert(err < precision, 'Temporal Convolution v')
+
+    -- linear
+   model = nn.WeightNorm(nn.Linear(5, 20))
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.bias, model.gradBias)
+   mytester:assert(err < precision, 'Linear bias')
+   err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+   mytester:assert(err < precision, 'Linear g')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.v, model.gradV)
+   mytester:assert(err < precision, 'Linear v')
+
+   -- euclidean with weight but no bias
+   input = torch.rand(10, 5)
+   model = nn.WeightNorm(nn.Euclidean(5, 20))
+   err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+   mytester:assert(err < precision, 'Euclidean g')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                    model.v, model.gradV)
+   mytester:assert(err < precision, 'Euclidean v')
+
+   -- spatial convolution with 4D weights
+   input = torch.rand(5, 10, 10)
+   model = nn.WeightNorm(nn.SpatialConvolution(5, 20, 2, 2, 3, 3, 1, 1), 2)
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.bias, model.gradBias)
+   mytester:assert(err < precision, 'Spatial Convolution bias')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.g, model.gradG)
+   mytester:assert(err < precision, 'Spatial Convolution g')
+   err = nn.Jacobian.testJacobianParameters(model, input,
+                                                model.v, model.gradV)
+   mytester:assert(err < precision, 'Spatial Convolution v')
+end
+
 function nntest.CAdd()
    local function testBackwardPass(module, input, params, dparams)
       local err = jac.testJacobian(module,input)
@@ -448,6 +496,22 @@ function nntest.CMul()
    mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
 end
 
+function nntest.Contiguous()
+   local module = nn.Contiguous()
+
+   -- Contiguous input
+   local input = torch.rand(30,20,10)
+   local output = module:forward(input)
+
+   mytester:assert(output:ne(input):sum() == 0, 'output not equal to input')
+
+   -- Make input non-contiguous
+   local input2 = output:transpose(1,2)
+   local output2 = module:forward(input2)
+
+   mytester:assert(output2:ne(input2):sum() == 0, 'output not equal to input')
+end
+
 function nntest.Dropout()
    local p = 0.2 --prob of droping out a neuron
    local input = torch.Tensor(1000):fill((1-p))
@@ -2074,23 +2138,39 @@ function nntest.SpatialClassNLLCriterion()
 end
 
 function nntest.MultiLabelSoftMarginCriterion()
-    local cri = nn.MultiLabelSoftMarginCriterion()
+   -- test w/o weights
 
-    -- stochastic
-    local numLabels = math.random(5, 10)
-    local input = torch.randn(numLabels)
-    local target = torch.round(torch.rand(numLabels))
+   local cri = nn.MultiLabelSoftMarginCriterion()
 
-    criterionJacobianTest(cri, input, target)
+   -- stochastic
+   local numLabels = math.random(5, 10)
+   local input = torch.randn(numLabels)
+   local target = torch.round(torch.rand(numLabels))
+   criterionJacobianTest(cri, input, target)
 
-    -- batch
-    local numLabels = math.random(5, 10)
-    local bsz = math.random(3, 7)
-    local input = torch.randn(bsz, numLabels)
-    local target = torch.round(torch.rand(bsz, numLabels))
+   -- batch
+   local numLabels = math.random(5, 10)
+   local bsz = math.random(3, 7)
+   local input = torch.randn(bsz, numLabels)
+   local target = torch.round(torch.rand(bsz, numLabels))
+   criterionJacobianTest(cri, input, target)
+
+   -- test weights
 
-    criterionJacobianTest(cri, input, target)
+   local numLabels = math.random(5, 10)
+   local weights = torch.randn(numLabels)
+   local cri = nn.MultiLabelSoftMarginCriterion(weights)
 
+   -- stochastic
+   local input = torch.randn(numLabels)
+   local target = torch.round(torch.rand(numLabels))
+   criterionJacobianTest(cri, input, target)
+
+   -- batch
+   local bsz = math.random(3, 7)
+   local input = torch.randn(bsz, numLabels)
+   local target = torch.round(torch.rand(bsz, numLabels))
+   criterionJacobianTest(cri, input, target)
 end
 
 function nntest.CrossEntropyCriterion()
@@ -4023,6 +4103,62 @@ function nntest.SpatialAdaptiveMaxPooling()
 
 end
 
+function nntest.SpatialAdaptiveAveragePooling()
+   local from = math.random(1,5)
+   local ki = math.random(1,5)
+   local kj = math.random(1,5)
+   local ini = math.random(1,16)
+   local inj = math.random(1,16)
+
+   local module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+   local input = torch.rand(from,ini,inj)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+   mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+   -- batch
+   local nbatch = math.random(1,3)
+   input = torch.rand(nbatch,from,ini,inj)
+   module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+
+   local err = jac.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+   local ferr, berr = jac.testIO(module, input)
+   mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+   mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+
+   -- non-contiguous
+
+   input = torch.rand(from,ini,inj):transpose(2,3)
+   module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+   local inputc = input:contiguous() -- contiguous
+   local output = module:forward(input):clone()
+   local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+   local gradInput = module:backward(input, output):clone()
+   local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+
+   -- non-contiguous batch
+   local nbatch = math.random(1,3)
+   input = torch.rand(nbatch,from,ini,inj):transpose(1,3):transpose(2,4)
+   local inputc = input:contiguous() -- contiguous
+   module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+
+   local output = module:forward(input):clone()
+   local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+   local gradInput = module:backward(input, output):clone()
+   local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+
+end
+
 function nntest.SpatialLPPooling()
    local fanin = math.random(1,4)
    local osizex = math.random(1,4)
@@ -4508,18 +4644,30 @@ function nntest.VolumetricFullConvolution()
 
     local input = torch.Tensor(bs, from, int, ini, inj):zero()
 
-    local err = jac.testJacobian(module, input)
-    mytester:assertlt(err, precision, 'error on state ')
+    local function jacTests(module)
+      local err = jac.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error on state ')
 
-    local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-    mytester:assertlt(err , precision, 'error on weight ')
+      local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+      mytester:assertlt(err , precision, 'error on weight ')
 
-    local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
-    mytester:assertlt(err , precision, 'error on bias ')
+      if module.bias then
+        local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+        mytester:assertlt(err , precision, 'error on bias ')
+      end
 
-    local ferr, berr = jac.testIO(module, input)
-    mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
-    mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+      local ferr, berr = jac.testIO(module, input)
+      mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+      mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+    end
+
+    jacTests(module)
+    module:noBias()
+    jacTests(module)
+    module.bias = torch.Tensor(module.nOutputPlane):zero()
+    module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+    module:reset()
+    jacTests(module)
 end
 
 function nntest.VolumetricFullConvolutionDualInput()
@@ -4592,34 +4740,50 @@ function nntest.VolumetricConvolution()
    local module = nn.VolumetricConvolution(from, to, kt, ki, kj, st, si, sj, padT, padW, padH)
    local input = torch.Tensor(from, int, inj, ini):zero()
 
-   local err = jac.testJacobian(module, input)
-   mytester:assertlt(err, precision, 'error on state ')
+   local function jacTests(module)
+     local err = jac.testJacobian(module, input)
+     mytester:assertlt(err, precision, 'error on state ')
 
-   local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
-   mytester:assertlt(err , precision, 'error on weight ')
+     local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+     mytester:assertlt(err , precision, 'error on weight ')
 
-   local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
-   mytester:assertlt(err , precision, 'error on bias ')
+     if module.bias then
+       local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+       mytester:assertlt(err , precision, 'error on bias ')
+     end
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.weight)
-   mytester:assertlt(err , precision, 'error on weight [direct update] ')
+     local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+     mytester:assertlt(err , precision, 'error on weight [direct update] ')
 
-   local err = jac.testJacobianUpdateParameters(module, input, module.bias)
-   mytester:assertlt(err , precision, 'error on bias [direct update] ')
+     if module.bias then
+       local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+       mytester:assertlt(err , precision, 'error on bias [direct update] ')
+     end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on weight [%s]', t))
-   end
+     for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+        mytester:assertlt(err, precision, string.format(
+                           'error on weight [%s]', t))
+     end
 
-   for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
-      mytester:assertlt(err, precision, string.format(
-                         'error on bias [%s]', t))
+     if module.bias then
+       for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+         mytester:assertlt(err, precision, string.format(
+                            'error on bias [%s]', t))
+       end
+     end
+
+     local ferr, berr = jac.testIO(module, input)
+     mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+     mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
    end
 
-   local ferr, berr = jac.testIO(module, input)
-   mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
-   mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+   jacTests(module)
+   module:noBias()
+   jacTests(module)
+   module.bias = torch.Tensor(module.nOutputPlane):zero()
+   module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+   module:reset()
+   jacTests(module)
 end
 
 function nntest.VolumetricDilatedConvolution()
@@ -4878,6 +5042,92 @@ function nntest.VolumetricDilatedMaxPooling()
   end
 end
 
+function nntest.VolumetricFractionalMaxPooling()
+   local batch = math.random(1, 3)
+   local plane = math.random(1, 3)
+   local outT = math.random(1, 7)
+   local outW = math.random(1, 7)
+   local outH = math.random(1, 7)
+   local poolSizeT = math.random(2, 4)
+   local poolSizeW = math.random(2, 4)
+   local poolSizeH = math.random(2, 4)
+
+   local minInT = outT + poolSizeT
+   local minInW = outW + poolSizeW
+   local minInH = outH + poolSizeH
+
+   local inT = math.random(minInT, minInT + 6)
+   local inW = math.random(minInW, minInW + 6)
+   local inH = math.random(minInH, minInH + 6)
+
+   -- fix the pooling regions so they aren't regenerated with every
+   -- forward(), so testJacobian can work properly
+   local module =
+      nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, outT, outW, outH)
+      :fixPoolingRegions()
+   local input = nil
+   if batch == 1 then
+      input = torch.Tensor(plane, inH, inW, inT):zero()
+   else
+      input = torch.Tensor(batch, plane, inH, inW, inT):zero()
+   end
+
+   local err = nn.Jacobian.testJacobian(module, input)
+   mytester:assertlt(err, precision, 'error on state')
+end
+
+function nntest.VolumetricFractionalMaxPooling_Ratio()
+   -- Fix a reduction ratio, and test with two different input sizes
+   local reductionRatioT = torch.uniform(0.4, 0.74)
+   local reductionRatioW = torch.uniform(0.4, 0.74)
+   local reductionRatioH = torch.uniform(0.4, 0.74)
+
+   for tries = 1, 2 do
+      local batch = math.random(1, 3)
+      local plane = math.random(1, 3)
+      local poolSizeT = math.random(2, 3)
+      local poolSizeW = math.random(2, 3)
+      local poolSizeH = math.random(2, 3)
+
+      local minInT = math.random(5, 8) + poolSizeT
+      local minInW = math.random(5, 8) + poolSizeW
+      local minInH = math.random(5, 8) + poolSizeH
+
+      local inT = math.random(minInT, minInT + 6)
+      local inW = math.random(minInW, minInW + 6)
+      local inH = math.random(minInH, minInH + 6)
+
+      -- fix the pooling regions so they aren't regenerated with every
+      -- forward(), so testJacobian can work properly
+      local module =
+         nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH,
+                                        reductionRatioT, reductionRatioW,
+                                        reductionRatioH)
+         :fixPoolingRegions()
+      local input = nil
+      if batch == 1 then
+         input = torch.Tensor(plane, inH, inW, inT):zero()
+      else
+         input = torch.Tensor(batch, plane, inH, inW, inT):zero()
+      end
+
+      -- Make sure that the output size is based on our ratio
+      local output = module:updateOutput(input)
+      if batch == 1 then
+         mytester:asserteq(output:size(4), math.floor(reductionRatioT * inT))
+         mytester:asserteq(output:size(3), math.floor(reductionRatioW * inW))
+         mytester:asserteq(output:size(2), math.floor(reductionRatioH * inH))
+      else
+         mytester:asserteq(output:size(5), math.floor(reductionRatioT * inT))
+         mytester:asserteq(output:size(4), math.floor(reductionRatioW * inW))
+         mytester:asserteq(output:size(3), math.floor(reductionRatioH * inH))
+      end
+
+      local err = nn.Jacobian.testJacobian(module, input)
+      mytester:assertlt(err, precision, 'error on state')
+   end
+end
+
 function nntest.VolumetricMaxUnpooling()
    local from = math.random(2,3)
    local kt = math.random(3,4)
@@ -5585,6 +5835,15 @@ function nntest.AddConstant()
   local err = (input1-input2):abs():max()
   mytester:asserteq(err, 0, torch.typename(module1) ..
                           ' - inplace input change err ')
+
+  local module3 = nn.AddConstant(torch.Tensor{1,2,3})
+  local out3 = module3:forward(torch.Tensor{-1,-2,-3})
+  mytester:asserteq(0, out3:abs():max(), torch.typename(module3) ..
+                      ' - tensor constant forward err ')
+  local module4 = nn.AddConstant(torch.Tensor{1,2,3})
+  local out4 = module4:forward(torch.Tensor{{-1,-2,-3},{-1,-2,-3}})
+  mytester:asserteq(0, out4:abs():max(), torch.typename(module4) ..
+                      ' - batch tensor constant forward err ')
 end
 
 function nntest.MulConstant()

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-nn.git


