[lua-torch-optim] 01/01: Imported Upstream version 0~20160808-g6c59c35
Zhou Mo
cdluminate-guest at moszumanska.debian.org
Sun Aug 14 14:14:22 UTC 2016
This is an automated email from the git hooks/post-receive script.
cdluminate-guest pushed a commit to branch master
in repository lua-torch-optim.
commit b6c716c5f38f1d6dea85211a0439b2583dc070ae
Author: Zhou Mo <cdluminate at gmail.com>
Date: Sun Aug 14 14:12:59 2016 +0000
Imported Upstream version 0~20160808-g6c59c35
---
.dokx | 4 +
.gitignore | 1 +
CMakeLists.txt | 14 ++
COPYRIGHT.txt | 35 +++++
ConfusionMatrix.lua | 361 ++++++++++++++++++++++++++++++++++++++++++++++
Logger.lua | 186 ++++++++++++++++++++++++
README.md | 8 ++
adadelta.lua | 55 +++++++
adagrad.lua | 55 +++++++
adam.lua | 72 ++++++++++
adamax.lua | 66 +++++++++
asgd.lua | 73 ++++++++++
cg.lua | 208 +++++++++++++++++++++++++++
checkgrad.lua | 41 ++++++
cmaes.lua | 270 +++++++++++++++++++++++++++++++++++
de.lua | 109 ++++++++++++++
doc/algos.md | 364 +++++++++++++++++++++++++++++++++++++++++++++++
doc/intro.md | 41 ++++++
doc/logger.md | 73 ++++++++++
doc/logger_plot.png | Bin 0 -> 45532 bytes
fista.lua | 192 +++++++++++++++++++++++++
init.lua | 33 +++++
lbfgs.lua | 268 ++++++++++++++++++++++++++++++++++
lswolfe.lua | 192 +++++++++++++++++++++++++
mkdocs.yml | 8 ++
nag.lua | 86 +++++++++++
optim-1.0.3-0.rockspec | 29 ++++
optim-1.0.3-1.rockspec | 29 ++++
optim-1.0.4-0.rockspec | 28 ++++
optim-1.0.5-0.rockspec | 27 ++++
polyinterp.lua | 234 ++++++++++++++++++++++++++++++
rmsprop.lua | 57 ++++++++
rprop.lua | 103 ++++++++++++++
sgd.lua | 90 ++++++++++++
test/l2.lua | 22 +++
test/rosenbrock.lua | 50 +++++++
test/sparsecoding.lua | 127 +++++++++++++++++
test/test_adadelta.lua | 23 +++
test/test_adagrad.lua | 23 +++
test/test_adam.lua | 19 +++
test/test_adamax.lua | 23 +++
test/test_cg.lua | 17 +++
test/test_cmaes.lua | 23 +++
test/test_confusion.lua | 39 +++++
test/test_de.lua | 24 ++++
test/test_fista.lua | 95 +++++++++++++
test/test_lbfgs.lua | 38 +++++
test/test_lbfgs_w_ls.lua | 20 +++
test/test_logger.lua | 20 +++
test/test_rmsprop.lua | 23 +++
test/test_sgd.lua | 23 +++
51 files changed, 4021 insertions(+)
diff --git a/.dokx b/.dokx
new file mode 100644
index 0000000..0d62f75
--- /dev/null
+++ b/.dokx
@@ -0,0 +1,4 @@
+return {
+ githubURL = "torch/optim",
+ exclude = {"test", "polyinterp.lua"}
+}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..26ec4de
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+IF(LUAROCKS_PREFIX)
+ MESSAGE(STATUS "Installing Torch through Luarocks")
+ STRING(REGEX REPLACE "(.*)lib/luarocks/rocks.*" "\\1" CMAKE_INSTALL_PREFIX "${LUAROCKS_PREFIX}")
+ MESSAGE(STATUS "Prefix inferred from Luarocks: ${CMAKE_INSTALL_PREFIX}")
+ENDIF()
+FIND_PACKAGE(Torch REQUIRED)
+
+SET(src)
+FILE(GLOB luasrc *.lua)
+ADD_TORCH_PACKAGE(optim "${src}" "${luasrc}")
+#ADD_TORCH_DOK(dok optim "Machine Learning" "Optimization" 3.2)
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
new file mode 100644
index 0000000..2e4118c
--- /dev/null
+++ b/COPYRIGHT.txt
@@ -0,0 +1,35 @@
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the names of NEC Laboratories American and IDIAP Research
+ Institute nor the names of its contributors may be used to endorse or
+ promote products derived from this software without specific prior
+ written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/ConfusionMatrix.lua b/ConfusionMatrix.lua
new file mode 100644
index 0000000..8659a4e
--- /dev/null
+++ b/ConfusionMatrix.lua
@@ -0,0 +1,361 @@
+--[[ A Confusion Matrix class
+
+Example:
+
+ conf = optim.ConfusionMatrix( {'cat','dog','person'} ) -- new matrix
+ conf:zero() -- reset matrix
+ for i = 1,N do
+ conf:add( neuralnet:forward(sample), label ) -- accumulate errors
+ end
+ print(conf) -- print matrix
+ image.display(conf:render()) -- render matrix
+]]
+local ConfusionMatrix = torch.class('optim.ConfusionMatrix')
+
+function ConfusionMatrix:__init(nclasses, classes)
+ if type(nclasses) == 'table' then
+ classes = nclasses
+ nclasses = #classes
+ end
+ self.mat = torch.LongTensor(nclasses,nclasses):zero()
+ self.valids = torch.FloatTensor(nclasses):zero()
+ self.unionvalids = torch.FloatTensor(nclasses):zero()
+ self.nclasses = nclasses
+ self.totalValid = 0
+ self.averageValid = 0
+ self.classes = classes or {}
+ -- buffers
+ self._mat_flat = self.mat:view(-1)
+ self._target = torch.FloatTensor()
+ self._prediction = torch.FloatTensor()
+ self._max = torch.FloatTensor()
+ self._pred_idx = torch.LongTensor()
+ self._targ_idx = torch.LongTensor()
+end
+
+-- takes scalar prediction and target as input
+function ConfusionMatrix:_add(p, t)
+ assert(p and type(p) == 'number')
+ assert(t and type(t) == 'number')
+ -- non-positive values are considered missing
+ -- and therefore ignored
+ if t > 0 then
+ self.mat[t][p] = self.mat[t][p] + 1
+ end
+end
+
+function ConfusionMatrix:add(prediction, target)
+ if type(prediction) == 'number' then
+ -- comparing numbers
+ self:_add(prediction, target)
+ else
+ self._prediction:resize(prediction:size()):copy(prediction)
+ assert(prediction:dim() == 1)
+ if type(target) == 'number' then
+ -- prediction is a vector, then target assumed to be an index
+ self._max:max(self._pred_idx, self._prediction, 1)
+ self:_add(self._pred_idx[1], target)
+ else
+ -- both prediction and target are vectors
+ assert(target:dim() == 1)
+ self._target:resize(target:size()):copy(target)
+ self._max:max(self._targ_idx, self._target, 1)
+ self._max:max(self._pred_idx, self._prediction, 1)
+ self:_add(self._pred_idx[1], self._targ_idx[1])
+ end
+ end
+end
+
+function ConfusionMatrix:batchAdd(predictions, targets)
+ local preds, targs, __
+ self._prediction:resize(predictions:size()):copy(predictions)
+ if predictions:dim() == 1 then
+ -- predictions is a vector of classes
+ preds = self._prediction
+ elseif predictions:dim() == 2 then
+ -- prediction is a matrix of class likelihoods
+ if predictions:size(2) == 1 then
+ -- or prediction just needs flattening
+ preds = self._prediction:select(2,1)
+ else
+ self._max:max(self._pred_idx, self._prediction, 2)
+ preds = self._pred_idx:select(2,1)
+ end
+ else
+ error("predictions has invalid number of dimensions")
+ end
+
+ self._target:resize(targets:size()):copy(targets)
+ if targets:dim() == 1 then
+ -- targets is a vector of classes
+ targs = self._target
+ elseif targets:dim() == 2 then
+ -- targets is a matrix of one-hot rows
+ if targets:size(2) == 1 then
+ -- or targets just needs flattening
+ targs = self._target:select(2,1)
+ else
+ self._max:max(self._targ_idx, self._target, 2)
+ targs = self._targ_idx:select(2,1)
+ end
+ else
+ error("targets has invalid number of dimensions")
+ end
+
+ -- non-positive values are considered missing and therefore ignored
+ local mask = targs:ge(1)
+ targs = targs[mask]
+ preds = preds[mask]
+
+ self._mat_flat = self._mat_flat or self.mat:view(-1) -- for backward compatibility
+
+ preds = preds:typeAs(targs)
+
+ assert(self.mat:isContiguous() and self.mat:stride(2) == 1)
+ local indices = ((targs - 1) * self.mat:stride(1) + preds):typeAs(self.mat)
+ local ones = torch.ones(1):typeAs(self.mat):expand(indices:size(1))
+ self._mat_flat:indexAdd(1, indices, ones)
+end
+
+function ConfusionMatrix:zero()
+ self.mat:zero()
+ self.valids:zero()
+ self.unionvalids:zero()
+ self.totalValid = 0
+ self.averageValid = 0
+end
+
+local function isNaN(number)
+ return number ~= number
+end
+
+function ConfusionMatrix:updateValids()
+ local total = 0
+ for t = 1,self.nclasses do
+ self.valids[t] = self.mat[t][t] / self.mat:select(1,t):sum()
+ self.unionvalids[t] = self.mat[t][t] / (self.mat:select(1,t):sum()+self.mat:select(2,t):sum()-self.mat[t][t])
+ total = total + self.mat[t][t]
+ end
+ self.totalValid = total / self.mat:sum()
+ self.averageValid = 0
+ self.averageUnionValid = 0
+ local nvalids = 0
+ local nunionvalids = 0
+ for t = 1,self.nclasses do
+ if not isNaN(self.valids[t]) then
+ self.averageValid = self.averageValid + self.valids[t]
+ nvalids = nvalids + 1
+ end
+ if not isNaN(self.valids[t]) and not isNaN(self.unionvalids[t]) then
+ self.averageUnionValid = self.averageUnionValid + self.unionvalids[t]
+ nunionvalids = nunionvalids + 1
+ end
+ end
+ self.averageValid = self.averageValid / nvalids
+ self.averageUnionValid = self.averageUnionValid / nunionvalids
+end
+
+-- Calculating FAR/FRR associated with the confusion matrix
+
+function ConfusionMatrix:farFrr()
+ local cmat = self.mat
+ local noOfClasses = cmat:size()[1]
+ self._frrs = self._frrs or torch.zeros(noOfClasses)
+ self._frrs:zero()
+ self._classFrrs = self._classFrrs or torch.zeros(noOfClasses)
+ self._classFrrs:zero()
+ self._classFrrs:add(-1)
+ self._fars = self._fars or torch.zeros(noOfClasses)
+ self._fars:zero()
+ self._classFars = self._classFars or torch.zeros(noOfClasses)
+ self._classFars:zero()
+ self._classFars:add(-1)
+ local classSamplesCount = cmat:sum(2)
+ local indx = 1
+ for i=1,noOfClasses do
+ if classSamplesCount[i][1] ~= 0 then
+ self._frrs[indx] = 1 - cmat[i][i]/classSamplesCount[i][1]
+ self._classFrrs[i] = self._frrs[indx]
+ -- Calculating FARs
+ local farNumerator = 0
+ local farDenominator = 0
+ for j=1, noOfClasses do
+ if i ~= j then
+ if classSamplesCount[j][1] ~= 0 then
+ farNumerator = farNumerator + cmat[j][i]/classSamplesCount[j][1]
+ farDenominator = farDenominator + 1
+ end
+ end
+ end
+ self._fars[indx] = farNumerator/farDenominator
+ self._classFars[i] = self._fars[indx]
+ indx = indx + 1
+ end
+ end
+ indx = indx - 1
+ local returnFrrs = self._frrs[{{1, indx}}]
+ local returnFars = self._fars[{{1, indx}}]
+ return self._classFrrs, self._classFars, returnFrrs, returnFars
+end
+
+local function log10(n)
+ if math.log10 then
+ return math.log10(n)
+ else
+ return math.log(n) / math.log(10)
+ end
+end
+
+function ConfusionMatrix:__tostring__()
+ self:updateValids()
+ local str = {'ConfusionMatrix:\n'}
+ local nclasses = self.nclasses
+ table.insert(str, '[')
+ local maxCnt = self.mat:max()
+ local nDigits = math.max(8, 1 + math.ceil(log10(maxCnt)))
+ for t = 1,nclasses do
+ local pclass = self.valids[t] * 100
+ pclass = string.format('%2.3f', pclass)
+ if t == 1 then
+ table.insert(str, '[')
+ else
+ table.insert(str, ' [')
+ end
+ for p = 1,nclasses do
+ table.insert(str, string.format('%' .. nDigits .. 'd', self.mat[t][p]))
+ end
+ if self.classes and self.classes[1] then
+ if t == nclasses then
+ table.insert(str, ']] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
+ else
+ table.insert(str, '] ' .. pclass .. '% \t[class: ' .. (self.classes[t] or '') .. ']\n')
+ end
+ else
+ if t == nclasses then
+ table.insert(str, ']] ' .. pclass .. '% \n')
+ else
+ table.insert(str, '] ' .. pclass .. '% \n')
+ end
+ end
+ end
+ table.insert(str, ' + average row correct: ' .. (self.averageValid*100) .. '% \n')
+ table.insert(str, ' + average rowUcol correct (VOC measure): ' .. (self.averageUnionValid*100) .. '% \n')
+ table.insert(str, ' + global correct: ' .. (self.totalValid*100) .. '%')
+ return table.concat(str)
+end
+
+function ConfusionMatrix:render(sortmode, display, block, legendwidth)
+ -- args
+ local confusion = self.mat:double()
+ local classes = self.classes
+ local sortmode = sortmode or 'score' -- 'score' or 'occurrence'
+ local block = block or 25
+ local legendwidth = legendwidth or 200
+ local display = display or false
+
+ -- legends
+ local legend = {
+ ['score'] = 'Confusion matrix [sorted by scores, global accuracy = %0.3f%%, per-class accuracy = %0.3f%%]',
+ ['occurrence'] = 'Confusiong matrix [sorted by occurences, accuracy = %0.3f%%, per-class accuracy = %0.3f%%]'
+ }
+
+ -- parse matrix / normalize / count scores
+ local diag = torch.FloatTensor(#classes)
+ local freqs = torch.FloatTensor(#classes)
+ local unconf = confusion
+ local confusion = confusion:clone()
+ local corrects = 0
+ local total = 0
+ for target = 1,#classes do
+ freqs[target] = confusion[target]:sum()
+ corrects = corrects + confusion[target][target]
+ total = total + freqs[target]
+ confusion[target]:div( math.max(confusion[target]:sum(),1) )
+ diag[target] = confusion[target][target]
+ end
+
+ -- accuracies
+ local accuracy = corrects / total * 100
+ local perclass = 0
+ local total = 0
+ for target = 1,#classes do
+ if confusion[target]:sum() > 0 then
+ perclass = perclass + diag[target]
+ total = total + 1
+ end
+ end
+ perclass = perclass / total * 100
+ freqs:div(unconf:sum())
+
+ -- sort matrix
+ if sortmode == 'score' then
+ _,order = torch.sort(diag,1,true)
+ elseif sortmode == 'occurrence' then
+ _,order = torch.sort(freqs,1,true)
+ else
+ error('sort mode must be one of: score | occurrence')
+ end
+
+ -- render matrix
+ local render = torch.zeros(#classes*block, #classes*block)
+ for target = 1,#classes do
+ for prediction = 1,#classes do
+ render[{ { (target-1)*block+1,target*block }, { (prediction-1)*block+1,prediction*block } }] = confusion[order[target]][order[prediction]]
+ end
+ end
+
+ -- add grid
+ for target = 1,#classes do
+ render[{ {target*block},{} }] = 0.1
+ render[{ {},{target*block} }] = 0.1
+ end
+
+ -- create rendering
+ require 'image'
+ require 'qtwidget'
+ require 'qttorch'
+ local win1 = qtwidget.newimage( (#render)[2]+legendwidth, (#render)[1] )
+ image.display{image=render, win=win1}
+
+ -- add legend
+ for i in ipairs(classes) do
+ -- background cell
+ win1:setcolor{r=0,g=0,b=0}
+ win1:rectangle((#render)[2],(i-1)*block,legendwidth,block)
+ win1:fill()
+
+ -- %
+ win1:setfont(qt.QFont{serif=false, size=fontsize})
+ local gscale = freqs[order[i]]/freqs:max()*0.9+0.1 --3/4
+ win1:setcolor{r=gscale*0.5+0.2,g=gscale*0.5+0.2,b=gscale*0.8+0.2}
+ win1:moveto((#render)[2]+10,i*block-block/3)
+ win1:show(string.format('[%2.2f%% labels]',math.floor(freqs[order[i]]*10000+0.5)/100))
+
+ -- legend
+ win1:setfont(qt.QFont{serif=false, size=fontsize})
+ local gscale = diag[order[i]]*0.8+0.2
+ win1:setcolor{r=gscale,g=gscale,b=gscale}
+ win1:moveto(120+(#render)[2]+10,i*block-block/3)
+ win1:show(classes[order[i]])
+
+ for j in ipairs(classes) do
+ -- scores
+ local score = confusion[order[j]][order[i]]
+ local gscale = (1-score)*(score*0.8+0.2)
+ win1:setcolor{r=gscale,g=gscale,b=gscale}
+ win1:moveto((i-1)*block+block/5,(j-1)*block+block*2/3)
+ win1:show(string.format('%02.0f',math.floor(score*100+0.5)))
+ end
+ end
+
+ -- generate tensor
+ local t = win1:image():toTensor()
+
+ -- display
+ if display then
+ image.display{image=t, legend=string.format(legend[sortmode],accuracy,perclass)}
+ end
+
+ -- return rendering
+ return t
+end
diff --git a/Logger.lua b/Logger.lua
new file mode 100644
index 0000000..48915ef
--- /dev/null
+++ b/Logger.lua
@@ -0,0 +1,186 @@
+--[[ Logger: a simple class to log symbols during training,
+ and automate plot generation
+
+Example:
+ logger = optim.Logger('somefile.log') -- file to save stuff
+
+ for i = 1,N do -- log some symbols during
+ train_error = ... -- training/testing
+ test_error = ...
+ logger:add{['training error'] = train_error,
+ ['test error'] = test_error}
+ end
+
+ logger:style{['training error'] = '-', -- define styles for plots
+ ['test error'] = '-'}
+ logger:plot() -- and plot
+
+---- OR ---
+
+ logger = optim.Logger('somefile.log') -- file to save stuff
+ logger:setNames{'training error', 'test error'}
+
+ for i = 1,N do -- log some symbols during
+ train_error = ... -- training/testing
+ test_error = ...
+ logger:add{train_error, test_error}
+ end
+
+ logger:style{'-', '-'} -- define styles for plots
+ logger:plot() -- and plot
+
+-----------
+
+ logger:setlogscale(true) -- enable logscale on Y-axis
+ logger:plot() -- and plot
+]]
+require 'xlua'
+local Logger = torch.class('optim.Logger')
+
+function Logger:__init(filename, timestamp)
+ if filename then
+ self.name = filename
+ os.execute('mkdir ' .. (sys.uname() ~= 'windows' and '-p ' or '') .. ' "' .. paths.dirname(filename) .. '"')
+ if timestamp then
+ -- append timestamp to create unique log file
+ filename = filename .. '-'..os.date("%Y_%m_%d_%X")
+ end
+ self.file = io.open(filename,'w')
+ self.epsfile = self.name .. '.eps'
+ else
+ self.file = io.stdout
+ self.name = 'stdout'
+ print('<Logger> warning: no path provided, logging to std out')
+ end
+ self.empty = true
+ self.symbols = {}
+ self.styles = {}
+ self.names = {}
+ self.idx = {}
+ self.figure = nil
+ self.showPlot = true
+ self.plotRawCmd = nil
+ self.defaultStyle = '+'
+ self.logscale = false
+end
+
+function Logger:setNames(names)
+ self.names = names
+ self.empty = false
+ self.nsymbols = #names
+ for k,key in pairs(names) do
+ self.file:write(key .. '\t')
+ self.symbols[k] = {}
+ self.styles[k] = {self.defaultStyle}
+ self.idx[key] = k
+ end
+ self.file:write('\n')
+ self.file:flush()
+ return self
+end
+
+function Logger:add(symbols)
+ -- (1) first time ? print symbols' names on first row
+ if self.empty then
+ self.empty = false
+ self.nsymbols = #symbols
+ for k,val in pairs(symbols) do
+ self.file:write(k .. '\t')
+ self.symbols[k] = {}
+ self.styles[k] = {self.defaultStyle}
+ self.names[k] = k
+ end
+ self.idx = self.names
+ self.file:write('\n')
+ end
+ -- (2) print all symbols on one row
+ for k,val in pairs(symbols) do
+ if type(val) == 'number' then
+ self.file:write(string.format('%11.4e',val) .. '\t')
+ elseif type(val) == 'string' then
+ self.file:write(val .. '\t')
+ else
+ xlua.error('can only log numbers and strings', 'Logger')
+ end
+ end
+ self.file:write('\n')
+ self.file:flush()
+ -- (3) save symbols in internal table
+ for k,val in pairs(symbols) do
+ table.insert(self.symbols[k], val)
+ end
+end
+
+function Logger:style(symbols)
+ for name,style in pairs(symbols) do
+ if type(style) == 'string' then
+ self.styles[name] = {style}
+ elseif type(style) == 'table' then
+ self.styles[name] = style
+ else
+ xlua.error('style should be a string or a table of strings','Logger')
+ end
+ end
+ return self
+end
+
+function Logger:setlogscale(value)
+ self.logscale = value
+end
+
+function Logger:plot(...)
+ if not xlua.require('gnuplot') then
+ if not self.warned then
+ print('<Logger> warning: cannot plot with this version of Torch')
+ self.warned = true
+ end
+ return
+ end
+ local plotit = false
+ local plots = {}
+ local plotsymbol =
+ function(name,list)
+ if #list > 1 then
+ local nelts = #list
+ local plot_y = torch.Tensor(nelts)
+ for i = 1,nelts do
+ plot_y[i] = list[i]
+ end
+ for _,style in ipairs(self.styles[name]) do
+ table.insert(plots, {self.names[name], plot_y, style})
+ end
+ plotit = true
+ end
+ end
+ local args = {...}
+ if not args[1] then -- plot all symbols
+ for name,list in pairs(self.symbols) do
+ plotsymbol(name,list)
+ end
+ else -- plot given symbols
+ for _,name in ipairs(args) do
+ plotsymbol(self.idx[name], self.symbols[self.idx[name]])
+ end
+ end
+ if plotit then
+ if self.showPlot then
+ self.figure = gnuplot.figure(self.figure)
+ if self.logscale then gnuplot.logscale('on') end
+ gnuplot.plot(plots)
+ if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
+ gnuplot.grid('on')
+ gnuplot.title('<Logger::' .. self.name .. '>')
+ end
+ if self.epsfile then
+ os.execute('rm -f "' .. self.epsfile .. '"')
+ local epsfig = gnuplot.epsfigure(self.epsfile)
+ if self.logscale then gnuplot.logscale('on') end
+ gnuplot.plot(plots)
+ if self.plotRawCmd then gnuplot.raw(self.plotRawCmd) end
+ gnuplot.grid('on')
+ gnuplot.title('<Logger::' .. self.name .. '>')
+ gnuplot.plotflush()
+ gnuplot.close(epsfig)
+ end
+ end
+end
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..561621b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+<a name='optim.dok'></a>
+# Optimization package
+
+This package contains several optimization routines and a logger for [Torch](https://github.com/torch/torch7/blob/master/README.md):
+
+ * [Overview](doc/intro.md);
+ * [Optimization algorithms](doc/algos.md);
+ * [Logger](doc/logger.md).
diff --git a/adadelta.lua b/adadelta.lua
new file mode 100644
index 0000000..7cc058d
--- /dev/null
+++ b/adadelta.lua
@@ -0,0 +1,55 @@
+--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.5701
+
+ARGS:
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `config` : a table of hyper-parameters
+- `config.rho` : interpolation parameter
+- `config.eps` : for numerical stability
+- `config.weightDecay` : weight decay
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.paramVariance` : vector of temporal variances of parameters
+- `state.accDelta` : vector of accummulated delta of gradients
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+]]
+function optim.adadelta(opfunc, x, config, state)
+ -- (0) get/update state
+ if config == nil and state == nil then
+ print('no state table, ADADELTA initializing')
+ end
+ local config = config or {}
+ local state = state or config
+ local rho = config.rho or 0.9
+ local eps = config.eps or 1e-6
+ local wd = config.weightDecay or 0
+ state.evalCounter = state.evalCounter or 0
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- (3) parameter update
+ if not state.paramVariance then
+ state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+ state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+ state.delta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+ state.accDelta = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+ end
+ state.paramVariance:mul(rho):addcmul(1-rho,dfdx,dfdx)
+ state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):add(eps):sqrt()
+ state.delta:resizeAs(state.paramVariance):copy(state.accDelta):add(eps):sqrt():cdiv(state.paramStd):cmul(dfdx)
+ x:add(-1, state.delta)
+ state.accDelta:mul(rho):addcmul(1-rho, state.delta, state.delta)
+ -- (4) update evaluation counter
+ state.evalCounter = state.evalCounter + 1
+
+ -- return x*, f(x) before optimization
+ return x,{fx}
+end
diff --git a/adagrad.lua b/adagrad.lua
new file mode 100644
index 0000000..6860c43
--- /dev/null
+++ b/adagrad.lua
@@ -0,0 +1,55 @@
+--[[ ADAGRAD implementation for SGD
+
+ARGS:
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.learningRate` : learning rate
+- `state.paramVariance` : vector of temporal variances of parameters
+- `state.weightDecay` : scalar that controls weight decay
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+]]
+function optim.adagrad(opfunc, x, config, state)
+ -- (0) get/update state
+ if config == nil and state == nil then
+ print('no state table, ADAGRAD initializing')
+ end
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 1e-3
+ local lrd = config.learningRateDecay or 0
+ local wd = config.weightDecay or 0
+ state.evalCounter = state.evalCounter or 0
+ local nevals = state.evalCounter
+
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- (2) weight decay with a single parameter
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- (3) learning rate decay (annealing)
+ local clr = lr / (1 + nevals*lrd)
+
+ -- (4) parameter update with single or individual learning rates
+ if not state.paramVariance then
+ state.paramVariance = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+ state.paramStd = torch.Tensor():typeAs(x):resizeAs(dfdx)
+ end
+ state.paramVariance:addcmul(1,dfdx,dfdx)
+ state.paramStd:resizeAs(state.paramVariance):copy(state.paramVariance):sqrt()
+ x:addcdiv(-clr, dfdx,state.paramStd:add(1e-10))
+
+ -- (5) update evaluation counter
+ state.evalCounter = state.evalCounter + 1
+
+ -- return x*, f(x) before optimization
+ return x,{fx}
+end
diff --git a/adam.lua b/adam.lua
new file mode 100644
index 0000000..bc80b5e
--- /dev/null
+++ b/adam.lua
@@ -0,0 +1,72 @@
+--[[ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf
+
+ARGS:
+
+- 'opfunc' : a function that takes a single input (X), the point
+ of a evaluation, and returns f(X) and df/dX
+- 'x' : the initial point
+- 'config` : a table with configuration parameters for the optimizer
+- 'config.learningRate' : learning rate
+- `config.learningRateDecay` : learning rate decay
+- 'config.beta1' : first moment coefficient
+- 'config.beta2' : second moment coefficient
+- 'config.epsilon' : for numerical stability
+- 'config.weightDecay' : weight decay
+- 'state' : a table describing the state of the optimizer; after each
+ call the state is modified
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+]]
+
+function optim.adam(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 0.001
+ local lrd = config.learningRateDecay or 0
+
+ local beta1 = config.beta1 or 0.9
+ local beta2 = config.beta2 or 0.999
+ local epsilon = config.epsilon or 1e-8
+ local wd = config.weightDecay or 0
+
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- Initialization
+ state.t = state.t or 0
+ -- Exponential moving average of gradient values
+ state.m = state.m or x.new(dfdx:size()):zero()
+ -- Exponential moving average of squared gradient values
+ state.v = state.v or x.new(dfdx:size()):zero()
+ -- A tmp tensor to hold the sqrt(v) + epsilon
+ state.denom = state.denom or x.new(dfdx:size()):zero()
+
+ -- (3) learning rate decay (annealing)
+ local clr = lr / (1 + state.t*lrd)
+
+ state.t = state.t + 1
+
+ -- Decay the first and second moment running average coefficient
+ state.m:mul(beta1):add(1-beta1, dfdx)
+ state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
+
+ state.denom:copy(state.v):sqrt():add(epsilon)
+
+ local biasCorrection1 = 1 - beta1^state.t
+ local biasCorrection2 = 1 - beta2^state.t
+ local stepSize = clr * math.sqrt(biasCorrection2)/biasCorrection1
+ -- (4) update x
+ x:addcdiv(-stepSize, state.m, state.denom)
+
+ -- return x*, f(x) before optimization
+ return x, {fx}
+end
diff --git a/adamax.lua b/adamax.lua
new file mode 100644
index 0000000..2b64877
--- /dev/null
+++ b/adamax.lua
@@ -0,0 +1,66 @@
+--[[ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf
+
+ARGS:
+
+- 'opfunc' : a function that takes a single input (X), the point
+ of a evaluation, and returns f(X) and df/dX
+- 'x' : the initial point
+- 'config` : a table with configuration parameters for the optimizer
+- 'config.learningRate' : learning rate
+- 'config.beta1' : first moment coefficient
+- 'config.beta2' : second moment coefficient
+- 'config.epsilon' : for numerical stability
+- 'state' : a table describing the state of the optimizer;
+ after each call the state is modified.
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+]]
+
+function optim.adamax(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 0.002
+
+ local beta1 = config.beta1 or 0.9
+ local beta2 = config.beta2 or 0.999
+ local epsilon = config.epsilon or 1e-38
+ local wd = config.weightDecay or 0
+
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- Initialization
+ state.t = state.t or 0
+ -- Exponential moving average of gradient values
+ state.m = state.m or x.new(dfdx:size()):zero()
+ -- Exponential moving average of the infinity norm
+ state.u = state.u or x.new(dfdx:size()):zero()
+ -- A tmp tensor to hold the input to max()
+ state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
+
+ state.t = state.t + 1
+
+ -- Update biased first moment estimate.
+ state.m:mul(beta1):add(1-beta1, dfdx)
+ -- Update the exponentially weighted infinity norm.
+ state.max[1]:copy(state.u):mul(beta2)
+ state.max[2]:copy(dfdx):abs():add(epsilon)
+ state.u:max(state.max, 1)
+
+ local biasCorrection1 = 1 - beta1^state.t
+ local stepSize = lr/biasCorrection1
+ -- (2) update x
+ x:addcdiv(-stepSize, state.m, state.u)
+
+ -- return x*, f(x) before optimization
+ return x, {fx}
+end
diff --git a/asgd.lua b/asgd.lua
new file mode 100644
index 0000000..cc1c459
--- /dev/null
+++ b/asgd.lua
@@ -0,0 +1,73 @@
+--[[ An implementation of ASGD
+
+ASGD:
+
+ x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
+ a := a + mu_t [ x - a ]
+
+ eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
+ mu_t = 1/max(1,t-t0)
+
+implements ASGD algoritm as in L.Bottou's sgd-2.0
+
+ARGS:
+
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.eta0` : learning rate
+- `state.lambda` : decay term
+- `state.alpha` : power for eta update
+- `state.t0` : point at which to start averaging
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+- `ax` : the averaged x vector
+
+(Clement Farabet, 2012)
+--]]
+function optim.asgd(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ config.eta0 = config.eta0 or 1e-4
+ config.lambda = config.lambda or 1e-4
+ config.alpha = config.alpha or 0.75
+ config.t0 = config.t0 or 1e6
+
+ -- (hidden state)
+ state.eta_t = state.eta_t or config.eta0
+ state.mu_t = state.mu_t or 1
+ state.t = state.t or 0
+
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- (2) decay term
+ x:mul(1 - config.lambda*state.eta_t)
+
+ -- (3) update x
+ x:add(-state.eta_t, dfdx)
+
+ -- (4) averaging
+ state.ax = state.ax or torch.Tensor():typeAs(x):resizeAs(x):zero()
+ state.tmp = state.tmp or torch.Tensor():typeAs(state.ax):resizeAs(state.ax)
+ if state.mu_t ~= 1 then
+ state.tmp:copy(x)
+ state.tmp:add(-1,state.ax):mul(state.mu_t)
+ state.ax:add(state.tmp)
+ else
+ state.ax:copy(x)
+ end
+
+ -- (5) update eta_t and mu_t
+ state.t = state.t + 1
+ state.eta_t = config.eta0 / math.pow((1 + config.lambda * config.eta0 * state.t), config.alpha)
+ state.mu_t = 1 / math.max(1, state.t - config.t0)
+
+ -- return x*, f(x) before optimization, and average(x_t0,x_t1,x_t2,...)
+ return x,{fx},state.ax
+end
diff --git a/cg.lua b/cg.lua
new file mode 100644
index 0000000..842a7d5
--- /dev/null
+++ b/cg.lua
@@ -0,0 +1,208 @@
+--[[
+
+This cg implementation is a rewrite of minimize.m written by Carl
+E. Rasmussen. It is supposed to produce exactly same results (give
+or take numerical accuracy due to some changed order of
+operations). You can compare the result on rosenbrock with minimize.m.
+http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html
+
+ [x fx c] = minimize([0 0]', 'rosenbrock', -25)
+
+Note that we limit the number of function evaluations only, it seems much
+more important in practical use.
+
+ARGS:
+
+- `opfunc` : a function that takes a single input, the point of evaluation.
+- `x` : the initial point
+- `state` : a table of parameters and temporary allocations.
+- `state.maxEval` : max number of function evaluations
+- `state.maxIter` : max number of iterations
+- `state.df[0,1,2,3]` : if you pass torch.Tensor they will be used for temp storage
+- `state.[s,x0]` : if you pass torch.Tensor they will be used for temp storage
+
+RETURN:
+
+- `x*` : the new x vector, at the optimal point
+- `f` : a table of all function values where
+ `f[1]` is the value of the function before any optimization and
+ `f[#f]` is the final fully optimized value, at x*
+
+(Koray Kavukcuoglu, 2012)
+--]]
+function optim.cg(opfunc, x, config, state)
+ -- parameters
+ local config = config or {}
+ local state = state or config
+ local rho = config.rho or 0.01
+ local sig = config.sig or 0.5
+ local int = config.int or 0.1
+ local ext = config.ext or 3.0
+ local maxIter = config.maxIter or 20
+ local ratio = config.ratio or 100
+ local maxEval = config.maxEval or maxIter*1.25
+ local red = 1
+
+ local verbose = config.verbose or 0
+
+ local i = 0
+ local ls_failed = 0
+ local fx = {}
+
+ -- we need three points for the interpolation/extrapolation stuff
+ local z1,z2,z3 = 0,0,0
+ local d1,d2,d3 = 0,0,0
+ local f1,f2,f3 = 0,0,0
+
+ local df1 = state.df1 or x.new()
+ local df2 = state.df2 or x.new()
+ local df3 = state.df3 or x.new()
+ local tdf
+
+ df1:resizeAs(x)
+ df2:resizeAs(x)
+ df3:resizeAs(x)
+
+ -- search direction
+ local s = state.s or x.new()
+ s:resizeAs(x)
+
+ -- we need a temp storage for X
+ local x0 = state.x0 or x.new()
+ local f0 = 0
+ local df0 = state.df0 or x.new()
+ x0:resizeAs(x)
+ df0:resizeAs(x)
+
+ -- evaluate at initial point
+ f1,tdf = opfunc(x)
+ fx[#fx+1] = f1
+ df1:copy(tdf)
+ i=i+1
+
+ -- initial search direction
+ s:copy(df1):mul(-1)
+
+ d1 = -s:dot(s ) -- slope
+ z1 = red/(1-d1) -- initial step
+
+ while i < math.abs(maxEval) do
+
+ x0:copy(x)
+ f0 = f1
+ df0:copy(df1)
+
+ x:add(z1,s)
+ f2,tdf = opfunc(x)
+ df2:copy(tdf)
+ i=i+1
+ d2 = df2:dot(s)
+ f3,d3,z3 = f1,d1,-z1 -- init point 3 equal to point 1
+ local m = math.min(maxIter,maxEval-i)
+ local success = 0
+ local limit = -1
+
+ while true do
+ while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
+ limit = z1
+ if f2 > f1 then
+ z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
+ else
+ local A = 6*(f2-f3)/z3+3*(d2+d3)
+ local B = 3*(f3-f2)-z3*(d3+2*d2)
+ z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
+ end
+ if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
+ z2 = z3/2;
+ end
+ z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
+ z1 = z1 + z2;
+ x:add(z2,s)
+ f2,tdf = opfunc(x)
+ df2:copy(tdf)
+ i=i+1
+ m = m - 1
+ d2 = df2:dot(s)
+ z3 = z3-z2;
+ end
+ if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
+ break
+ elseif d2 > sig*d1 then
+ success = 1;
+ break;
+ elseif m == 0 then
+ break;
+ end
+ local A = 6*(f2-f3)/z3+3*(d2+d3);
+ local B = 3*(f3-f2)-z3*(d3+2*d2);
+ z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
+
+ if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
+ if limit < -0.5 then
+ z2 = z1 * (ext -1)
+ else
+ z2 = (limit-z1)/2
+ end
+ elseif (limit > -0.5) and (z2+z1) > limit then
+ z2 = (limit-z1)/2
+ elseif limit < -0.5 and (z2+z1) > z1*ext then
+ z2 = z1*(ext-1)
+ elseif z2 < -z3*int then
+ z2 = -z3*int
+ elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
+ z2 = (limit-z1)*(1-int)
+ end
+ f3=f2; d3=d2; z3=-z2;
+ z1 = z1+z2;
+ x:add(z2,s)
+
+ f2,tdf = opfunc(x)
+ df2:copy(tdf)
+ i=i+1
+ m = m - 1
+ d2 = df2:dot(s)
+ end
+ if success == 1 then
+ f1 = f2
+ fx[#fx+1] = f1;
+ local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
+ s:mul(ss)
+ s:add(-1,df2)
+ local tmp = df1:clone()
+ df1:copy(df2)
+ df2:copy(tmp)
+ d2 = df1:dot(s)
+ if d2> 0 then
+ s:copy(df1)
+ s:mul(-1)
+ d2 = -s:dot(s)
+ end
+
+ z1 = z1 * math.min(ratio, d1/(d2-1e-320))
+ d1 = d2
+ ls_failed = 0
+ else
+ x:copy(x0)
+ f1 = f0
+ df1:copy(df0)
+ if ls_failed or i>maxEval then
+ break
+ end
+ local tmp = df1:clone()
+ df1:copy(df2)
+ df2:copy(tmp)
+ s:copy(df1)
+ s:mul(-1)
+ d1 = -s:dot(s)
+ z1 = 1/(1-d1)
+ ls_failed = 1
+ end
+ end
+ state.df0 = df0
+ state.df1 = df1
+ state.df2 = df2
+ state.df3 = df3
+ state.x0 = x0
+ state.s = s
+ return x,fx,i
+end
diff --git a/checkgrad.lua b/checkgrad.lua
new file mode 100644
index 0000000..aecb969
--- /dev/null
+++ b/checkgrad.lua
@@ -0,0 +1,41 @@
+--[[ An implementation of a simple numerical gradient checker.
+
+ARGS:
+
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `eps` : the epsilon to use for the numerical check (default is 1e-7)
+
+RETURN:
+
+- `diff` : error in the gradient, should be near tol
+- `dC` : exact gradient at point
+- `dC_est` : numerically estimates gradient at point
+
+]]--
+
+
+-- function that numerically checks gradient of NCA loss:
+function optim.checkgrad(opfunc, x, eps)
+
+ -- compute true gradient:
+ local _,dC = opfunc(x)
+ dC:resize(x:size())
+
+ -- compute numeric approximations to gradient:
+ local eps = eps or 1e-7
+ local dC_est = torch.Tensor():typeAs(dC):resizeAs(dC)
+ for i = 1,dC:size(1) do
+ x[i] = x[i] + eps
+ local C1 = opfunc(x)
+ x[i] = x[i] - 2 * eps
+ local C2 = opfunc(x)
+ x[i] = x[i] + eps
+ dC_est[i] = (C1 - C2) / (2 * eps)
+ end
+
+ -- estimate error of gradient:
+ local diff = torch.norm(dC - dC_est) / torch.norm(dC + dC_est)
+ return diff,dC,dC_est
+end
diff --git a/cmaes.lua b/cmaes.lua
new file mode 100644
index 0000000..74cd58a
--- /dev/null
+++ b/cmaes.lua
@@ -0,0 +1,270 @@
+require 'torch'
+require 'math'
+
+local BestSolution = {}
+--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
+ported from https://www.lri.fr/~hansen/barecmaes2.html.
+
+Parameters
+----------
+ARGS:
+
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX. Note that df/dX is not used
+- `x` : the initial point
+- `state.sigma`
+ float, initial step-size (standard deviation in each
+ coordinate)
+- `state.maxEval`
+ int, maximal number of function evaluations
+- `state.ftarget`
+ float, target function value
+- `state.popsize`
+ population size. If this is left empty,
+ 4 + int(3 * log(|x|)) will be used
+- `state.ftarget`
+ stop if fitness < ftarget
+- `state.verb_disp`
+ int, display on console every verb_disp iteration, 0 for never
+
+RETURN:
+- `x*` : the new `x` vector, at the optimal point
+- `f` : a table of all function values:
+ `f[1]` is the value of the function before any optimization and
+ `f[#f]` is the final fully optimized value, at `x*`
+--]]
+function optim.cmaes(opfunc, x, config, state)
+ if (x.triu == nil or x.diag == nil) then
+ error('Unsupported Tensor ' .. x:type() .. " please use Float- or DoubleTensor for x")
+ end
+ -- process input parameters
+ local config = config or {}
+ local state = state or config
+ local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
+ local N = xmean:size(1) -- number of objective variables/problem dimension
+ local sigma = state.sigma -- coordinate wise standard deviation (step size)
+ local ftarget = state.ftarget -- stop if fitness < ftarget
+ local maxEval = tonumber(state.maxEval) or 1e3*N^2
+ local objfunc = opfunc
+ local verb_disp = state.verb_disp -- display step size
+ local min_iterations = state.min_iterations or 1
+
+ local lambda = state.popsize -- population size, offspring number
+ -- Strategy parameter setting: Selection
+ if state.popsize == nil then
+ lambda = 4 + math.floor(3 * math.log(N))
+ end
+
+ local mu = lambda / 2 -- number of parents/points for recombination
+ local weights = torch.range(0,mu-1):apply(function(i)
+ return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
+ weights:div(weights:sum()) -- normalize recombination weights array
+ local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
+ weights = weights:typeAs(x)
+
+ -- Strategy parameter setting: Adaptation
+ local cc = (4 + mueff/N) / (N+4 + 2 * mueff/N) -- time constant for cumulation for C
+ local cs = (mueff + 2) / (N + mueff + 5) -- t-const for cumulation for sigma control
+ local c1 = 2 / ((N + 1.3)^2 + mueff) -- learning rate for rank-one update of C
+ local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
+ local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1
+
+ -- Initialize dynamic (internal) state variables
+ local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
+ local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
+ local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
+ local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
+ local C = torch.eye(N):typeAs(x) -- covariance matrix
+ if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
+ error('torch.symeig not available for ' .. x:type() ..
+ " please use Float- or DoubleTensor for x")
+ end
+ local candidates = torch.Tensor(lambda,N):typeAs(x)
+ local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
+ local eigeneval = 0 -- tracking the update of B and D
+ local counteval = 0
+ local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
+ local fitvals = torch.Tensor(lambda)-- fitness values
+ local best = BestSolution.new(nil,nil,counteval)
+ local iteration = 0 -- iteration of the optimize loop
+
+
+ local function ask()
+ --[[return a list of lambda candidate solutions according to
+ m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
+ --]]
+ -- Eigendecomposition: first update B, D and invsqrtC from C
+ -- postpone in case to achieve O(N^2)
+ if counteval - eigeneval > lambda/(c1+cmu)/C:size(1)/10 then
+ eigeneval = counteval
+ C = torch.triu(C) + torch.triu(C,1):t() -- enforce symmetry
+ D, B = torch.symeig(C,'V') -- eigen decomposition, B==normalized eigenvectors, O(N^3)
+ D = torch.sqrt(D) -- D contains standard deviations now
+ invsqrtC = (B * torch.diag(torch.pow(D,-1)) * B:t())
+ end
+ for k=1,lambda do --repeat lambda times
+ local z = D:clone():normal(0,1):cmul(D)
+ candidates[{k,{}}] = torch.add(xmean, (B * z) * sigma)
+ end
+
+ return candidates
+ end
+
+
+ local function tell(arx)
+ --[[update the evolution paths and the distribution parameters m,
+ sigma, and C within CMA-ES.
+
+ Parameters
+ ----------
+ `arx`
+ a list of solutions, presumably from `ask()`
+ `fitvals`
+ the corresponding objective function values --]]
+ -- bookkeeping, preparation
+ counteval = counteval + lambda -- slightly artificial to do here
+ local xold = xmean:clone()
+
+ -- Sort by fitness and compute weighted mean into xmean
+ local arindex = nil --sorted indices
+ fitvals, arindex = torch.sort(fitvals)
+ arx = arx:index(1, arindex[{{1, mu}}]) -- sorted candidate solutions
+
+ table.insert(f_hist, fitvals[1]) --append best fitness to history
+ best:update(arx[1], fitvals[1], counteval)
+
+ xmean:zero()
+ xmean:addmv(arx:t(), weights) --dot product
+
+ -- Cumulation: update evolution paths
+ local y = xmean - xold
+ local z = invsqrtC * y -- == C^(-1/2) * (xnew - xold)
+
+ local c = (cs * (2-cs) * mueff)^0.5 / sigma
+ ps = ps - ps * cs + z * c -- exponential decay on ps
+ local hsig = (torch.sum(torch.pow(ps,2)) /
+ (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
+ hsig = hsig and 1.0 or 0.0 --use binary numbers
+
+ c = (cc * (2-cc) * mueff)^0.5 / sigma
+ pc = pc - pc * cc + y * c * hsig -- exponential decay on pc
+
+ -- Adapt covariance matrix C
+ local c1a = c1 - (1-hsig^2) * c1 * cc * (2-cc)
+ -- for a minor adjustment to the variance loss by hsig
+ for i=1,N do
+ for j=1,N do
+ local r = torch.range(1,mu)
+ r:apply(function(k)
+ return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
+ local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
+ C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
+ c1 * pc[i]*pc[j] + cmu * Cmuij)
+ end
+ end
+
+ -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
+ sigma = sigma * math.exp(math.min(0.6,
+ (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
+ end
+
+ local function stop()
+ --[[return satisfied termination conditions in a table like
+ {'termination reason':value, ...}, for example {'tolfun':1e-12},
+ or the empty table {}--]]
+ local res = {}
+ if counteval > 0 then
+ if counteval >= maxEval then
+ res['evals'] = maxEval
+ end
+ if ftarget ~= nil and fitvals:nElement() > 0 and fitvals[1] <= ftarget then
+ res['ftarget'] = ftarget
+ end
+ if torch.max(D) > 1e7 * torch.min(D) then
+ res['condition'] = 1e7
+ end
+ if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
+ res['tolfun'] = 1e-12
+ end
+ if sigma * torch.max(D) < 1e-11 then
+ -- remark: max(D) >= max(diag(C))^0.5
+ res['tolx'] = 1e-11
+ end
+ end
+ return res
+ end
+
+ local function disp(verb_modulo)
+ --[[display some iteration info--]]
+ if verb_disp == 0 then
+ return nil
+ end
+ local iteration = counteval / lambda
+
+ if iteration == 1 or iteration % (10*verb_modulo) == 0 then
+ print('evals:\t ax-ratio max(std) f-value')
+ end
+ if iteration <= 2 or iteration % verb_modulo == 0 then
+ local max_std = math.sqrt(torch.max(torch.diag(C)))
+ print(tostring(counteval).. ': ' ..
+ string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
+ .. tostring(fitvals[1]))
+ end
+
+ return nil
+ end
+
+ while next(stop()) == nil or iteration < min_iterations do
+ iteration = iteration + 1
+
+ local X = ask() -- deliver candidate solutions
+ for i=1, lambda do
+ -- put candidate tensor back in input shape and evaluate in opfunc
+ local candidate = X[i]:viewAs(x)
+ fitvals[i] = objfunc(candidate)
+ end
+
+ tell(X)
+ disp(verb_disp)
+ end
+
+ local bestmu, f, c = best:get()
+ if verb_disp > 0 then
+ for k, v in pairs(stop()) do
+ print('termination by', k, '=', v)
+ end
+ print('best f-value =', f)
+ print('solution = ')
+ print(bestmu)
+ print('best found at iteration: ', c/lambda, ' , total iterations: ', iteration)
+ end
+ table.insert(f_hist, f)
+
+ return bestmu, f_hist, counteval
+end
+
+
+
+BestSolution.__index = BestSolution
+function BestSolution.new(x, f, evals)
+ local self = setmetatable({}, BestSolution)
+ self.x = x
+ self.f = f
+ self.evals = evals
+ return self
+end
+
+function BestSolution.update(self, arx, arf, evals)
+ --[[initialize the best solution with `x`, `f`, and `evals`.
+ Better solutions have smaller `f`-values.--]]
+ if self.f == nil or arf < self.f then
+ self.x = arx:clone()
+ self.f = arf
+ self.evals = evals
+ end
+ return self
+end
+
+function BestSolution.get(self)
+ return self.x, self.f, self.evals
+end
diff --git a/de.lua b/de.lua
new file mode 100644
index 0000000..72a2c8a
--- /dev/null
+++ b/de.lua
@@ -0,0 +1,109 @@
+--[[ An implementation of `DE` (Differential Evolution),
+
+ARGS:
+
+ -`opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX. Note that df/dX is not used
+ -`x` : the initial point
+ -`state.popsize`: population size. If this is left empty, 10*d will be used
+ -`state.scaleFactor`: float, usually between 0.4 and 1
+ -`state.crossoverRate`: float, usually between 0.1 and 0.9
+ -`state.maxEval`: int, maximal number of function evaluations
+
+RETURN:
+ - `x*` : the new `x` vector, at the optimal point
+ - `f` : a table of all function values:
+ `f[1]` is the value of the function before any optimization and
+ `f[#f]` is the final fully optimized value, at `x*`
+]]
+
+require 'torch'
+
+function optim.de(opfunc, x, config, state)
+ -- process input parameters
+ local config = config or {}
+ local state = state
+ local popsize = config.popsize -- population size
+ local scaleFactor = config.scaleFactor -- scale factor
+ local crossoverRate = config.crossoverRate -- crossover rate
+ local maxFEs = tonumber(config.maxFEs) -- maximal number of function evaluations
+ local maxRegion = config.maxRegion -- upper bound of search region
+ local minRegion = config.minRegion -- lower bound of search region
+ local xmean = x:clone():view(-1) -- distribution mean, a flattened copy
+ local D = xmean:size(1) -- number of objective variables/problem dimension
+
+ if config.popsize == nil then
+ popsize = 10 * D
+ end
+ if config.maxRegion == nil then
+ maxRegion = 30
+ end
+ if config.minRegion == nil then
+ minRegion = -30
+ end
+
+ -- Initialize population
+ local fx = x.new(maxFEs)
+ local pop = x.new(popsize, D)
+ local children = x.new(popsize, D)
+ local fitness = x.new(popsize)
+ local children_fitness = x.new(popsize)
+ local fes = 1 -- number of function evaluations
+ local best_fitness
+ local best_solution = x.new(D)
+
+ -- Initialize population and evaluate the its fitness value
+ local gen = torch.Generator()
+ torch.manualSeed(gen, 1)
+
+ pop:uniform(gen, minRegion, maxRegion)
+ for i = 1, popsize do
+ fitness[i] = opfunc(pop[i])
+ fx[fes] = fitness[i]
+ fes = fes + 1
+ end
+
+ -- Find the best solution
+ local index
+ best_fitness, index = fitness:max(1)
+ best_fitness = best_fitness[1]
+ index = index[1]
+ best_solution:copy(pop[index])
+
+ -- Main loop
+ while fes < maxFEs do
+ local r1, r2
+ for i = 1, popsize do
+ repeat
+ r1 = torch.random(gen, 1, popsize)
+ until(r1 ~= i)
+ repeat
+ r2 = torch.random(gen, 1, popsize)
+ until(r2 ~= r1 and r2 ~= i)
+
+ local jrand = torch.random(gen, 1, D)
+ for j = 1, D do
+ if torch.uniform(gen, 0, 1) < crossoverRate or i == jrand then
+ children[i][j] = best_solution[j] + scaleFactor * (pop[r1][j] - pop[r2][j])
+ else
+ children[i][j] = pop[i][j]
+ end
+ end
+ children_fitness[i] = opfunc(children[i])
+ fx[fes] = children_fitness[i]
+ fes = fes + 1
+ end
+
+ for i = 1, popsize do
+ if children_fitness[i] <= fitness[i] then
+ pop[i]:copy(children[i])
+ fitness[i] = children_fitness[i]
+ if fitness[i] < best_fitness then
+ best_fitness = fitness[i]
+ best_solution:copy(children[i])
+ end
+ end
+ end
+ end
+ return best_solution, fx
+end
diff --git a/doc/algos.md b/doc/algos.md
new file mode 100644
index 0000000..a3ce681
--- /dev/null
+++ b/doc/algos.md
@@ -0,0 +1,364 @@
+<a name='optim.algos'></a>
+# Optimization algorithms
+
+The following algorithms are provided:
+
+ * [*Stochastic Gradient Descent*](#optim.sgd)
+ * [*Averaged Stochastic Gradient Descent*](#optim.asgd)
+ * [*L-BFGS*](#optim.lbfgs)
+ * [*Congugate Gradients*](#optim.cg)
+ * [*AdaDelta*](#optim.adadelta)
+ * [*AdaGrad*](#optim.adagrad)
+ * [*Adam*](#optim.adam)
+ * [*AdaMax*](#optim.adamax)
+ * [*FISTA with backtracking line search*](#optim.FistaLS)
+ * [*Nesterov's Accelerated Gradient method*](#optim.nag)
+ * [*RMSprop*](#optim.rmsprop)
+ * [*Rprop*](#optim.rprop)
+ * [*CMAES*](#optim.cmaes)
+
+All these algorithms are designed to support batch optimization as well as stochastic optimization.
+It's up to the user to construct an objective function that represents the batch, mini-batch, or single sample on which to evaluate the objective.
+
+Some of these algorithms support a line search, which can be passed as a function (*L-BFGS*), whereas others only support a learning rate (*SGD*).
+
+General interface:
+
+```lua
+x*, {f}, ... = optim.method(opfunc, x[, config][, state])
+```
+
+
+<a name='optim.sgd'></a>
+## sgd(opfunc, x[, config][, state])
+
+An implementation of *Stochastic Gradient Descent* (*SGD*).
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of a evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `config.learningRateDecay`: learning rate decay
+ * `config.weightDecay`: weight decay
+ * `config.weightDecays`: vector of individual weight decays
+ * `config.momentum`: momentum
+ * `config.dampening`: dampening for momentum
+ * `config.nesterov`: enables Nesterov momentum
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+ * `state.learningRates`: vector of individual learning rates
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.asgd'></a>
+## asgd(opfunc, x[, config][, state])
+
+An implementation of *Averaged Stochastic Gradient Descent* (*ASGD*):
+
+```lua
+x = (1 - lambda eta_t) x - eta_t df / dx(z, x)
+a = a + mu_t [ x - a ]
+
+eta_t = eta0 / (1 + lambda eta0 t) ^ 0.75
+mu_t = 1 / max(1, t - t0)
+```
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.eta0`: learning rate
+ * `config.lambda`: decay term
+ * `config.alpha`: power for eta update
+ * `config.t0`: point at which to start averaging
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+ * `ax`: the averaged x vector
+
+
+<a name='optim.lbfgs'></a>
+## lbfgs(opfunc, x[, config][, state])
+
+An implementation of *L-BFGS* that relies on a user-provided line search function (`state.lineSearch`).
+If this function is not provided, then a simple learning rate is used to produce fixed size steps.
+Fixed size steps are much less costly than line searches, and can be useful for stochastic problems.
+
+The learning rate is used even when a line search is provided.
+This is also useful for large-scale stochastic problems, where opfunc is a noisy approximation of `f(x)`.
+In that case, the learning rate allows a reduction of confidence in the step size.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.maxIter`: Maximum number of iterations allowed
+ * `config.maxEval`: Maximum number of function evaluations
+ * `config.tolFun`: Termination tolerance on the first-order optimality
+ * `config.tolX`: Termination tol on progress in terms of func/param changes
+ * `config.lineSearch`: A line search function
+ * `config.learningRate`: If no line search provided, then a fixed step size is used
+
+Returns:
+ * `x*`: the new `x` vector, at the optimal point
+ * `f`: a table of all function values:
+ * `f[1]` is the value of the function before any optimization and
+ * `f[#f]` is the final fully optimized value, at `x*`
+
+
+<a name='optim.cg'></a>
+## cg(opfunc, x[, config][, state])
+
+An implementation of the *Conjugate Gradient* method which is a rewrite of `minimize.m` written by Carl E. Rasmussen.
+It is supposed to produce exactly same results (give or take numerical accuracy due to some changed order of operations).
+You can compare the result on rosenbrock with [`minimize.m`](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html).
+
+```lua
+x, fx, c = minimize([0, 0]', 'rosenbrock', -25)
+```
+
+Note that we limit the number of function evaluations only, it seems much more important in practical use.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input, the point of evaluation.
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.maxEval`: max number of function evaluations
+ * `config.maxIter`: max number of iterations
+ * `state`: a table of parameters and temporary allocations.
+ * `state.df[0, 1, 2, 3]`: if you pass `Tensor` they will be used for temp storage
+ * `state.[s, x0]`: if you pass `Tensor` they will be used for temp storage
+
+Returns:
+
+ * `x*`: the new `x` vector, at the optimal point
+ * `f`: a table of all function values where
+ * `f[1]` is the value of the function before any optimization and
+ * `f[#f]` is the final fully optimized value, at `x*`
+
+
+<a name='optim.adadelta'></a>
+## adadelta(opfunc, x[, config][, state])
+
+*AdaDelta* implementation for *SGD* http://arxiv.org/abs/1212.5701.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table of hyper-parameters
+ * `config.rho`: interpolation parameter
+ * `config.eps`: for numerical stability
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+ * `state.paramVariance`: vector of temporal variances of parameters
+ * `state.accDelta`: vector of accummulated delta of gradients
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.adagrad'></a>
+## adagrad(opfunc, x[, config][, state])
+
+*AdaGrad* implementation for *SGD*.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+ * `state.paramVariance`: vector of temporal variances of parameters
+
+Returns:
+
+ * `x*`: the new `x` vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.adam'></a>
+## adam(opfunc, x[, config][, state])
+
+An implementation of *Adam* from http://arxiv.org/pdf/1412.6980.pdf.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of a evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `config.learningRateDecay`: learning rate decay
+ * `config.beta1`: first moment coefficient
+ * `config.beta2`: second moment coefficient
+ * `config.epsilon`: for numerical stability
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.adamax'></a>
+## adamax(opfunc, x[, config][, state])
+
+An implementation of *AdaMax* http://arxiv.org/pdf/1412.6980.pdf.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of a evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `config.beta1`: first moment coefficient
+ * `config.beta2`: second moment coefficient
+ * `config.epsilon`: for numerical stability
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+
+Returns:
+
+ * `x*`: the new `x` vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.FistaLS'></a>
+## FistaLS(f, g, pl, xinit[, params])
+
+*Fista* with backtracking *Line Search*:
+
+ * `f`: smooth function
+ * `g`: non-smooth function
+ * `pl`: minimizer of intermediate problem Q(x, y)
+ * `xinit`: initial point
+ * `params`: table of parameters (**optional**)
+ * `params.L`: 1/(step size) for ISTA/FISTA iteration (0.1)
+ * `params.Lstep`: step size multiplier at each iteration (1.5)
+ * `params.maxiter`: max number of iterations (50)
+ * `params.maxline`: max number of line search iterations per iteration (20)
+ * `params.errthres`: Error thershold for convergence check (1e-4)
+ * `params.doFistaUpdate`: true : use FISTA, false: use ISTA (true)
+ * `params.verbose`: store each iteration solution and print detailed info (false)
+
+On output, `params` will contain these additional fields that can be reused.
+ * `params.L`: last used L value will be written.
+
+These are temporary storages needed by the algo and if the same params object is
+passed a second time, these same storages will be used without new allocation.
+ * `params.xkm`: previous iterarion point
+ * `params.y`: fista iteration
+ * `params.ply`: `ply = pl(y * 1/L grad(f))`
+
+Returns the solution `x` and history of `{function evals, number of line search , ...}`.
+
+Algorithm is published in http://epubs.siam.org/doi/abs/10.1137/080716542
+
+
+<a name='optim.nag'></a>
+## nag(opfunc, x[, config][, state])
+
+An implementation of *SGD* adapted with features of *Nesterov's Accelerated Gradient method*, based on the paper "On the Importance of Initialization and Momentum in Deep Learning" (Sutsveker et. al., ICML 2013) http://www.cs.toronto.edu/~fritz/absps/momentum.pdf.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `config.learningRateDecay`: learning rate decay
+ * `config.weightDecay`: weight decay
+ * `config.momentum`: momentum
+ * `config.learningRates`: vector of individual learning rates
+
+Returns:
+
+ * `x*`: the new `x` vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.rmsprop'></a>
+## rmsprop(opfunc, x[, config][, state])
+
+An implementation of *RMSprop*.
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of a evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.learningRate`: learning rate
+ * `config.alpha`: smoothing constant
+ * `config.epsilon`: value with which to initialise m
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+ * `state.m`: leaky sum of squares of parameter gradients,
+ * `state.tmp`: and the square root (with epsilon smoothing)
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.rprop'></a>
+## rprop(opfunc, x[, config][, state])
+
+A plain implementation of *Rprop* (Martin Riedmiller, Koray Kavukcuoglu 2013).
+
+Arguments:
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`
+ * `x`: the initial point
+ * `config`: a table with configuration parameters for the optimizer
+ * `config.stepsize`: initial step size, common to all components
+ * `config.etaplus`: multiplicative increase factor, > 1 (default 1.2)
+ * `config.etaminus`: multiplicative decrease factor, < 1 (default 0.5)
+ * `config.stepsizemax`: maximum stepsize allowed (default 50)
+ * `config.stepsizemin`: minimum stepsize allowed (default 1e-6)
+ * `config.niter`: number of iterations (default 1)
+
+Returns:
+
+ * `x*`: the new x vector
+ * `f(x)`: the function, evaluated before the update
+
+
+<a name='optim.cmaes'></a>
+## cmaes(opfunc, x[, config][, state])
+
+An implementation of *CMAES* (*Covariance Matrix Adaptation Evolution Strategy*), ported from https://www.lri.fr/~hansen/barecmaes2.html.
+
+*CMAES* is a stochastic, derivative-free method for heuristic global optimization of non-linear or non-convex continuous optimization problems.
+Note that this method will on average take much more function evaluations to converge then a gradient based method.
+
+Arguments:
+
+If `state` is specified, then `config` is not used at all.
+Otherwise `state` is `config`.
+
+ * `opfunc`: a function that takes a single input `X`, the point of evaluation, and returns `f(X)` and `df/dX`. Note that `df/dX` is not used and can be left 0
+ * `x`: the initial point
+ * `state`: a table describing the state of the optimizer; after each call the state is modified
+ * `state.sigma`: float, initial step-size (standard deviation in each coordinate)
+ * `state.maxEval`: int, maximal number of function evaluations
+ * `state.ftarget`: float, target function value
+ * `state.popsize`: population size. If this is left empty, `4 + int(3 * log(|x|))` will be used
+ * `state.ftarget`: stop if `fitness < ftarget`
+ * `state.verb_disp`: display info on console every verb_disp iteration, 0 for never
+
+Returns:
+ * `x*`: the new `x` vector, at the optimal point
+ * `f`: a table of all function values:
+ * `f[1]` is the value of the function before any optimization and
+ * `f[#f]` is the final fully optimized value, at `x*`
diff --git a/doc/intro.md b/doc/intro.md
new file mode 100644
index 0000000..b387235
--- /dev/null
+++ b/doc/intro.md
@@ -0,0 +1,41 @@
+<a name='optim.overview'></a>
+# Overview
+
+Most optimization algorithms have the following interface:
+
+```lua
+x*, {f}, ... = optim.method(opfunc, x[, config][, state])
+```
+
+where:
+
+* `opfunc`: a user-defined closure that respects this API: `f, df/dx = func(x)`
+* `x`: the current parameter vector (a 1D `Tensor`)
+* `config`: a table of parameters, dependent upon the algorithm
+* `state`: a table of state variables, if `nil`, `config` will contain the state
+* `x*`: the new parameter vector that minimizes `f, x* = argmin_x f(x)`
+* `{f}`: a table of all `f` values, in the order they've been evaluated (for some simple algorithms, like SGD, `#f == 1`)
+
+
+<a name='optim.example'></a>
+## Example
+
+The state table is used to hold the state of the algorihtm.
+It's usually initialized once, by the user, and then passed to the optim function as a black box.
+Example:
+
+```lua
+config = {
+ learningRate = 1e-3,
+ momentum = 0.5
+}
+
+for i, sample in ipairs(training_samples) do
+ local func = function(x)
+ -- define eval function
+ return f, df_dx
+ end
+ optim.sgd(func, x, config)
+end
+```
+
diff --git a/doc/logger.md b/doc/logger.md
new file mode 100644
index 0000000..b7797d2
--- /dev/null
+++ b/doc/logger.md
@@ -0,0 +1,73 @@
+<a name='optim.logger'></a>
+# Logger
+
+`optim` provides also logging and live plotting capabilities via the `optim.Logger()` function.
+
+Live logging is essential to monitor the *network accuracy* and *cost function* during training and testing, for spotting *under-* and *over-fitting*, for *early stopping* or just for monitoring the health of the current optimisation task.
+
+
+## Logging data
+
+Let walk through an example to see how it works.
+
+We start with initialising our logger connected to a text file `accuracy.log`.
+
+```lua
+logger = optim.Logger('accuracy.log')
+```
+
+We can decide to log on it, for example, *training* and *testing accuracies*.
+
+```lua
+logger:setNames{'Training acc.', 'Test acc.'}
+```
+
+And now we can populate our logger randomly.
+
+```lua
+for i = 1, 10 do
+ trainAcc = math.random(0, 100)
+ testAcc = math.random(0, 100)
+ logger:add{trainAcc, testAcc}
+end
+```
+
+We can `cat` `accuracy.log` and see what's in it.
+
+```
+Training acc. Test acc.
+ 7.0000e+01 5.9000e+01
+ 7.6000e+01 8.0000e+00
+ 6.6000e+01 3.4000e+01
+ 7.4000e+01 4.3000e+01
+ 5.7000e+01 1.1000e+01
+ 5.0000e+00 9.8000e+01
+ 7.1000e+01 1.7000e+01
+ 9.8000e+01 2.7000e+01
+ 3.5000e+01 4.7000e+01
+ 6.8000e+01 5.8000e+01
+```
+
+## Visualising logs
+
+OK, cool, but how can we actually see what's going on?
+
+To have a better grasp of what's happening, we can plot our curves.
+We need first to specify the plotting style, choosing from:
+
+ * `.` for dots
+ * `+` for points
+ * `-` for lines
+ * `+-` for points and lines
+ * `~` for using smoothed lines with cubic interpolation
+ * `|` for using boxes
+ * custom string, one can also pass custom strings to use full capability of gnuplot.
+
+```lua
+logger:style{'+-', '+-'}
+logger:plot()
+```
+
+
+
+If we'd like an interactive visualisation, we can put the `logger:plot()` instruction within the `for` loop, and the chart will be updated at every iteration.
diff --git a/doc/logger_plot.png b/doc/logger_plot.png
new file mode 100644
index 0000000..c5e86ae
Binary files /dev/null and b/doc/logger_plot.png differ
diff --git a/fista.lua b/fista.lua
new file mode 100644
index 0000000..c8c6f5e
--- /dev/null
+++ b/fista.lua
@@ -0,0 +1,192 @@
+--[[ FISTA with backtracking line search
+
+- `f` : smooth function
+- `g` : non-smooth function
+- `pl` : minimizer of intermediate problem Q(x,y)
+- `xinit` : initial point
+- `params` : table of parameters (**optional**)
+- `params.L` : 1/(step size) for ISTA/FISTA iteration (0.1)
+- `params.Lstep` : step size multiplier at each iteration (1.5)
+- `params.maxiter` : max number of iterations (50)
+- `params.maxline` : max number of line search iterations per iteration (20)
+- `params.errthres`: Error thershold for convergence check (1e-4)
+- `params.doFistaUpdate` : true : use FISTA, false: use ISTA (true)
+- `params.verbose` : store each iteration solution and print detailed info (false)
+
+On output, `params` will contain these additional fields that can be reused.
+
+- `params.L` : last used L value will be written.
+
+These are temporary storages needed by the algo and if the same params object is
+passed a second time, these same storages will be used without new allocation.
+
+- `params.xkm` : previous iterarion point
+- `params.y` : fista iteration
+- `params.ply` : ply = pl(y - 1/L grad(f))
+
+Returns the solution x and history of {function evals, number of line search ,...}
+
+Algorithm is published in
+
+ @article{beck-fista-09,
+ Author = {Beck, Amir and Teboulle, Marc},
+ Journal = {SIAM J. Img. Sci.},
+ Number = {1},
+ Pages = {183--202},
+ Title = {A Fast Iterative Shrinkage-Thresholding Algorithm for Linear Inverse Problems},
+ Volume = {2},
+ Year = {2009}}
+]]
+function optim.FistaLS(f, g, pl, xinit, params)
+
+ local params = params or {}
+ local L = params.L or 0.1
+ local Lstep = params.Lstep or 1.5
+ local maxiter = params.maxiter or 50
+ local maxline = params.maxline or 20
+ local errthres = params.errthres or 1e-4
+ local doFistaUpdate = params.doFistaUpdate
+ local verbose = params.verbose
+
+ -- temporary allocations
+ params.xkm = params.xkm or torch.Tensor()
+ params.y = params.y or torch.Tensor()
+ params.ply = params.ply or torch.Tensor()
+ local xkm = params.xkm -- previous iteration
+ local y = params.y -- fista iteration
+ local ply = params.ply -- soft shrinked y
+
+ -- we start from all zeros
+ local xk = xinit
+ xkm:resizeAs(xk):zero()
+ ply:resizeAs(xk):zero()
+ y:resizeAs(xk):zero()
+
+ local history = {} -- keep track of stuff
+ local niter = 0 -- number of iterations done
+ local converged = false -- are we done?
+ local tk = 1 -- momentum param for FISTA
+ local tkp = 0
+
+
+ local gy = g(y)
+ local fval = math.huge -- fval = f+g
+ while not converged and niter < maxiter do
+
+ -- run through smooth function (code is input, input is target)
+ -- get derivatives from smooth function
+ local fy,gfy = f(y,'dx')
+ --local gfy = f(y)
+
+ local fply = 0
+ local gply = 0
+ local Q = 0
+
+ ----------------------------------------------
+ -- do line search to find new current location starting from fista loc
+ local nline = 0
+ local linesearchdone = false
+ while not linesearchdone do
+ -- take a step in gradient direction of smooth function
+ ply:copy(y)
+ ply:add(-1/L,gfy)
+
+ -- and solve for minimum of auxiliary problem
+ pl(ply,L)
+ -- this is candidate for new current iteration
+ xk:copy(ply)
+
+ -- evaluate this point F(ply)
+ fply = f(ply)
+
+ -- ply - y
+ ply:add(-1, y)
+ -- <ply-y , \Grad(f(y))>
+ local Q2 = gfy:dot(ply)
+ -- L/2 ||beta-y||^2
+ local Q3 = L/2 * ply:dot(ply)
+ -- Q(beta,y) = F(y) + <beta-y , \Grad(F(y))> + L/2||beta-y||^2 + G(beta)
+ Q = fy + Q2 + Q3
+
+ if verbose then
+ print(string.format('nline=%d L=%g fply=%g Q=%g fy=%g Q2=%g Q3=%g',nline,L,fply,Q,fy,Q2,Q3))
+ end
+ -- check if F(beta) < Q(pl(y),\t)
+ if fply <= Q then --and Fply + Gply <= F then
+ -- now evaluate G here
+ linesearchdone = true
+ elseif nline >= maxline then
+ linesearchdone = true
+ xk:copy(xkm) -- if we can't find a better point, current iter = previous iter
+ --print('oops')
+ else
+ L = L * Lstep
+ end
+ nline = nline + 1
+ end
+ -- end line search
+ ---------------------------------------------
+
+ ---------------------------------------------
+ -- FISTA
+ ---------------------------------------------
+ if doFistaUpdate then
+ -- do the FISTA step
+ tkp = (1 + math.sqrt(1 + 4*tk*tk)) / 2
+ -- x(k-1) = x(k-1) - x(k)
+ xkm:add(-1,xk)
+ -- y(k+1) = x(k) + (1-t(k)/t(k+1))*(x(k-1)-x(k))
+ y:copy(xk)
+ y:add( (1-tk)/tkp , xkm)
+ -- store for next iterations
+ -- x(k-1) = x(k)
+ xkm:copy(xk)
+ else
+ y:copy(xk)
+ end
+ -- t(k) = t(k+1)
+ tk = tkp
+ fply = f(y)
+ gply = g(y)
+ if verbose then
+ print(string.format('iter=%d eold=%g enew=%g',niter,fval,fply+gply))
+ end
+
+ niter = niter + 1
+
+ -- bookeeping
+ fval = fply + gply
+ history[niter] = {}
+ history[niter].nline = nline
+ history[niter].L = L
+ history[niter].F = fval
+ history[niter].Fply = fply
+ history[niter].Gply = gply
+ history[niter].Q = Q
+ params.L = L
+ if verbose then
+ history[niter].xk = xk:clone()
+ history[niter].y = y:clone()
+ end
+
+ -- are we done?
+ if niter > 1 and math.abs(history[niter].F - history[niter-1].F) <= errthres then
+ converged = true
+ xinit:copy(y)
+ return y,history
+ end
+
+ if niter >= maxiter then
+ xinit:copy(y)
+ return y,history
+ end
+
+ --if niter > 1 and history[niter].F > history[niter-1].F then
+ --print(niter, 'This was supposed to be a convex function, we are going up')
+ --converged = true
+ --return xk,history
+ --end
+ end
+ error('not supposed to be here')
+end
+
diff --git a/init.lua b/init.lua
new file mode 100644
index 0000000..a045bd8
--- /dev/null
+++ b/init.lua
@@ -0,0 +1,33 @@
+
+require 'torch'
+
+optim = {}
+
+-- optimizations
+require('optim.sgd')
+require('optim.cg')
+require('optim.asgd')
+require('optim.nag')
+require('optim.fista')
+require('optim.lbfgs')
+require('optim.adagrad')
+require('optim.rprop')
+require('optim.adam')
+require('optim.adamax')
+require('optim.rmsprop')
+require('optim.adadelta')
+require('optim.cmaes')
+require('optim.de')
+
+-- line search functions
+require('optim.lswolfe')
+
+-- helpers
+require('optim.polyinterp')
+require('optim.checkgrad')
+
+-- tools
+require('optim.ConfusionMatrix')
+require('optim.Logger')
+
+return optim
diff --git a/lbfgs.lua b/lbfgs.lua
new file mode 100644
index 0000000..d850fcb
--- /dev/null
+++ b/lbfgs.lua
@@ -0,0 +1,268 @@
+--[[ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt)
+
+This implementation of L-BFGS relies on a user-provided line
+search function (state.lineSearch). If this function is not
+provided, then a simple learningRate is used to produce fixed
+size steps. Fixed size steps are much less costly than line
+searches, and can be useful for stochastic problems.
+
+The learning rate is used even when a line search is provided.
+This is also useful for large-scale stochastic problems, where
+opfunc is a noisy approximation of f(x). In that case, the learning
+rate allows a reduction of confidence in the step size.
+
+ARGS:
+
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.maxIter` : Maximum number of iterations allowed
+- `state.maxEval` : Maximum number of function evaluations
+- `state.tolFun` : Termination tolerance on the first-order optimality
+- `state.tolX` : Termination tol on progress in terms of func/param changes
+- `state.lineSearch` : A line search function
+- `state.learningRate` : If no line search provided, then a fixed step size is used
+
+RETURN:
+- `x*` : the new `x` vector, at the optimal point
+- `f` : a table of all function values:
+ `f[1]` is the value of the function before any optimization and
+ `f[#f]` is the final fully optimized value, at `x*`
+
+(Clement Farabet, 2012)
+]]
+function optim.lbfgs(opfunc, x, config, state)
+ -- get/update state
+ local config = config or {}
+ local state = state or config
+ local maxIter = tonumber(config.maxIter) or 20
+ local maxEval = tonumber(config.maxEval) or maxIter*1.25
+ local tolFun = config.tolFun or 1e-5
+ local tolX = config.tolX or 1e-9
+ local nCorrection = config.nCorrection or 100
+ local lineSearch = config.lineSearch
+ local lineSearchOpts = config.lineSearchOptions
+ local learningRate = config.learningRate or 1
+ local isverbose = config.verbose or false
+
+ state.funcEval = state.funcEval or 0
+ state.nIter = state.nIter or 0
+
+ -- verbose function
+ local verbose
+ if isverbose then
+ verbose = function(...) print('<optim.lbfgs> ', ...) end
+ else
+ verbose = function() end
+ end
+
+ -- import some functions
+ local abs = math.abs
+ local min = math.min
+
+ -- evaluate initial f(x) and df/dx
+ local f,g = opfunc(x)
+ local f_hist = {f}
+ local currentFuncEval = 1
+ state.funcEval = state.funcEval + 1
+ local p = g:size(1)
+
+ -- check optimality of initial point
+ state.tmp1 = state.tmp1 or g.new(g:size()):zero(); local tmp1 = state.tmp1
+ tmp1:copy(g):abs()
+ if tmp1:sum() <= tolFun then
+ -- optimality condition below tolFun
+ verbose('optimality condition below tolFun')
+ return x,f_hist
+ end
+
+ if not state.dir_bufs then
+ -- reusable buffers for y's and s's, and their histories
+ verbose('creating recyclable direction/step/history buffers')
+ state.dir_bufs = state.dir_bufs or g.new(nCorrection+1, p):split(1)
+ state.stp_bufs = state.stp_bufs or g.new(nCorrection+1, p):split(1)
+ for i=1,#state.dir_bufs do
+ state.dir_bufs[i] = state.dir_bufs[i]:squeeze(1)
+ state.stp_bufs[i] = state.stp_bufs[i]:squeeze(1)
+ end
+ end
+
+ -- variables cached in state (for tracing)
+ local d = state.d
+ local t = state.t
+ local old_dirs = state.old_dirs
+ local old_stps = state.old_stps
+ local Hdiag = state.Hdiag
+ local g_old = state.g_old
+ local f_old = state.f_old
+
+ -- optimize for a max of maxIter iterations
+ local nIter = 0
+ while nIter < maxIter do
+ -- keep track of nb of iterations
+ nIter = nIter + 1
+ state.nIter = state.nIter + 1
+
+ ------------------------------------------------------------
+ -- compute gradient descent direction
+ ------------------------------------------------------------
+ if state.nIter == 1 then
+ d = g:clone():mul(-1) -- -g
+ old_dirs = {}
+ old_stps = {}
+ Hdiag = 1
+ else
+ -- do lbfgs update (update memory)
+ local y = table.remove(state.dir_bufs) -- pop
+ local s = table.remove(state.stp_bufs)
+ y:add(g, -1, g_old) -- g - g_old
+ s:mul(d, t) -- d*t
+ local ys = y:dot(s) -- y*s
+ if ys > 1e-10 then
+ -- updating memory
+ if #old_dirs == nCorrection then
+ -- shift history by one (limited-memory)
+ local removed1 = table.remove(old_dirs, 1)
+ local removed2 = table.remove(old_stps, 1)
+ table.insert(state.dir_bufs, removed1)
+ table.insert(state.stp_bufs, removed2)
+ end
+
+ -- store new direction/step
+ table.insert(old_dirs, s)
+ table.insert(old_stps, y)
+
+ -- update scale of initial Hessian approximation
+ Hdiag = ys / y:dot(y) -- (y*y)
+ else
+ -- put y and s back into the buffer pool
+ table.insert(state.dir_bufs, y)
+ table.insert(state.stp_bufs, s)
+ end
+
+ -- compute the approximate (L-BFGS) inverse Hessian
+ -- multiplied by the gradient
+ local k = #old_dirs
+
+ -- need to be accessed element-by-element, so don't re-type tensor:
+ state.ro = state.ro or torch.Tensor(nCorrection); local ro = state.ro
+ for i = 1,k do
+ ro[i] = 1 / old_stps[i]:dot(old_dirs[i])
+ end
+
+ -- iteration in L-BFGS loop collapsed to use just one buffer
+ local q = tmp1 -- reuse tmp1 for the q buffer
+ -- need to be accessed element-by-element, so don't re-type tensor:
+ state.al = state.al or torch.zeros(nCorrection) local al = state.al
+
+ q:mul(g, -1) -- -g
+ for i = k,1,-1 do
+ al[i] = old_dirs[i]:dot(q) * ro[i]
+ q:add(-al[i], old_stps[i])
+ end
+
+ -- multiply by initial Hessian
+ r = d -- share the same buffer, since we don't need the old d
+ r:mul(q, Hdiag) -- q[1] * Hdiag
+ for i = 1,k do
+ local be_i = old_stps[i]:dot(r) * ro[i]
+ r:add(al[i]-be_i, old_dirs[i])
+ end
+ -- final direction is in r/d (same object)
+ end
+ g_old = g_old or g:clone()
+ g_old:copy(g)
+ f_old = f
+
+ ------------------------------------------------------------
+ -- compute step length
+ ------------------------------------------------------------
+ -- directional derivative
+ local gtd = g:dot(d) -- g * d
+
+ -- check that progress can be made along that direction
+ if gtd > -tolX then
+ break
+ end
+
+ -- reset initial guess for step size
+ if state.nIter == 1 then
+ tmp1:copy(g):abs()
+ t = min(1,1/tmp1:sum()) * learningRate
+ else
+ t = learningRate
+ end
+
+ -- optional line search: user function
+ local lsFuncEval = 0
+ if lineSearch and type(lineSearch) == 'function' then
+ -- perform line search, using user function
+ f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
+ table.insert(f_hist, f)
+ else
+ -- no line search, simply move with fixed-step
+ x:add(t,d)
+ if nIter ~= maxIter then
+ -- re-evaluate function only if not in last iteration
+ -- the reason we do this: in a stochastic setting,
+ -- no use to re-evaluate that function here
+ f,g = opfunc(x)
+ lsFuncEval = 1
+ table.insert(f_hist, f)
+ end
+ end
+
+ -- update func eval
+ currentFuncEval = currentFuncEval + lsFuncEval
+ state.funcEval = state.funcEval + lsFuncEval
+
+ ------------------------------------------------------------
+ -- check conditions
+ ------------------------------------------------------------
+ if nIter == maxIter then
+ -- no use to run tests
+ verbose('reached max number of iterations')
+ break
+ end
+
+ if currentFuncEval >= maxEval then
+ -- max nb of function evals
+ verbose('max nb of function evals')
+ break
+ end
+
+ tmp1:copy(g):abs()
+ if tmp1:sum() <= tolFun then
+ -- check optimality
+ verbose('optimality condition below tolFun')
+ break
+ end
+
+ tmp1:copy(d):mul(t):abs()
+ if tmp1:sum() <= tolX then
+ -- step size below tolX
+ verbose('step size below tolX')
+ break
+ end
+
+ if abs(f-f_old) < tolX then
+ -- function value changing less than tolX
+ verbose('function value changing less than tolX')
+ break
+ end
+ end
+
+ -- save state
+ state.old_dirs = old_dirs
+ state.old_stps = old_stps
+ state.Hdiag = Hdiag
+ state.g_old = g_old
+ state.f_old = f_old
+ state.t = t
+ state.d = d
+
+ -- return optimal x, and history of f(x)
+ return x,f_hist,currentFuncEval
+end
diff --git a/lswolfe.lua b/lswolfe.lua
new file mode 100644
index 0000000..0afbdbe
--- /dev/null
+++ b/lswolfe.lua
@@ -0,0 +1,192 @@
+--[[ A Line Search satisfying the Wolfe conditions
+
+ARGS:
+- `opfunc` : a function (the objective) that takes a single input (X),
+ the point of evaluation, and returns f(X) and df/dX
+- `x` : initial point / starting location
+- `t` : initial step size
+- `d` : descent direction
+- `f` : initial function value
+- `g` : gradient at initial location
+- `gtd` : directional derivative at starting location
+- `options.c1` : sufficient decrease parameter
+- `options.c2` : curvature parameter
+- `options.tolX` : minimum allowable step length
+- `options.maxIter` : maximum nb of iterations
+
+RETURN:
+- `f` : function value at x+t*d
+- `g` : gradient value at x+t*d
+- `x` : the next x (=x+t*d)
+- `t` : the step length
+- `lsFuncEval` : the number of function evaluations
+]]
+function optim.lswolfe(opfunc,x,t,d,f,g,gtd,options)
+ -- options
+ options = options or {}
+ local c1 = options.c1 or 1e-4
+ local c2 = options.c2 or 0.9
+ local tolX = options.tolX or 1e-9
+ local maxIter = options.maxIter or 20
+ local isverbose = options.verbose or false
+
+ -- some shortcuts
+ local abs = torch.abs
+ local min = math.min
+ local max = math.max
+
+ -- verbose function
+ local function verbose(...)
+ if isverbose then print('<optim.lswolfe> ', ...) end
+ end
+
+ -- evaluate objective and gradient using initial step
+ local x_init = x:clone()
+ x:add(t,d)
+ local f_new,g_new = opfunc(x)
+ local lsFuncEval = 1
+ local gtd_new = g_new * d
+
+ -- bracket an interval containing a point satisfying the Wolfe
+ -- criteria
+ local LSiter,t_prev,done = 0,0,false
+ local f_prev,g_prev,gtd_prev = f,g:clone(),gtd
+ local bracket,bracketFval,bracketGval
+ while LSiter < maxIter do
+ -- check conditions:
+ if (f_new > (f + c1*t*gtd)) or (LSiter > 1 and f_new >= f_prev) then
+ bracket = x.new{t_prev,t}
+ bracketFval = x.new{f_prev,f_new}
+ bracketGval = x.new(2,g_new:size(1))
+ bracketGval[1] = g_prev
+ bracketGval[2] = g_new
+ break
+
+ elseif abs(gtd_new) <= -c2*gtd then
+ bracket = x.new{t}
+ bracketFval = x.new{f_new}
+ bracketGval = x.new(1,g_new:size(1))
+ bracketGval[1] = g_new
+ done = true
+ break
+
+ elseif gtd_new >= 0 then
+ bracket = x.new{t_prev,t}
+ bracketFval = x.new{f_prev,f_new}
+ bracketGval = x.new(2,g_new:size(1))
+ bracketGval[1] = g_prev
+ bracketGval[2] = g_new
+ break
+
+ end
+
+ -- interpolate:
+ local tmp = t_prev
+ t_prev = t
+ local minStep = t + 0.01*(t-tmp)
+ local maxStep = t*10
+ t = optim.polyinterp(x.new{{tmp,f_prev,gtd_prev},
+ {t,f_new,gtd_new}},
+ minStep, maxStep)
+
+ -- next step:
+ f_prev = f_new
+ g_prev = g_new:clone()
+ gtd_prev = gtd_new
+ x[{}] = x_init
+ x:add(t,d)
+ f_new,g_new = opfunc(x)
+ lsFuncEval = lsFuncEval + 1
+ gtd_new = g_new * d
+ LSiter = LSiter + 1
+ end
+
+ -- reached max nb of iterations?
+ if LSiter == maxIter then
+ bracket = x.new{0,t}
+ bracketFval = x.new{f,f_new}
+ bracketGval = x.new(2,g_new:size(1))
+ bracketGval[1] = g
+ bracketGval[2] = g_new
+ end
+
+ -- zoom phase: we now have a point satisfying the criteria, or
+ -- a bracket around it. We refine the bracket until we find the
+ -- exact point satisfying the criteria
+ local insufProgress = false
+ local LOposRemoved = 0
+ while not done and LSiter < maxIter do
+ -- find high and low points in bracket
+ local f_LO,LOpos = bracketFval:min(1)
+ LOpos = LOpos[1] f_LO = f_LO[1]
+ local HIpos = -LOpos+3
+
+ -- compute new trial value
+ t = optim.polyinterp(x.new{{bracket[1],bracketFval[1],bracketGval[1]*d},
+ {bracket[2],bracketFval[2],bracketGval[2]*d}})
+
+ -- test what we are making sufficient progress
+ if min(bracket:max()-t,t-bracket:min())/(bracket:max()-bracket:min()) < 0.1 then
+ if insufProgress or t>=bracket:max() or t <= bracket:min() then
+ if abs(t-bracket:max()) < abs(t-bracket:min()) then
+ t = bracket:max()-0.1*(bracket:max()-bracket:min())
+ else
+ t = bracket:min()+0.1*(bracket:max()-bracket:min())
+ end
+ insufProgress = false
+ else
+ insufProgress = true
+ end
+ else
+ insufProgress = false
+ end
+
+ -- Evaluate new point
+ x[{}] = x_init
+ x:add(t,d)
+ f_new,g_new = opfunc(x)
+ lsFuncEval = lsFuncEval + 1
+ gtd_new = g_new * d
+ LSiter = LSiter + 1
+ if f_new > f + c1*t*gtd or f_new >= f_LO then
+ -- Armijo condition not satisfied or not lower than lowest point
+ bracket[HIpos] = t
+ bracketFval[HIpos] = f_new
+ bracketGval[HIpos] = g_new
+ else
+ if abs(gtd_new) <= - c2*gtd then
+ -- Wolfe conditions satisfied
+ done = true
+ elseif gtd_new*(bracket[HIpos]-bracket[LOpos]) >= 0 then
+ -- Old HI becomes new LO
+ bracket[HIpos] = bracket[LOpos]
+ bracketFval[HIpos] = bracketFval[LOpos]
+ bracketGval[HIpos] = bracketGval[LOpos]
+ end
+ -- New point becomes new LO
+ bracket[LOpos] = t
+ bracketFval[LOpos] = f_new
+ bracketGval[LOpos] = g_new
+ end
+
+ -- done?
+ if not done and abs((bracket[1]-bracket[2])*gtd_new) < tolX then
+ break
+ end
+ end
+
+ -- be verbose
+ if LSiter == maxIter then
+ verbose('reached max number of iterations')
+ end
+
+ -- return stuff
+ local _,LOpos = bracketFval:min(1)
+ LOpos = LOpos[1]
+ t = bracket[LOpos]
+ f_new = bracketFval[LOpos]
+ g_new = bracketGval[LOpos]
+ x[{}] = x_init
+ x:add(t,d)
+ return f_new,g_new,x,t,lsFuncEval
+end
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..9624b2a
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,8 @@
+site_name: optim
+theme : simplex
+repo_url : https://github.com/torch/optim
+use_directory_urls : false
+markdown_extensions: [extra]
+docs_dir : doc
+pages:
+- [index.md, Optim]
diff --git a/nag.lua b/nag.lua
new file mode 100644
index 0000000..875d81e
--- /dev/null
+++ b/nag.lua
@@ -0,0 +1,86 @@
+----------------------------------------------------------------------
+-- An implementation of SGD adapted with features of Nesterov's
+-- Accelerated Gradient method, based on the paper
+-- On the Importance of Initialization and Momentum in Deep Learning
+-- Sutsveker et. al., ICML 2013
+--
+-- ARGS:
+-- opfunc : a function that takes a single input (X), the point of
+-- evaluation, and returns f(X) and df/dX
+-- x : the initial point
+-- state : a table describing the state of the optimizer; after each
+-- call the state is modified
+-- state.learningRate : learning rate
+-- state.learningRateDecay : learning rate decay
+-- state.weightDecay : weight decay
+-- state.momentum : momentum
+-- state.learningRates : vector of individual learning rates
+--
+-- RETURN:
+-- x : the new x vector
+-- f(x) : the function, evaluated before the update
+--
+-- (Dilip Krishnan, 2013)
+--
+
+function optim.nag(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 1e-3
+ local lrd = config.learningRateDecay or 0
+ local wd = config.weightDecay or 0
+ local mom = config.momentum or 0.9
+ local damp = config.dampening or mom
+ local lrs = config.learningRates
+ state.evalCounter = state.evalCounter or 0
+ local nevals = state.evalCounter
+
+ if mom <= 0 then
+ error('Momentum must be positive for Nesterov Accelerated Gradient')
+ end
+
+ -- (1) evaluate f(x) and df/dx
+ -- first step in the direction of the momentum vector
+
+ if state.dfdx then
+ x:add(mom, state.dfdx)
+ end
+ -- then compute gradient at that point
+ -- comment out the above line to get the original SGD
+ local fx,dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- (3) learning rate decay (annealing)
+ local clr = lr / (1 + nevals*lrd)
+
+ -- (4) apply momentum
+ if not state.dfdx then
+ state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):fill(0)
+ else
+ state.dfdx:mul(mom)
+ end
+
+ -- (5) parameter update with single or individual learning rates
+ if lrs then
+ if not state.deltaParameters then
+ state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
+ end
+ state.deltaParameters:copy(lrs):cmul(dfdx)
+ x:add(-clr, state.deltaParameters)
+ state.dfdx:add(-clr, state.deltaParameters)
+ else
+ x:add(-clr, dfdx)
+ state.dfdx:add(-clr, dfdx)
+ end
+
+ -- (6) update evaluation counter
+ state.evalCounter = state.evalCounter + 1
+
+ -- return x, f(x) before optimization
+ return x,{fx}
+end
diff --git a/optim-1.0.3-0.rockspec b/optim-1.0.3-0.rockspec
new file mode 100644
index 0000000..f9c45b6
--- /dev/null
+++ b/optim-1.0.3-0.rockspec
@@ -0,0 +1,29 @@
+package = "optim"
+version = "1.0.3-0"
+
+source = {
+ url = "git://github.com/torch/optim",
+ tag = "1.0.3-0"
+}
+
+description = {
+ summary = "An optimization library for Torch.",
+ detailed = [[
+This package contains several optimization routines for Torch.
+ ]],
+ homepage = "https://github.com/torch/optim",
+ license = "BSD"
+}
+
+dependencies = {
+ "torch >= 7.0",
+ "sys >= 1.0",
+}
+
+build = {
+ type = "command",
+ build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
+ ]],
+ install_command = "cd build && $(MAKE) install"
+}
diff --git a/optim-1.0.3-1.rockspec b/optim-1.0.3-1.rockspec
new file mode 100644
index 0000000..8bcb10e
--- /dev/null
+++ b/optim-1.0.3-1.rockspec
@@ -0,0 +1,29 @@
+package = "optim"
+version = "1.0.3-1"
+
+source = {
+ url = "git://github.com/torch/optim",
+ tag = "1.0.3-1"
+}
+
+description = {
+ summary = "An optimization library for Torch.",
+ detailed = [[
+This package contains several optimization routines for Torch.
+ ]],
+ homepage = "https://github.com/torch/optim",
+ license = "BSD"
+}
+
+dependencies = {
+ "torch >= 7.0",
+ "sys >= 1.0",
+}
+
+build = {
+ type = "command",
+ build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
+ ]],
+ install_command = "cd build && $(MAKE) install"
+}
diff --git a/optim-1.0.4-0.rockspec b/optim-1.0.4-0.rockspec
new file mode 100644
index 0000000..d0328fb
--- /dev/null
+++ b/optim-1.0.4-0.rockspec
@@ -0,0 +1,28 @@
+package = "optim"
+version = "1.0.4-0"
+
+source = {
+ url = "git://github.com/torch/optim",
+ tag = "1.0.4-0"
+}
+
+description = {
+ summary = "An optimization library for Torch.",
+ detailed = [[
+This package contains several optimization routines for Torch.
+ ]],
+ homepage = "https://github.com/torch/optim",
+ license = "BSD"
+}
+
+dependencies = {
+ "torch >= 7.0",
+}
+
+build = {
+ type = "command",
+ build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
+ ]],
+ install_command = "cd build && $(MAKE) install"
+}
diff --git a/optim-1.0.5-0.rockspec b/optim-1.0.5-0.rockspec
new file mode 100644
index 0000000..325e67d
--- /dev/null
+++ b/optim-1.0.5-0.rockspec
@@ -0,0 +1,27 @@
+package = "optim"
+version = "1.0.5-0"
+
+source = {
+ url = "git://github.com/torch/optim",
+}
+
+description = {
+ summary = "An optimization library for Torch.",
+ detailed = [[
+This package contains several optimization routines for Torch.
+ ]],
+ homepage = "https://github.com/torch/optim",
+ license = "BSD"
+}
+
+dependencies = {
+ "torch >= 7.0",
+}
+
+build = {
+ type = "command",
+ build_command = [[
+cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
+ ]],
+ install_command = "cd build && $(MAKE) install"
+}
diff --git a/polyinterp.lua b/polyinterp.lua
new file mode 100644
index 0000000..5975981
--- /dev/null
+++ b/polyinterp.lua
@@ -0,0 +1,234 @@
+local function isreal(x)
+ return x == x
+end
+
+local function isnan(x)
+ return not x == x
+end
+
+local function roots(c)
+ local tol=1e-12
+ c[torch.lt(torch.abs(c),tol)]=0
+
+ local nonzero = torch.ne(c,0)
+ if nonzero:max() == 0 then
+ return 0
+ end
+
+ -- first non-zero
+ local _,pos = torch.max(nonzero,1)
+ pos = pos[1]
+ c=c[{ {pos,-1} }]
+
+ local nz = 0
+ for i=c:size(1),1,-1 do
+ if c[i] ~= 0 then
+ break
+ else
+ nz = nz + 1
+ end
+ end
+ c=c[{ {1,c:size(1)-nz} }]
+
+ local n = c:size(1)-1
+ if n == 1 then
+ local e = torch.Tensor({{-c[2]/c[1], 0}})
+ if nz > 0 then
+ return torch.cat(e, torch.zeros(nz, 2), 1)
+ else
+ return e
+ end
+ elseif n > 1 then
+ local A = torch.diag(torch.ones(n-1),-1)
+ A[1] = -c[{ {2,n+1} }]/c[1];
+ local e = torch.eig(A,'N')
+ if nz > 0 then
+ return torch.cat(e, torch.zeros(nz,2), 1)
+ else
+ return e
+ end
+ else
+ return torch.zeros(nz,2)
+ end
+end
+
+local function real(x)
+ if type(x) == number then return x end
+ return x[{ {} , 1}]
+end
+
+local function imag(x)
+ if type(x) == 'number' then return 0 end
+ if x:nDimension() == 1 then
+ return torch.zeros(x:size(1))
+ else
+ return x[{ {}, 2}]
+ end
+end
+
+local function polyval(p,x)
+ local pwr = p:size(1)
+ if type(x) == 'number' then
+ local val = 0
+ p:apply(function(pc) pwr = pwr-1; val = val + pc*x^pwr; return pc end)
+ return val
+ else
+ local val = x.new(x:size(1))
+ p:apply(function(pc) pwr = pwr-1; val:add(pc,torch.pow(x,pwr)); return pc end)
+ return val
+ end
+end
+
+----------------------------------------------------------------------
+-- Minimum of interpolating polynomial based on function and
+-- derivative values
+--
+-- ARGS:
+-- points : N triplets (x,f,g), must be a Tensor
+-- xmin : min value that brackets minimum (default: min of points)
+-- xmax : max value that brackets maximum (default: max of points)
+--
+-- RETURN:
+-- minPos : position of minimum
+--
+function optim.polyinterp(points,xminBound,xmaxBound)
+ -- locals
+ local sqrt = torch.sqrt
+ local mean = torch.mean
+ local Tensor = torch.Tensor
+ local zeros = torch.zeros
+ local max = math.max
+ local min = math.min
+
+ -- nb of points / order of polynomial
+ local nPoints = points:size(1)
+ local order = nPoints*2-1
+
+ -- returned values
+ local minPos
+
+ -- Code for most common case:
+ -- + cubic interpolation of 2 points w/ function and derivative values for both
+ -- + no xminBound/xmaxBound
+ if nPoints == 2 and order == 3 and not xminBound and not xmaxBound then
+ -- Solution in this case (where x2 is the farthest point):
+ -- d1 = g1 + g2 - 3*(f1-f2)/(x1-x2);
+ -- d2 = sqrt(d1^2 - g1*g2);
+ -- minPos = x2 - (x2 - x1)*((g2 + d2 - d1)/(g2 - g1 + 2*d2));
+ -- t_new = min(max(minPos,x1),x2);
+ local minVal,minPos = points[{ {},1 }]:min(1)
+ minVal = minVal[1] minPos = minPos[1]
+ local notMinPos = -minPos+3;
+
+ local d1 = points[{minPos,3}] + points[{notMinPos,3}]
+ - 3*(points[{minPos,2}]-points[{notMinPos,2}])
+ / (points[{minPos,1}]-points[{notMinPos,1}]);
+ local d2 = sqrt(d1^2 - points[{minPos,3}]*points[{notMinPos,3}]);
+
+ if isreal(d2) then -- isreal()
+ local t = points[{notMinPos,1}] - (points[{notMinPos,1}]
+ - points[{minPos,1}]) * ((points[{notMinPos,3}] + d2 - d1)
+ / (points[{notMinPos,3}] - points[{minPos,3}] + 2*d2))
+
+ minPos = min(max(t,points[{minPos,1}]),points[{notMinPos,1}])
+ else
+ minPos = mean(points[{{},1}])
+ end
+ return minPos
+ end
+
+ -- TODO: get the code below to work!
+ --error('<optim.polyinterp> extrapolation not implemented yet...')
+
+ -- Compute Bounds of Interpolation Area
+ local xmin = points[{{},1}]:min()
+ local xmax = points[{{},1}]:max()
+ xminBound = xminBound or xmin
+ xmaxBound = xmaxBound or xmax
+
+ -- Add constraints on function values
+ local A = zeros(nPoints*2,order+1)
+ local b = zeros(nPoints*2,1)
+ for i = 1,nPoints do
+ local constraint = zeros(order+1)
+ for j = order,0,-1 do
+ constraint[order-j+1] = points[{i,1}]^j
+ end
+ A[i] = constraint
+ b[i] = points[{i,2}]
+ end
+
+ -- Add constraints based on derivatives
+ for i = 1,nPoints do
+ local constraint = zeros(order+1)
+ for j = 1,order do
+ constraint[j] = (order-j+1)*points[{i,1}]^(order-j)
+ end
+ A[nPoints+i] = constraint
+ b[nPoints+i] = points[{i,3}]
+ end
+
+ -- Find interpolating polynomial
+ local res = torch.gels(b,A)
+ local params = res[{ {1,nPoints*2} }]:squeeze()
+
+ --print(A)
+ --print(b)
+ --print(params)
+ params[torch.le(torch.abs(params),1e-12)]=0
+
+ -- Compute Critical Points
+ local dParams = zeros(order);
+ for i = 1,params:size(1)-1 do
+ dParams[i] = params[i]*(order-i+1)
+ end
+
+ -- nan/inf?
+ local nans = false
+ if torch.ne(dParams,dParams):max() > 0 or torch.eq(dParams,math.huge):max() > 0 then
+ nans = true
+ end
+ -- for i = 1,dParams:size(1) do
+ -- if dParams[i] ~= dParams[i] or dParams[i] == math.huge then
+ -- nans = true
+ -- break
+ -- end
+ -- end
+ local cp = torch.cat(Tensor{xminBound,xmaxBound},points[{{},1}])
+ if not nans then
+ local cproots = roots(dParams)
+ local cpi = zeros(cp:size(1),2)
+ cpi[{ {1,cp:size(1)} , 1 }] = cp
+ cp = torch.cat(cpi,cproots,1)
+ end
+
+ --print(dParams)
+ --print(cp)
+
+ -- Test Critical Points
+ local fmin = math.huge
+ -- Default to Bisection if no critical points valid:
+ minPos = (xminBound+xmaxBound)/2
+ --print(minPos,fmin)
+ --print(xminBound,xmaxBound)
+ for i = 1,cp:size(1) do
+ local xCP = cp[{ {i,i} , {} }]
+ --print('xcp=')
+ --print(xCP)
+ local ixCP = imag(xCP)[1]
+ local rxCP = real(xCP)[1]
+ if ixCP == 0 and rxCP >= xminBound and rxCP <= xmaxBound then
+ local fCP = polyval(params,rxCP)
+ --print('fcp=')
+ --print(fCP)
+ --print(fCP < fmin)
+ if fCP < fmin then
+ minPos = rxCP
+ fmin = fCP
+ --print('u',minPos,fmin)
+ end
+ --print('v',minPos,fmin)
+ end
+ end
+ return minPos,fmin
+end
diff --git a/rmsprop.lua b/rmsprop.lua
new file mode 100644
index 0000000..1eb526d
--- /dev/null
+++ b/rmsprop.lua
@@ -0,0 +1,57 @@
+--[[ An implementation of RMSprop
+
+ARGS:
+
+- 'opfunc' : a function that takes a single input (X), the point
+ of a evaluation, and returns f(X) and df/dX
+- 'x' : the initial point
+- 'config` : a table with configuration parameters for the optimizer
+- 'config.learningRate' : learning rate
+- 'config.alpha' : smoothing constant
+- 'config.epsilon' : value with which to initialise m
+- 'config.weightDecay' : weight decay
+- 'state' : a table describing the state of the optimizer;
+ after each call the state is modified
+- 'state.m' : leaky sum of squares of parameter gradients,
+- 'state.tmp' : and the square root (with epsilon smoothing)
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+]]
+
+function optim.rmsprop(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 1e-2
+ local alpha = config.alpha or 0.99
+ local epsilon = config.epsilon or 1e-8
+ local wd = config.weightDecay or 0
+
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
+
+ -- (3) initialize mean square values and square gradient storage
+ if not state.m then
+ state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(1)
+ state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
+ end
+
+ -- (4) calculate new (leaky) mean squared values
+ state.m:mul(alpha)
+ state.m:addcmul(1.0-alpha, dfdx, dfdx)
+
+ -- (5) perform update
+ state.tmp:sqrt(state.m):add(epsilon)
+ x:addcdiv(-lr, dfdx, state.tmp)
+
+ -- return x*, f(x) before optimization
+ return x, {fx}
+end
diff --git a/rprop.lua b/rprop.lua
new file mode 100644
index 0000000..d7af164
--- /dev/null
+++ b/rprop.lua
@@ -0,0 +1,103 @@
+--[[ A plain implementation of RPROP
+
+ARGS:
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.stepsize` : initial step size, common to all components
+- `state.etaplus` : multiplicative increase factor, > 1 (default 1.2)
+- `state.etaminus` : multiplicative decrease factor, < 1 (default 0.5)
+- `state.stepsizemax` : maximum stepsize allowed (default 50)
+- `state.stepsizemin` : minimum stepsize allowed (default 1e-6)
+- `state.niter` : number of iterations (default 1)
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+(Martin Riedmiller, Koray Kavukcuoglu 2013)
+--]]
+function optim.rprop(opfunc, x, config, state)
+ if config == nil and state == nil then
+ print('no state table RPROP initializing')
+ end
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local stepsize = config.stepsize or 0.1
+ local etaplus = config.etaplus or 1.2
+ local etaminus = config.etaminus or 0.5
+ local stepsizemax = config.stepsizemax or 50.0
+ local stepsizemin = config.stepsizemin or 1E-06
+ local niter = config.niter or 1
+
+ local hfx = {}
+
+ for i=1,niter do
+
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- init temp storage
+ if not state.delta then
+ state.delta = dfdx.new(dfdx:size()):zero()
+ state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
+ state.sign = dfdx.new(dfdx:size())
+ state.psign = torch.ByteTensor(dfdx:size())
+ state.nsign = torch.ByteTensor(dfdx:size())
+ state.zsign = torch.ByteTensor(dfdx:size())
+ state.dminmax = torch.ByteTensor(dfdx:size())
+ if torch.type(x)=='torch.CudaTensor' then
+ -- Push to GPU
+ state.psign = state.psign:cuda()
+ state.nsign = state.nsign:cuda()
+ state.zsign = state.zsign:cuda()
+ state.dminmax = state.dminmax:cuda()
+ end
+ end
+
+ -- sign of derivative from last step to this one
+ torch.cmul(state.sign, dfdx, state.delta)
+ torch.sign(state.sign, state.sign)
+
+ -- get indices of >0, <0 and ==0 entries
+ state.sign.gt(state.psign, state.sign, 0)
+ state.sign.lt(state.nsign, state.sign, 0)
+ state.sign.eq(state.zsign, state.sign, 0)
+
+ -- get step size updates
+ state.sign[state.psign] = etaplus
+ state.sign[state.nsign] = etaminus
+ state.sign[state.zsign] = 1
+
+ -- update stepsizes with step size updates
+ state.stepsize:cmul(state.sign)
+
+ -- threshold step sizes
+ -- >50 => 50
+ state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
+ state.stepsize[state.dminmax] = stepsizemax
+ -- <1e-6 ==> 1e-6
+ state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
+ state.stepsize[state.dminmax] = stepsizemin
+
+ -- for dir<0, dfdx=0
+ -- for dir>=0 dfdx=dfdx
+ dfdx[state.nsign] = 0
+ -- state.sign = sign(dfdx)
+ torch.sign(state.sign,dfdx)
+
+ -- update weights
+ x:addcmul(-1,state.sign,state.stepsize)
+
+ -- update state.dfdx with current dfdx
+ state.delta:copy(dfdx)
+
+ table.insert(hfx,fx)
+ end
+
+ -- return x*, f(x) before optimization
+ return x,hfx
+end
diff --git a/sgd.lua b/sgd.lua
new file mode 100644
index 0000000..e21c696
--- /dev/null
+++ b/sgd.lua
@@ -0,0 +1,90 @@
+--[[ A plain implementation of SGD
+
+ARGS:
+
+- `opfunc` : a function that takes a single input (X), the point
+ of a evaluation, and returns f(X) and df/dX
+- `x` : the initial point
+- `config` : a table with configuration parameters for the optimizer
+- `config.learningRate` : learning rate
+- `config.learningRateDecay` : learning rate decay
+- `config.weightDecay` : weight decay
+- `config.weightDecays` : vector of individual weight decays
+- `config.momentum` : momentum
+- `config.dampening` : dampening for momentum
+- `config.nesterov` : enables Nesterov momentum
+- `config.learningRates` : vector of individual learning rates
+- `state` : a table describing the state of the optimizer; after each
+ call the state is modified
+- `state.evalCounter` : evaluation counter (optional: 0, by default)
+
+RETURN:
+- `x` : the new x vector
+- `f(x)` : the function, evaluated before the update
+
+(Clement Farabet, 2012)
+]]
+function optim.sgd(opfunc, x, config, state)
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 1e-3
+ local lrd = config.learningRateDecay or 0
+ local wd = config.weightDecay or 0
+ local mom = config.momentum or 0
+ local damp = config.dampening or mom
+ local nesterov = config.nesterov or false
+ local lrs = config.learningRates
+ local wds = config.weightDecays
+ state.evalCounter = state.evalCounter or 0
+ local nevals = state.evalCounter
+ assert(not nesterov or (mom > 0 and damp == 0), "Nesterov momentum requires a momentum and zero dampening")
+
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- (2) weight decay with single or individual parameters
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ elseif wds then
+ if not state.decayParameters then
+ state.decayParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
+ end
+ state.decayParameters:copy(wds):cmul(x)
+ dfdx:add(state.decayParameters)
+ end
+
+ -- (3) apply momentum
+ if mom ~= 0 then
+ if not state.dfdx then
+ state.dfdx = torch.Tensor():typeAs(dfdx):resizeAs(dfdx):copy(dfdx)
+ else
+ state.dfdx:mul(mom):add(1-damp, dfdx)
+ end
+ if nesterov then
+ dfdx:add(mom, state.dfdx)
+ else
+ dfdx = state.dfdx
+ end
+ end
+
+ -- (4) learning rate decay (annealing)
+ local clr = lr / (1 + nevals*lrd)
+
+ -- (5) parameter update with single or individual learning rates
+ if lrs then
+ if not state.deltaParameters then
+ state.deltaParameters = torch.Tensor():typeAs(x):resizeAs(dfdx)
+ end
+ state.deltaParameters:copy(lrs):cmul(dfdx)
+ x:add(-clr, state.deltaParameters)
+ else
+ x:add(-clr, dfdx)
+ end
+
+ -- (6) update evaluation counter
+ state.evalCounter = state.evalCounter + 1
+
+ -- return x*, f(x) before optimization
+ return x,{fx}
+end
diff --git a/test/l2.lua b/test/l2.lua
new file mode 100644
index 0000000..7866464
--- /dev/null
+++ b/test/l2.lua
@@ -0,0 +1,22 @@
+require 'torch'
+-- rosenbrock.m This function returns the function value, partial derivatives
+-- and Hessian of the (general dimension) rosenbrock function, given by:
+--
+-- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
+--
+-- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1).
+--
+-- Carl Edward Rasmussen, 2001-07-21.
+
+function l2(x)
+
+ local xx = x:clone()
+ xx:cmul(xx)
+ local fout = xx:sum()
+
+ local dx = torch.Tensor():resizeAs(x):copy(x)
+ dx:mul(2)
+ --print('l2 eval = ', fout)
+ return fout,dx
+
+end
\ No newline at end of file
diff --git a/test/rosenbrock.lua b/test/rosenbrock.lua
new file mode 100644
index 0000000..06f1676
--- /dev/null
+++ b/test/rosenbrock.lua
@@ -0,0 +1,50 @@
+require 'torch'
+-- rosenbrock.m This function returns the function value, partial derivatives
+-- and Hessian of the (general dimension) rosenbrock function, given by:
+--
+-- f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
+--
+-- where D is the dimension of x. The true minimum is 0 at x = (1 1 ... 1).
+--
+-- Carl Edward Rasmussen, 2001-07-21.
+
+function rosenbrock(x)
+
+ -- (1) compute f(x)
+ local d = x:size(1)
+ -- x1 = x(i)^2
+ local x1 = x.new(d-1):copy(x:narrow(1,1,d-1))
+ -- x(i+1) - x(i)^2
+ x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1))
+
+ -- 100*(x(i+1) - x(i)^2)^2
+ x1:cmul(x1):mul(100)
+
+ -- x(i)
+ local x0 = x.new(d-1):copy(x:narrow(1,1,d-1))
+ -- 1-x(i)
+ x0:mul(-1):add(1)
+ -- (1-x(i))^2
+ x0:cmul(x0)
+ -- 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2
+ x1:add(x0)
+ local fout = x1:sum()
+
+ -- (2) compute f(x)/dx
+ local dxout = x.new():resizeAs(x):zero()
+ -- df(1:D-1) = - 400*x(1:D-1).*(x(2:D)-x(1:D-1).^2) - 2*(1-x(1:D-1));
+
+ x1:copy(x:narrow(1,1,d-1))
+ x1:cmul(x1):mul(-1):add(x:narrow(1,2,d-1)):cmul(x:narrow(1,1,d-1)):mul(-400)
+ x0:copy(x:narrow(1,1,d-1)):mul(-1):add(1):mul(-2)
+ x1:add(x0)
+ dxout:narrow(1,1,d-1):copy(x1)
+
+ -- df(2:D) = df(2:D) + 200*(x(2:D)-x(1:D-1).^2);
+ x0:copy(x:narrow(1,1,d-1))
+ x0:cmul(x0):mul(-1):add(x:narrow(1,2,d-1)):mul(200)
+ dxout:narrow(1,2,d-1):add(x0)
+
+ return fout,dxout
+
+end
diff --git a/test/sparsecoding.lua b/test/sparsecoding.lua
new file mode 100644
index 0000000..c1d443e
--- /dev/null
+++ b/test/sparsecoding.lua
@@ -0,0 +1,127 @@
+require 'kex'
+
+-- L1 FISTA Solution
+-- L1 solution with a linear dictionary ||Ax-b||^2 + \lambda ||x||_1
+-- D : dictionary, each column is a dictionary element
+-- params: set of params to pass to FISTA and possibly temp allocation (**optional**)
+-- check unsup.FistaLS function for details.
+-- returns fista : a table with the following entries
+-- fista.run(x,lambda) : run L1 sparse coding algorithm with input x and lambda.
+-- The following entries will be allocated and reused by each call to fista.run(x,lambda)
+-- fista.reconstruction: reconstructed input.
+-- fista.gradf : gradient of L2 part of the problem wrt x
+-- fista.code : the solution of L1 problem
+-- The following entries just point to data passed to fista.run(x)
+-- fista.input : points to the tensor 'x' used in the last fista.run(x,lambda)
+-- fista.lambda : the lambda value used in the last fista.run(x,lambda)
+function optim.FistaL1(D, params)
+
+ -- this is for keeping parameters related to fista algorithm
+ local params = params or {}
+ -- this is for temporary variables and such
+ local fista = {}
+
+ -- related to FISTA
+ params.L = params.L or 0.1
+ params.Lstep = params.Lstep or 1.5
+ params.maxiter = params.maxiter or 50
+ params.maxline = params.maxline or 20
+ params.errthres = params.errthres or 1e-4
+
+ -- temporary stuff that might be good to keep around
+ fista.reconstruction = torch.Tensor()
+ fista.gradf = torch.Tensor()
+ fista.gradg = torch.Tensor()
+ fista.code = torch.Tensor()
+
+ -- these will be assigned in run(x)
+ -- fista.input points to the last input that was run
+ -- fista.lambda is the lambda value from the last run
+ fista.input = nil
+ fista.lambda = nil
+
+ -- CREATE FUNCTION CLOSURES
+ -- smooth function
+ fista.f = function (x,mode)
+
+ local reconstruction = fista.reconstruction
+ local input = fista.input
+ -- -------------------
+ -- function evaluation
+ if x:dim() == 1 then
+ --print(D:size(),x:size())
+ reconstruction:resize(D:size(1))
+ reconstruction:addmv(0,1,D,x)
+ elseif x:dim(2) then
+ reconstruction:resize(x:size(1),D:size(1))
+ reconstruction:addmm(0,1,x,D:t())
+ end
+ local fval = input:dist(reconstruction)^2
+
+ -- ----------------------
+ -- derivative calculation
+ if mode and mode:match('dx') then
+ local gradf = fista.gradf
+ reconstruction:add(-1,input):mul(2)
+ gradf:resizeAs(x)
+ if input:dim() == 1 then
+ gradf:addmv(0,1,D:t(),reconstruction)
+ else
+ gradf:addmm(0,1,reconstruction, D)
+ end
+ ---------------------------------------
+ -- return function value and derivative
+ return fval, gradf, reconstruction
+ end
+
+ ------------------------
+ -- return function value
+ return fval, reconstruction
+ end
+
+ -- non-smooth function L1
+ fista.g = function (x)
+
+ local fval = fista.lambda*x:norm(1)
+
+ if mod and mode:match('dx') then
+ local gradg = fista.gradg
+ gradg:resizAs(x)
+ gradg:sign():mul(fista.lambda)
+ return fval,gradg
+ end
+ return fval
+ end
+
+ -- argmin_x Q(x,y), just shrinkage for L1
+ fista.pl = function (x,L)
+ x:shrinkage(fista.lambda/L)
+ end
+
+ fista.run = function(x, lam, codeinit)
+ local code = fista.code
+ fista.input = x
+ fista.lambda = lam
+
+ -- resize code, maybe a different number of dimensions
+ -- fill with zeros, initial point
+ if codeinit then
+ code:resizeAs(codeinit)
+ code:copy(codeinit)
+ else
+ if x:dim() == 1 then
+ code:resize(D:size(2))
+ elseif x:dim() == 2 then
+ code:resize(x:size(1),D:size(2))
+ else
+ error(' I do not know how to handle ' .. x:dim() .. ' dimensional input')
+ end
+ code:fill(0)
+ end
+ -- return the result of unsup.FistaLS call.
+ return optim.FistaLS(fista.f, fista.g, fista.pl, fista.code, params)
+ end
+
+ return fista
+end
+
diff --git a/test/test_adadelta.lua b/test/test_adadelta.lua
new file mode 100644
index 0000000..ede3728
--- /dev/null
+++ b/test/test_adadelta.lua
@@ -0,0 +1,23 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+state = {}
+config = {eps=1e-10}
+for i = 1,10001 do
+ x,f=optim.adadelta(rosenbrock,x,config,state)
+ if (i-1)%1000 == 0 then
+ table.insert(fx,f[1])
+ end
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
diff --git a/test/test_adagrad.lua b/test/test_adagrad.lua
new file mode 100644
index 0000000..c93f301
--- /dev/null
+++ b/test/test_adagrad.lua
@@ -0,0 +1,23 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+
+config = {learningRate=1e-1}
+for i = 1,10001 do
+ x,f=optim.adagrad(rosenbrock,x,config)
+ if (i-1)%1000 == 0 then
+ table.insert(fx,f[1])
+ end
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
diff --git a/test/test_adam.lua b/test/test_adam.lua
new file mode 100644
index 0000000..2968056
--- /dev/null
+++ b/test/test_adam.lua
@@ -0,0 +1,19 @@
+require 'torch'
+require 'optim'
+require 'rosenbrock'
+require 'l2'
+x = torch.Tensor(2):fill(0)
+fx = {}
+config = {learningRate=0.002}
+for i = 1,10001 do
+x,f=optim.adam(rosenbrock,x,config)
+if (i-1)%1000 == 0 then
+table.insert(fx,f[1])
+end
+end
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
diff --git a/test/test_adamax.lua b/test/test_adamax.lua
new file mode 100644
index 0000000..a62a9a5
--- /dev/null
+++ b/test/test_adamax.lua
@@ -0,0 +1,23 @@
+
+require 'torch'
+require 'optim'
+require 'rosenbrock'
+require 'l2'
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+state = {}
+config = {}
+for i = 1,10001 do
+ x,f=optim.adamax(rosenbrock,x,config,state)
+ if (i-1)%1000 == 0 then
+ table.insert(fx,f[1])
+ end
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
diff --git a/test/test_cg.lua b/test/test_cg.lua
new file mode 100644
index 0000000..cc20fd9
--- /dev/null
+++ b/test/test_cg.lua
@@ -0,0 +1,17 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+
+x = torch.Tensor(2):fill(0)
+x,fx,i=optim.cg(rosenbrock,x,{maxIter=50})
+
+print()
+print('Rosenbrock test: compare with http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html')
+print()
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print(i,fx[i]); end
diff --git a/test/test_cmaes.lua b/test/test_cmaes.lua
new file mode 100644
index 0000000..8dd6878
--- /dev/null
+++ b/test/test_cmaes.lua
@@ -0,0 +1,23 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+-- 10-D rosenbrock
+x = torch.Tensor(10):fill(0)
+config = {maxEval=10000, sigma=0.5, verb_disp=0}
+
+-- will take some time
+x,fx,i=optim.cmaes(rosenbrock,x,config)
+
+
+print('Rosenbrock test')
+print()
+-- approx 6500 function evals expected
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print(i,fx[i]); end
+print()
+print()
\ No newline at end of file
diff --git a/test/test_confusion.lua b/test/test_confusion.lua
new file mode 100644
index 0000000..2a08349
--- /dev/null
+++ b/test/test_confusion.lua
@@ -0,0 +1,39 @@
+require 'torch'
+require 'optim'
+
+n_feature = 3
+classes = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+
+print'ConfusionMatrix:__init() test'
+cm = optim.ConfusionMatrix(#classes, classes)
+
+target = 3
+prediction = torch.randn(#classes)
+
+print'ConfusionMatrix:add() test'
+cm:add(prediction, target)
+cm:add(prediction, torch.randn(#classes))
+
+batch_size = 8
+
+targets = torch.randperm(batch_size)
+predictions = torch.randn(batch_size, #classes)
+
+print'ConfusionMatrix:batchAdd() test'
+cm:batchAdd(predictions, targets)
+assert(cm.mat:sum() == batch_size + 2, 'missing examples')
+
+print'ConfusionMatrix:updateValids() test'
+cm:updateValids()
+
+print'ConfusionMatrix:__tostring__() test'
+print(cm)
+
+target = 0
+cm:add(prediction, target)
+assert(cm.mat:sum() == batch_size + 2, 'too many examples')
+
+-- FAR/FRR testing on identify matrix. FRR/FAR should be zero for identity.
+cm.mat = torch.eye(#classes, #classes)
+classFrrs, classFars, frrs, fars = cm:farFrr()
+assert(classFrrs:sum() + classFars:sum() == 0, "Incorrect values")
diff --git a/test/test_de.lua b/test/test_de.lua
new file mode 100644
index 0000000..666ebdb
--- /dev/null
+++ b/test/test_de.lua
@@ -0,0 +1,24 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+
+-- 10-D rosenbrock
+x = torch.Tensor(2):fill(0)
+config = {popsize=50, scaleFactor=0.5, crossoverRate=0.9, maxFEs=3000}
+
+-- will take some time
+x,fx=optim.de(rosenbrock,x,config)
+
+
+print('Rosenbrock test')
+print()
+-- approx 6500 function evals expected
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,config.maxFEs do print(i,fx[i]); end
+print()
+print()
diff --git a/test/test_fista.lua b/test/test_fista.lua
new file mode 100644
index 0000000..0876375
--- /dev/null
+++ b/test/test_fista.lua
@@ -0,0 +1,95 @@
+
+require 'unsup'
+require 'torch'
+require 'gnuplot'
+require 'sparsecoding'
+
+-- gnuplot.setgnuplotexe('/usr/bin/gnuplot44')
+-- gnuplot.setgnuplotterminal('x11')
+
+function gettableval(tt,v)
+ local x = torch.Tensor(#tt)
+ for i=1,#tt do x[i] = tt[i][v] end
+ return x
+end
+function doplots(v)
+ v = v or 'F'
+ local fistaf = torch.DiskFile('fista2.bin'):binary()
+ local istaf = torch.DiskFile('ista2.bin'):binary()
+
+ local hfista = fistaf:readObject()
+ fistaf:close()
+ local hista = istaf:readObject()
+ istaf:close()
+
+ gnuplot.figure()
+ gnuplot.plot({'fista ' .. v,gettableval(hfista,v)},{'ista ' .. v, gettableval(hista,v)})
+end
+
+seed = seed or 123
+if dofista == nil then
+ dofista = true
+else
+ dofista = not dofista
+end
+
+torch.manualSeed(seed)
+math.randomseed(seed)
+nc = 3
+ni = 30
+no = 100
+x = torch.Tensor(ni):zero()
+
+--- I am keeping these just to make sure random init stays same
+fista = unsup.LinearFistaL1(ni,no,0.1)
+fista = nil
+
+fistaparams = {}
+fistaparams.doFistaUpdate = dofista
+fistaparams.maxline = 10
+fistaparams.maxiter = 200
+fistaparams.verbose = true
+
+D=torch.randn(ni,no)
+for i=1,D:size(2) do
+ D:select(2,i):div(D:select(2,i):std()+1e-12)
+end
+
+mixi = torch.Tensor(nc)
+mixj = torch.Tensor(nc)
+for i=1,nc do
+ local ii = math.random(1,no)
+ local cc = torch.uniform(0,1/nc)
+ mixi[i] = ii;
+ mixj[i] = cc;
+ print(ii,cc)
+ x:add(cc, D:select(2,ii))
+end
+
+fista = optim.FistaL1(D,fistaparams)
+code,h = fista.run(x,0.1)
+
+--fista.reconstruction:addmv(0,1,D,code)
+rec = fista.reconstruction
+--code,rec,h = fista:forward(x);
+
+gnuplot.figure(1)
+gnuplot.plot({'data',mixi,mixj,'+'},{'code',torch.linspace(1,no,no),code,'+'})
+gnuplot.title('Fista = ' .. tostring(fistaparams.doFistaUpdate))
+
+gnuplot.figure(2)
+gnuplot.plot({'input',torch.linspace(1,ni,ni),x,'+-'},{'reconstruction',torch.linspace(1,ni,ni),rec,'+-'});
+gnuplot.title('Reconstruction Error : ' .. x:dist(rec) .. ' ' .. 'Fista = ' .. tostring(fistaparams.doFistaUpdate))
+--w2:axis(0,ni+1,-1,1)
+
+if dofista then
+ print('Running FISTA')
+ fname = 'fista2.bin'
+else
+ print('Running ISTA')
+ fname = 'ista2.bin'
+end
+ff = torch.DiskFile(fname,'w'):binary()
+ff:writeObject(h)
+ff:close()
+
diff --git a/test/test_lbfgs.lua b/test/test_lbfgs.lua
new file mode 100644
index 0000000..268af15
--- /dev/null
+++ b/test/test_lbfgs.lua
@@ -0,0 +1,38 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+print('--- regular batch test ---')
+
+x = torch.Tensor(2):fill(0)
+x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, learningRate=1e-1})
+
+print()
+print('Rosenbrock test')
+print()
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print(i,fx[i]); end
+print()
+print()
+
+print('--- stochastic test ---')
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+config = {learningRate=1e-1, maxIter=1}
+for i = 1,100 do
+ x,f=optim.lbfgs(rosenbrock,x,config)
+ table.insert(fx,f[1])
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print(i,fx[i]); end
diff --git a/test/test_lbfgs_w_ls.lua b/test/test_lbfgs_w_ls.lua
new file mode 100644
index 0000000..cc1eb64
--- /dev/null
+++ b/test/test_lbfgs_w_ls.lua
@@ -0,0 +1,20 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+print('--- batch test w/ line search ---')
+
+x = torch.Tensor(2):fill(0)
+x,fx,i=optim.lbfgs(rosenbrock,x,{maxIter=100, lineSearch=optim.lswolfe})
+
+print()
+print('Rosenbrock test')
+print()
+print('Number of function evals = ',i)
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print(i,fx[i]); end
+print()
+print()
diff --git a/test/test_logger.lua b/test/test_logger.lua
new file mode 100644
index 0000000..6ee8cc4
--- /dev/null
+++ b/test/test_logger.lua
@@ -0,0 +1,20 @@
+require 'optim'
+
+
+logger_former = optim.Logger('accuracy-former.log')
+logger_new = optim.Logger('accuracy-new.log')
+
+logger_new:setNames({'channel 1', 'channel 2', 'channel 3'})
+
+for i = 1, 20 do
+ logger_former:add({['channel 1'] = 1 , ['channel 2'] = 0.1 * i, ['channel 3'] = 1 - 0.2 * i})
+ logger_new:add({1 , 0.1 * i, 1 - 0.2 * i})
+end
+
+logger_former:style({['channel 1'] = '-' , ['channel 2'] = '-', ['channel 3'] = '-'})
+logger_new:style{'-', '-', '-'}
+
+logger_former:plot()
+logger_new:plot()
+
+
diff --git a/test/test_rmsprop.lua b/test/test_rmsprop.lua
new file mode 100644
index 0000000..069810f
--- /dev/null
+++ b/test/test_rmsprop.lua
@@ -0,0 +1,23 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+
+config = {learningRate=5e-4}
+for i = 1,10001 do
+ x,f=optim.rmsprop(rosenbrock,x,config)
+ if (i-1)%1000 == 0 then
+ table.insert(fx,f[1])
+ end
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
diff --git a/test/test_sgd.lua b/test/test_sgd.lua
new file mode 100644
index 0000000..3518d6e
--- /dev/null
+++ b/test/test_sgd.lua
@@ -0,0 +1,23 @@
+require 'torch'
+require 'optim'
+
+require 'rosenbrock'
+require 'l2'
+
+x = torch.Tensor(2):fill(0)
+fx = {}
+
+config = {learningRate=1e-3}
+for i = 1,10001 do
+ x,f=optim.sgd(rosenbrock,x,config)
+ if (i-1)%1000 == 0 then
+ table.insert(fx,f[1])
+ end
+end
+
+print()
+print('Rosenbrock test')
+print()
+print('x=');print(x)
+print('fx=')
+for i=1,#fx do print((i-1)*1000+1,fx[i]); end
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/lua-torch-optim.git
More information about the debian-science-commits
mailing list