[caffe-contrib] 251/362: Imported Upstream version 1.0.0~rc3

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue May 3 09:24:39 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe-contrib.

commit 6b43d5a3cc3a95abdf75f1bcb8b0dbf9634c937f
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Tue Mar 8 14:56:41 2016 +0000

    Imported Upstream version 1.0.0~rc3
---
 CMakeLists.txt                                     |   5 +
 Makefile                                           |  36 +-
 Makefile.config.example                            |   5 +
 cmake/Dependencies.cmake                           |   4 +-
 cmake/Summary.cmake                                |   2 +-
 data/ilsvrc12/get_ilsvrc_aux.sh                    |   2 +-
 docs/installation.md                               |  14 +-
 examples/feature_extraction/readme.md              |   2 +-
 examples/finetune_flickr_style/readme.md           |   4 +-
 examples/mnist/readme.md                           |   4 +-
 include/caffe/common.hpp                           |   4 +
 include/caffe/layers/base_conv_layer.hpp           |  22 +-
 .../layers/{im2col_layer.hpp => bias_layer.hpp}    |  45 +-
 include/caffe/layers/conv_layer.hpp                |   3 +
 include/caffe/layers/elu_layer.hpp                 |  86 ++++
 include/caffe/layers/im2col_layer.hpp              |   2 +
 include/caffe/layers/scale_layer.hpp               |  83 ++++
 include/caffe/net.hpp                              |  15 +
 include/caffe/solver.hpp                           |   3 +
 include/caffe/util/im2col.hpp                      |  20 +-
 matlab/+caffe/private/caffe_.cpp                   |   8 +
 matlab/+caffe/version.m                            |   7 +
 python/caffe/__init__.py                           |   1 +
 python/caffe/_caffe.cpp                            |   7 +
 python/caffe/draw.py                               |  11 +-
 python/caffe/io.py                                 |   2 +-
 python/caffe/pycaffe.py                            |  18 +
 src/caffe/CMakeLists.txt                           |   4 +
 src/caffe/layer_factory.cpp                        |  30 +-
 src/caffe/layers/base_conv_layer.cpp               |  20 +-
 src/caffe/layers/bias_layer.cpp                    | 121 +++++
 src/caffe/layers/bias_layer.cu                     |  59 +++
 src/caffe/layers/conv_layer.cpp                    |   4 +-
 src/caffe/layers/deconv_layer.cpp                  |   4 +-
 src/caffe/layers/elu_layer.cpp                     |  47 ++
 src/caffe/layers/elu_layer.cu                      |  62 +++
 src/caffe/layers/flatten_layer.cpp                 |   2 +
 src/caffe/layers/im2col_layer.cpp                  |  25 +-
 src/caffe/layers/im2col_layer.cu                   |   6 +-
 src/caffe/layers/reshape_layer.cpp                 |   2 +
 src/caffe/layers/scale_layer.cpp                   | 219 +++++++++
 src/caffe/layers/scale_layer.cu                    | 135 ++++++
 src/caffe/proto/caffe.proto                        |  86 +++-
 src/caffe/solver.cpp                               |  37 +-
 src/caffe/solvers/adadelta_solver.cpp              |  66 +--
 src/caffe/solvers/adadelta_solver.cu               |  30 ++
 src/caffe/solvers/adagrad_solver.cpp               |  37 +-
 src/caffe/solvers/adagrad_solver.cu                |  26 ++
 src/caffe/solvers/adam_solver.cpp                  |  39 +-
 src/caffe/solvers/adam_solver.cu                   |  29 ++
 src/caffe/solvers/nesterov_solver.cpp              |  29 +-
 src/caffe/solvers/nesterov_solver.cu               |  27 ++
 src/caffe/solvers/rmsprop_solver.cpp               |  35 +-
 src/caffe/solvers/rmsprop_solver.cu                |  28 ++
 src/caffe/solvers/sgd_solver.cpp                   |  16 +-
 src/caffe/solvers/sgd_solver.cu                    |  24 +
 src/caffe/test/test_bias_layer.cpp                 | 467 +++++++++++++++++++
 src/caffe/test/test_convolution_layer.cpp          | 129 +++++-
 src/caffe/test/test_data_transformer.cpp           | 136 +++---
 src/caffe/test/test_embed_layer.cpp                |   4 -
 src/caffe/test/test_im2col_kernel.cu               |  26 +-
 src/caffe/test/test_im2col_layer.cpp               |  51 ++-
 src/caffe/test/test_neuron_layer.cpp               |  59 +++
 src/caffe/test/test_scale_layer.cpp                | 507 +++++++++++++++++++++
 src/caffe/util/im2col.cpp                          | 124 +++--
 src/caffe/util/im2col.cu                           | 246 ++++++----
 tools/caffe.cpp                                    |   5 +-
 67 files changed, 2933 insertions(+), 485 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c446c60..32cc42a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,11 @@ endif()
 # ---[ Caffe project
 project(Caffe C CXX)
 
+# ---[ Caffe version
+set(CAFFE_TARGET_VERSION "1.0.0-rc3")
+set(CAFFE_TARGET_SOVERSION "1.0.0-rc3")
+add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
+
 # ---[ Using cmake scripts and modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
diff --git a/Makefile b/Makefile
index 985fffd..598d28d 100644
--- a/Makefile
+++ b/Makefile
@@ -29,9 +29,17 @@ SRC_DIRS := $(shell find * -type d -exec bash -c "find {} -maxdepth 1 \
 	\( -name '*.cpp' -o -name '*.proto' \) | grep -q ." \; -print)
 
 # The target shared library name
+LIBRARY_NAME := $(PROJECT)
 LIB_BUILD_DIR := $(BUILD_DIR)/lib
-STATIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).a
-DYNAMIC_NAME := $(LIB_BUILD_DIR)/lib$(PROJECT).so
+STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
+DYNAMIC_VERSION_MAJOR 		:= 1
+DYNAMIC_VERSION_MINOR 		:= 0
+DYNAMIC_VERSION_REVISION 	:= 0-rc3
+DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
+#DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR)
+DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
+DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT)
+COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
 
 ##############################
 # Get all source files
@@ -191,7 +199,7 @@ ifeq ($(USE_OPENCV), 1)
 	endif
 		
 endif
-PYTHON_LIBRARIES := boost_python python2.7
+PYTHON_LIBRARIES ?= boost_python python2.7
 WARNINGS := -Wall -Wno-sign-compare
 
 ##############################
@@ -253,6 +261,7 @@ ifeq ($(LINUX), 1)
 	# boost::thread is reasonably called boost_thread (compare OS X)
 	# We will also explicitly add stdc++ to the link target.
 	LIBRARIES += boost_thread stdc++
+	VERSIONFLAGS += -Wl,-soname,$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../lib
 endif
 
 # OS X:
@@ -276,6 +285,7 @@ ifeq ($(OSX), 1)
 	# we need to explicitly ask for the rpath to be obeyed
 	DYNAMIC_FLAGS := -install_name @rpath/libcaffe.so
 	ORIGIN := @loader_path
+	VERSIONFLAGS += -Wl,-install_name,$(DYNAMIC_VERSIONED_NAME_SHORT) -Wl,-rpath,$(ORIGIN)/../../build/lib
 else
 	ORIGIN := \$$ORIGIN
 endif
@@ -478,7 +488,7 @@ py: $(PY$(PROJECT)_SO) $(PROTO_GEN_PY)
 $(PY$(PROJECT)_SO): $(PY$(PROJECT)_SRC) $(PY$(PROJECT)_HXX) | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@ $<
 	$(Q)$(CXX) -shared -o $@ $(PY$(PROJECT)_SRC) \
-		-o $@ $(LINKFLAGS) -l$(PROJECT) $(PYTHON_LDFLAGS) \
+		-o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(PYTHON_LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../../build/lib
 
 mat$(PROJECT): mat
@@ -542,7 +552,8 @@ $(ALL_BUILD_DIRS): | $(BUILD_DIR_LINK)
 
 $(DYNAMIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo LD -o $@
-	$(Q)$(CXX) -shared -o $@ $(OBJS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS)
+	$(Q)$(CXX) -shared -o $@ $(OBJS) $(VERSIONFLAGS) $(LINKFLAGS) $(LDFLAGS) $(DYNAMIC_FLAGS)
+	@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_NAME_SHORT)
 
 $(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo AR -o $@
@@ -573,33 +584,33 @@ $(TEST_ALL_BIN): $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
 		| $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo CXX/LD -o $@ $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $(TEST_OBJS) $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
 
 $(TEST_CU_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CU_BUILD_DIR)/%.o \
 	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo LD $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
 
 $(TEST_CXX_BINS): $(TEST_BIN_DIR)/%.testbin: $(TEST_CXX_BUILD_DIR)/%.o \
 	$(GTEST_OBJ) | $(DYNAMIC_NAME) $(TEST_BIN_DIR)
 	@ echo LD $<
 	$(Q)$(CXX) $(TEST_MAIN_SRC) $< $(GTEST_OBJ) \
-		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(PROJECT) -Wl,-rpath,$(ORIGIN)/../lib
+		-o $@ $(LINKFLAGS) $(LDFLAGS) -l$(LIBRARY_NAME) -Wl,-rpath,$(ORIGIN)/../lib
 
 # Target for extension-less symlinks to tool binaries with extension '*.bin'.
 $(TOOL_BUILD_DIR)/%: $(TOOL_BUILD_DIR)/%.bin | $(TOOL_BUILD_DIR)
 	@ $(RM) $@
-	@ ln -s $(abspath $<) $@
+	@ ln -s $(notdir $<) $@
 
 $(TOOL_BINS): %.bin : %.o | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@
-	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \
+	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../lib
 
 $(EXAMPLE_BINS): %.bin : %.o | $(DYNAMIC_NAME)
 	@ echo CXX/LD -o $@
-	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(PROJECT) $(LDFLAGS) \
+	$(Q)$(CXX) $< -o $@ $(LINKFLAGS) -l$(LIBRARY_NAME) $(LDFLAGS) \
 		-Wl,-rpath,$(ORIGIN)/../../lib
 
 proto: $(PROTO_GEN_CC) $(PROTO_GEN_HEADER)
@@ -651,6 +662,8 @@ superclean: clean supercleanfiles
 $(DIST_ALIASES): $(DISTRIBUTE_DIR)
 
 $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS)
+	# add proto
+	cp -r src/caffe/proto $(DISTRIBUTE_DIR)/
 	# add include
 	cp -r include $(DISTRIBUTE_DIR)/
 	mkdir -p $(DISTRIBUTE_DIR)/include/caffe/proto
@@ -661,6 +674,7 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS)
 	# add libraries
 	cp $(STATIC_NAME) $(DISTRIBUTE_DIR)/lib
 	install -m 644 $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib
+	cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_NAME_SHORT)
 	# add python - it's not the standard way, indeed...
 	cp -r python $(DISTRIBUTE_DIR)/python
 
diff --git a/Makefile.config.example b/Makefile.config.example
index 1dd6a8f..8fd49c9 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -70,6 +70,11 @@ PYTHON_INCLUDE := /usr/include/python2.7 \
 		# $(ANACONDA_HOME)/include/python2.7 \
 		# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \
 
+# Uncomment to use Python 3 (default is Python 2)
+# PYTHON_LIBRARIES := boost_python3 python3.5m
+# PYTHON_INCLUDE := /usr/include/python3.5m \
+#                 /usr/lib/python3.5/dist-packages/numpy/core/include
+
 # We need to be able to find libpythonX.X.so or .dylib.
 PYTHON_LIB := /usr/lib
 # PYTHON_LIB := $(ANACONDA_HOME)/lib
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 51a803c..c7b6a17 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -114,14 +114,14 @@ if(BUILD_python)
     # Find the matching boost python implementation
     set(version ${PYTHONLIBS_VERSION_STRING})
     
-    STRING( REPLACE "." "" boost_py_version ${version} )
+    STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
     find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
     set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
     
     while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND)
       STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} )
       
-      STRING( REPLACE "." "" boost_py_version ${version} )
+      STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
       find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
       set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
       
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 557a6f0..ba025cf 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -101,7 +101,7 @@ function(caffe_print_configuration_summary)
   caffe_status("")
   caffe_status("******************* Caffe Configuration Summary *******************")
   caffe_status("General:")
-  caffe_status("  Version           :   ${Caffe_VERSION}")
+  caffe_status("  Version           :   ${CAFFE_TARGET_VERSION}")
   caffe_status("  Git               :   ${Caffe_GIT_VERSION}")
   caffe_status("  System            :   ${CMAKE_SYSTEM_NAME}")
   caffe_status("  C++ compiler      :   ${CMAKE_CXX_COMPILER}")
diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh
index b9b85d2..90935f2 100755
--- a/data/ilsvrc12/get_ilsvrc_aux.sh
+++ b/data/ilsvrc12/get_ilsvrc_aux.sh
@@ -12,7 +12,7 @@ cd $DIR
 
 echo "Downloading..."
 
-wget http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz
+wget -c http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz
 
 echo "Unzipping..."
 
diff --git a/docs/installation.md b/docs/installation.md
index cce7ec3..ef781e8 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -87,15 +87,20 @@ There is an unofficial Windows port of Caffe at [niuzhiheng/caffe:windows](https
 
 ## Compilation
 
-Now that you have the prerequisites, edit your `Makefile.config` to change the paths for your setup The defaults should work, but uncomment the relevant lines if using Anaconda Python.
+Caffe can be compiled with either Make or CMake. Make is officially supported while CMake is supported by the community.
+
+### Compilation with Make
+
+Configure the build by copying and modifying the example `Makefile.config` for your setup. The defaults should work, but uncomment the relevant lines if using Anaconda Python.
 
     cp Makefile.config.example Makefile.config
-    # Adjust Makefile.config (for example, if using Anaconda Python)
+    # Adjust Makefile.config (for example, if using Anaconda Python, or if cuDNN is desired)
     make all
     make test
     make runtest
 
-- For cuDNN acceleration, you should uncomment the `USE_CUDNN := 1` switch in `Makefile.config`.
+- For CPU & GPU accelerated Caffe, no changes are needed.
+- For cuDNN acceleration using NVIDIA's proprietary cuDNN software, uncomment the `USE_CUDNN := 1` switch in `Makefile.config`. cuDNN is sometimes but not always faster than Caffe's GPU acceleration.
 - For CPU-only Caffe, uncomment `CPU_ONLY := 1` in `Makefile.config`.
 
 To compile the Python and MATLAB wrappers do `make pycaffe` and `make matcaffe` respectively.
@@ -107,7 +112,7 @@ Be sure to set your MATLAB and Python paths in `Makefile.config` first!
 
 Now that you have installed Caffe, check out the [MNIST tutorial](gathered/examples/mnist.html) and the [reference ImageNet model tutorial](gathered/examples/imagenet.html).
 
-### CMake Compilation
+### Compilation with CMake
 
 In lieu of manually editing `Makefile.config` to configure the build, Caffe offers an unofficial CMake build thanks to @Nerei, @akosiorek, and other members of the community. It requires CMake version >= 2.8.7.
 The basic steps are as follows:
@@ -116,6 +121,7 @@ The basic steps are as follows:
     cd build
     cmake ..
     make all
+    make install
     make runtest
 
 See [PR #1667](https://github.com/BVLC/caffe/pull/1667) for options and details.
diff --git a/examples/feature_extraction/readme.md b/examples/feature_extraction/readme.md
index 2bc3dac..5612b02 100644
--- a/examples/feature_extraction/readme.md
+++ b/examples/feature_extraction/readme.md
@@ -51,7 +51,7 @@ Extract Features
 
 Now everything necessary is in place.
 
-    ./build/tools/extract_features.bin models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel examples/_temp/imagenet_val.prototxt fc7 examples/_temp/features 10 lmdb
+    ./build/tools/extract_features.bin models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel examples/_temp/imagenet_val.prototxt fc7 examples/_temp/features 10 leveldb
 
 The name of feature blob that you extract is `fc7`, which represents the highest level feature of the reference model.
 We can use any other layer, as well, such as `conv5` or `pool3`.
diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md
index ecb9d3d..4e9d41f 100644
--- a/examples/finetune_flickr_style/readme.md
+++ b/examples/finetune_flickr_style/readme.md
@@ -22,10 +22,10 @@ Because we are predicting 20 classes instead of a 1,000, we do need to change th
 Therefore, we change the name of the last layer from `fc8` to `fc8_flickr` in our prototxt.
 Since there is no layer named that in the `bvlc_reference_caffenet`, that layer will begin training with random weights.
 
-We will also decrease the overall learning rate `base_lr` in the solver prototxt, but boost the `blobs_lr` on the newly introduced layer.
+We will also decrease the overall learning rate `base_lr` in the solver prototxt, but boost the `lr_mult` on the newly introduced layer.
 The idea is to have the rest of the model change very slowly with new data, but let the new layer learn fast.
 Additionally, we set `stepsize` in the solver to a lower value than if we were training from scratch, since we're virtually far along in training and therefore want the learning rate to go down faster.
-Note that we could also entirely prevent fine-tuning of all layers other than `fc8_flickr` by setting their `blobs_lr` to 0.
+Note that we could also entirely prevent fine-tuning of all layers other than `fc8_flickr` by setting their `lr_mult` to 0.
 
 ## Procedure
 
diff --git a/examples/mnist/readme.md b/examples/mnist/readme.md
index 413d4a1..b87a0f5 100644
--- a/examples/mnist/readme.md
+++ b/examples/mnist/readme.md
@@ -41,11 +41,13 @@ Currently, we will read the MNIST data from the lmdb we created earlier in the d
     layer {
       name: "mnist"
       type: "Data"
+      transform_param {
+        scale: 0.00390625
+      }
       data_param {
         source: "mnist_train_lmdb"
         backend: LMDB
         batch_size: 64
-        scale: 0.00390625
       }
       top: "data"
       top: "label"
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 1df6b9a..6b902a4 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -18,6 +18,10 @@
 
 #include "caffe/util/device_alternate.hpp"
 
+// Convert macro to string
+#define STRINGIFY(m) #m
+#define AS_STRING(m) STRINGIFY(m)
+
 // gflags 2.1 issue: namespace google was changed to gflags without warning.
 // Luckily we will be able to use GFLAGS_GFLAGS_H_ to detect if it is version
 // 2.1. If yes, we will add a temporary solution to redirect the namespace.
diff --git a/include/caffe/layers/base_conv_layer.hpp b/include/caffe/layers/base_conv_layer.hpp
index f3def16..0160a83 100644
--- a/include/caffe/layers/base_conv_layer.hpp
+++ b/include/caffe/layers/base_conv_layer.hpp
@@ -68,6 +68,8 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   Blob<int> stride_;
   /// @brief The spatial dimensions of the padding.
   Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
   /// @brief The spatial dimensions of the convolution input.
   Blob<int> conv_input_shape_;
   /// @brief The spatial dimensions of the col_buffer.
@@ -99,11 +101,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
-          stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
     } else {
       im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
           col_buffer_shape_.data(), kernel_shape_.cpu_data(),
-          pad_.cpu_data(), stride_.cpu_data(), col_buff);
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
     }
   }
   inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
@@ -112,11 +115,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
-          stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
     } else {
       col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
           col_buffer_shape_.data(), kernel_shape_.cpu_data(),
-          pad_.cpu_data(), stride_.cpu_data(), data);
+          pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);
     }
   }
 #ifndef CPU_ONLY
@@ -126,12 +130,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
-          stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
     } else {
       im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
           conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
           kernel_shape_.gpu_data(), pad_.gpu_data(),
-          stride_.gpu_data(), col_buff);
+          stride_.gpu_data(), dilation_.gpu_data(), col_buff);
     }
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
@@ -140,12 +145,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
           conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
-          stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
     } else {
       col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
           conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
           kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
-          data);
+          dilation_.gpu_data(), data);
     }
   }
 #endif
diff --git a/include/caffe/layers/im2col_layer.hpp b/include/caffe/layers/bias_layer.hpp
similarity index 50%
copy from include/caffe/layers/im2col_layer.hpp
copy to include/caffe/layers/bias_layer.hpp
index 1d3b2eb..eedc3aa 100644
--- a/include/caffe/layers/im2col_layer.hpp
+++ b/include/caffe/layers/bias_layer.hpp
@@ -1,5 +1,5 @@
-#ifndef CAFFE_IM2COL_LAYER_HPP_
-#define CAFFE_IM2COL_LAYER_HPP_
+#ifndef CAFFE_BIAS_LAYER_HPP_
+#define CAFFE_BIAS_LAYER_HPP_
 
 #include <vector>
 
@@ -10,27 +10,29 @@
 namespace caffe {
 
 /**
- * @brief A helper for image operations that rearranges image regions into
- *        column vectors.  Used by ConvolutionLayer to perform convolution
- *        by matrix multiplication.
+ * @brief Computes a sum of two input Blobs, with the shape of the
+ *        latter Blob "broadcast" to match the shape of the former.
+ *        Equivalent to tiling the latter Blob, then computing the elementwise
+ *        sum.
  *
- * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ * The second input may be omitted, in which case it's learned as a parameter
+ * of the layer.
  */
 template <typename Dtype>
-class Im2colLayer : public Layer<Dtype> {
+class BiasLayer : public Layer<Dtype> {
  public:
-  explicit Im2colLayer(const LayerParameter& param)
+  explicit BiasLayer(const LayerParameter& param)
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
 
-  virtual inline const char* type() const { return "Im2col"; }
-  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline const char* type() const { return "Bias"; }
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
 
- protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -40,24 +42,13 @@ class Im2colLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-  /// @brief The spatial dimensions of a filter kernel.
-  Blob<int> kernel_shape_;
-  /// @brief The spatial dimensions of the stride.
-  Blob<int> stride_;
-  /// @brief The spatial dimensions of the padding.
-  Blob<int> pad_;
-
-  int num_spatial_axes_;
-  int bottom_dim_;
-  int top_dim_;
+ private:
+  Blob<Dtype> bias_multiplier_;
+  int outer_dim_, bias_dim_, inner_dim_, dim_;
+};
 
-  int channel_axis_;
-  int num_;
-  int channels_;
 
-  bool force_nd_im2col_;
-};
 
 }  // namespace caffe
 
-#endif  // CAFFE_IM2COL_LAYER_HPP_
+#endif  // CAFFE_BIAS_LAYER_HPP_
diff --git a/include/caffe/layers/conv_layer.hpp b/include/caffe/layers/conv_layer.hpp
index 1557476..93a618d 100644
--- a/include/caffe/layers/conv_layer.hpp
+++ b/include/caffe/layers/conv_layer.hpp
@@ -44,6 +44,9 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
    *  convolution, given by pad for equal dimensions or pad_h and pad_w for
    *  different padding. Input padding is computed implicitly instead of
    *  actually padding.
+   *  - dilation (\b optional, default 1). The filter
+   *  dilation, given by dilation_size for equal dimensions for different
+   *  dilation. By default the convolution has dilation 1.
    *  - group (\b optional, default 1). The number of filter groups. Group
    *  convolution is a method for reducing parameterization by selectively
    *  connecting input and output channels. The input and output channel dimensions must be divisible
diff --git a/include/caffe/layers/elu_layer.hpp b/include/caffe/layers/elu_layer.hpp
new file mode 100644
index 0000000..0796e89
--- /dev/null
+++ b/include/caffe/layers/elu_layer.hpp
@@ -0,0 +1,86 @@
+#ifndef CAFFE_ELU_LAYER_HPP_
+#define CAFFE_ELU_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Exponential Linear Unit non-linearity @f$
+ *        y = \left\{
+ *        \begin{array}{lr}
+ *            x                  & \mathrm{if} \; x > 0 \\
+ *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+ *        \end{array} \right.
+ *      @f$.  
+ */
+template <typename Dtype>
+class ELULayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ELUParameter elu_param,
+   *     with ELULayer options:
+   *   - alpha (\b optional, default 1).
+   *     the value @f$ \alpha @f$ by which controls saturation for negative inputs.
+   */
+  explicit ELULayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "ELU"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \left\{
+   *        \begin{array}{lr}
+   *            x                  & \mathrm{if} \; x > 0 \\
+   *            \alpha (\exp(x)-1) & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$.  
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the ELU inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            1           & \mathrm{if} \; x > 0 \\
+   *            y + \alpha  & \mathrm{if} \; x \le 0
+   *        \end{array} \right.
+   *      @f$ if propagate_down[0].
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_ELU_LAYER_HPP_
diff --git a/include/caffe/layers/im2col_layer.hpp b/include/caffe/layers/im2col_layer.hpp
index 1d3b2eb..71e32f7 100644
--- a/include/caffe/layers/im2col_layer.hpp
+++ b/include/caffe/layers/im2col_layer.hpp
@@ -46,6 +46,8 @@ class Im2colLayer : public Layer<Dtype> {
   Blob<int> stride_;
   /// @brief The spatial dimensions of the padding.
   Blob<int> pad_;
+  /// @brief The spatial dimensions of the dilation.
+  Blob<int> dilation_;
 
   int num_spatial_axes_;
   int bottom_dim_;
diff --git a/include/caffe/layers/scale_layer.hpp b/include/caffe/layers/scale_layer.hpp
new file mode 100644
index 0000000..924df2e
--- /dev/null
+++ b/include/caffe/layers/scale_layer.hpp
@@ -0,0 +1,83 @@
+#ifndef CAFFE_SCALE_LAYER_HPP_
+#define CAFFE_SCALE_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/bias_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Computes a product of two input Blobs, with the shape of the
+ *        latter Blob "broadcast" to match the shape of the former.
+ *        Equivalent to tiling the latter Blob, then computing the elementwise
+ *        product.
+ *
+ * The second input may be omitted, in which case it's learned as a parameter
+ * of the layer.
+ */
+template <typename Dtype>
+class ScaleLayer: public Layer<Dtype> {
+ public:
+  explicit ScaleLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Scale"; }
+  // Scale
+  virtual inline int MinBottomBlobs() const { return 1; }
+  virtual inline int MaxBottomBlobs() const { return 2; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * In the below shape specifications, @f$ i @f$ denotes the value of the
+   * `axis` field given by `this->layer_param_.scale_param().axis()`, after
+   * canonicalization (i.e., conversion from negative to positive index,
+   * if applicable).
+   *
+   * @param bottom input Blob vector (length 2)
+   *   -# @f$ (d_0 \times ... \times
+   *           d_i \times ... \times d_j \times ... \times d_n) @f$
+   *      the first factor @f$ x @f$
+   *   -# @f$ (d_i \times ... \times d_j) @f$
+   *      the second factor @f$ y @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (d_0 \times ... \times
+   *           d_i \times ... \times d_j \times ... \times d_n) @f$
+   *      the product @f$ z = x y @f$ computed after "broadcasting" y.
+   *      Equivalent to tiling @f$ y @f$ to have the same shape as @f$ x @f$,
+   *      then computing the elementwise product.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  shared_ptr<Layer<Dtype> > bias_layer_;
+  vector<Blob<Dtype>*> bias_bottom_vec_;
+  vector<bool> bias_propagate_down_;
+  int bias_param_id_;
+
+  Blob<Dtype> sum_multiplier_;
+  Blob<Dtype> sum_result_;
+  Blob<Dtype> temp_;
+  int axis_;
+  int outer_dim_, scale_dim_, inner_dim_;
+};
+
+
+}  // namespace caffe
+
+#endif  // CAFFE_SCALE_LAYER_HPP_
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 1bf07d2..543133e 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -149,6 +149,18 @@ class Net {
   inline const vector<vector<Blob<Dtype>*> >& top_vecs() const {
     return top_vecs_;
   }
+  /// @brief returns the ids of the top blobs of layer i
+  inline const vector<int> & top_ids(int i) const {
+    CHECK_GE(i, 0) << "Invalid layer id";
+    CHECK_LT(i, top_id_vecs_.size()) << "Invalid layer id";
+    return top_id_vecs_[i];
+  }
+  /// @brief returns the ids of the bottom blobs of layer i
+  inline const vector<int> & bottom_ids(int i) const {
+    CHECK_GE(i, 0) << "Invalid layer id";
+    CHECK_LT(i, bottom_id_vecs_.size()) << "Invalid layer id";
+    return bottom_id_vecs_[i];
+  }
   inline const vector<vector<bool> >& bottom_need_backward() const {
     return bottom_need_backward_;
   }
@@ -179,6 +191,9 @@ class Net {
     return param_names_index_;
   }
   inline const vector<int>& param_owners() const { return param_owners_; }
+  inline const vector<string>& param_display_names() const {
+    return param_display_names_;
+  }
   /// @brief Input and output blob numbers
   inline int num_inputs() const { return net_input_blobs_.size(); }
   inline int num_outputs() const { return net_output_blobs_.size(); }
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 26b8e8e..38259ed 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -107,6 +107,7 @@ class Solver {
   virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0;
   virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0;
   void DisplayOutputBlobs(const int net_id);
+  void UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss);
 
   SolverParameter param_;
   int iter_;
@@ -114,6 +115,8 @@ class Solver {
   shared_ptr<Net<Dtype> > net_;
   vector<shared_ptr<Net<Dtype> > > test_nets_;
   vector<Callback*> callbacks_;
+  vector<Dtype> losses_;
+  Dtype smoothed_loss_;
 
   // The root solver that holds root nets (actually containing shared layers)
   // in data parallelism
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index d3eb6cc..a35bc6e 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -7,49 +7,53 @@ template <typename Dtype>
 void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col);
+    const int* dilation, Dtype* data_col);
 
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_col);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    Dtype* data_col);
 
 template <typename Dtype>
 void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_im);
+    const int* dilation, Dtype* data_im);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    Dtype* data_im);
 
 template <typename Dtype>
 void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
     const int col_size, const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col);
+    const int* dilation, Dtype* data_col);
 
 template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_col);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    Dtype* data_col);
 
 template <typename Dtype>
 void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
     const int im_size, const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_im);
+    const int* dilation, Dtype* data_im);
 
 template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    Dtype* data_im);
 
 }  // namespace caffe
 
diff --git a/matlab/+caffe/private/caffe_.cpp b/matlab/+caffe/private/caffe_.cpp
index 1641e14..1b1b2bf 100644
--- a/matlab/+caffe/private/caffe_.cpp
+++ b/matlab/+caffe/private/caffe_.cpp
@@ -504,6 +504,13 @@ static void write_mean(MEX_ARGS) {
   mxFree(mean_proto_file);
 }
 
+// Usage: caffe_('version')
+static void version(MEX_ARGS) {
+  mxCHECK(nrhs == 0, "Usage: caffe_('version')");
+  // Return version string
+  plhs[0] = mxCreateString(AS_STRING(CAFFE_VERSION));
+}
+
 /** -----------------------------------------------------------------
  ** Available commands.
  **/
@@ -542,6 +549,7 @@ static handler_registry handlers[] = {
   { "reset",              reset           },
   { "read_mean",          read_mean       },
   { "write_mean",         write_mean      },
+  { "version",            version         },
   // The end.
   { "END",                NULL            },
 };
diff --git a/matlab/+caffe/version.m b/matlab/+caffe/version.m
new file mode 100644
index 0000000..61cae4f
--- /dev/null
+++ b/matlab/+caffe/version.m
@@ -0,0 +1,7 @@
+function version_str = version()
+% version()
+%   show Caffe's version.
+
+version_str = caffe_('version');
+
+end
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index ccda1bc..e2881b8 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,6 @@
 from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
 from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list
+from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
 from .detector import Detector
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index 69d5533..12a5745 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -212,6 +212,9 @@ BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
 BOOST_PYTHON_MODULE(_caffe) {
   // below, we prepend an underscore to methods that will be replaced
   // in Python
+
+  bp::scope().attr("__version__") = AS_STRING(CAFFE_VERSION);
+
   // Caffe utility functions
   bp::def("set_mode_cpu", &set_mode_cpu);
   bp::def("set_mode_gpu", &set_mode_gpu);
@@ -232,6 +235,10 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def("share_with", &Net<Dtype>::ShareTrainedLayersWith)
     .add_property("_blob_loss_weights", bp::make_function(
         &Net<Dtype>::blob_loss_weights, bp::return_internal_reference<>()))
+    .def("_bottom_ids", bp::make_function(&Net<Dtype>::bottom_ids,
+        bp::return_value_policy<bp::copy_const_reference>()))
+    .def("_top_ids", bp::make_function(&Net<Dtype>::top_ids,
+        bp::return_value_policy<bp::copy_const_reference>()))
     .add_property("_blobs", bp::make_function(&Net<Dtype>::blobs,
         bp::return_internal_reference<>()))
     .add_property("layers", bp::make_function(&Net<Dtype>::layers,
diff --git a/python/caffe/draw.py b/python/caffe/draw.py
index f8bf572..cfa3fc5 100644
--- a/python/caffe/draw.py
+++ b/python/caffe/draw.py
@@ -10,7 +10,16 @@ Caffe network visualization: draw the NetParameter protobuffer.
 """
 
 from caffe.proto import caffe_pb2
-import pydot
+
+"""
+pydot is not supported under python 3 and pydot2 doesn't work properly.
+pydotplus works nicely (pip install pydotplus)
+"""
+try:
+    # Try to load pydotplus
+    import pydotplus as pydot
+except ImportError:
+    import pydot
 
 # Internal layer and blob styles.
 LAYER_STYLE_DEFAULT = {'shape': 'record',
diff --git a/python/caffe/io.py b/python/caffe/io.py
index 14942be..7531058 100644
--- a/python/caffe/io.py
+++ b/python/caffe/io.py
@@ -292,7 +292,7 @@ def load_image(filename, color=True):
         of size (H x W x 3) in RGB or
         of size (H x W x 1) in grayscale.
     """
-    img = skimage.img_as_float(skimage.io.imread(filename)).astype(np.float32)
+    img = skimage.img_as_float(skimage.io.imread(filename, as_grey=not color)).astype(np.float32)
     if img.ndim == 2:
         img = img[:, :, np.newaxis]
         if color:
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index 31dc702..3054110 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -276,6 +276,22 @@ def _Net_batch(self, blobs):
                                                  padding])
         yield padded_batch
 
+
+class _Net_IdNameWrapper:
+    """
+    A simple wrapper that allows the ids property to be accessed as a dict
+    indexed by names. Used for top and bottom names
+    """
+    def __init__(self, net, func):
+        self.net, self.func = net, func
+
+    def __getitem__(self, name):
+        # Map the layer name to id
+        ids = self.func(self.net, list(self.net._layer_names).index(name))
+        # Map the blob id to name
+        id_to_name = list(self.net.blobs)
+        return [id_to_name[i] for i in ids]
+
 # Attach methods to Net.
 Net.blobs = _Net_blobs
 Net.blob_loss_weights = _Net_blob_loss_weights
@@ -288,3 +304,5 @@ Net.set_input_arrays = _Net_set_input_arrays
 Net._batch = _Net_batch
 Net.inputs = _Net_inputs
 Net.outputs = _Net_outputs
+Net.top_names = property(lambda n: _Net_IdNameWrapper(n, Net._top_ids))
+Net.bottom_names = property(lambda n: _Net_IdNameWrapper(n, Net._bottom_ids))
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
index 40e6c11..8a80c94 100644
--- a/src/caffe/CMakeLists.txt
+++ b/src/caffe/CMakeLists.txt
@@ -20,6 +20,10 @@ endif()
 add_library(caffe ${srcs})
 target_link_libraries(caffe proto ${Caffe_LINKER_LIBS})
 caffe_default_properties(caffe)
+set_target_properties(caffe PROPERTIES
+    VERSION   ${CAFFE_TARGET_VERSION}
+    SOVERSION ${CAFFE_TARGET_SOVERSION}
+    )
 
 # ---[ Tests
  add_subdirectory(test)
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index 76d851a..e967bd6 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -37,17 +37,32 @@ namespace caffe {
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetConvolutionLayer(
     const LayerParameter& param) {
-  ConvolutionParameter_Engine engine = param.convolution_param().engine();
+  ConvolutionParameter conv_param = param.convolution_param();
+  ConvolutionParameter_Engine engine = conv_param.engine();
+#ifdef USE_CUDNN
+  bool use_dilation = false;
+  for (int i = 0; i < conv_param.dilation_size(); ++i) {
+    if (conv_param.dilation(i) > 1) {
+      use_dilation = true;
+    }
+  }
+#endif
   if (engine == ConvolutionParameter_Engine_DEFAULT) {
     engine = ConvolutionParameter_Engine_CAFFE;
 #ifdef USE_CUDNN
-    engine = ConvolutionParameter_Engine_CUDNN;
+    if (!use_dilation) {
+      engine = ConvolutionParameter_Engine_CUDNN;
+    }
 #endif
   }
   if (engine == ConvolutionParameter_Engine_CAFFE) {
     return shared_ptr<Layer<Dtype> >(new ConvolutionLayer<Dtype>(param));
 #ifdef USE_CUDNN
   } else if (engine == ConvolutionParameter_Engine_CUDNN) {
+    if (use_dilation) {
+      LOG(FATAL) << "CuDNN doesn't support the dilated convolution at Layer "
+                 << param.name();
+    }
     return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
 #endif
   } else {
@@ -76,7 +91,16 @@ shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
                 << "Using Caffe's own pooling layer.";
       return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
     }
-    return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
+    // CuDNN assumes layers are not being modified in place, thus
+    // breaking our index tracking for updates in some cases in Caffe.
+    // Until there is a workaround in Caffe (index management) or
+    // cuDNN, use the Caffe layer for max pooling, or don't use in-place
+    // layers after max pooling layers.
+    if (param.pooling_param().pool() == PoolingParameter_PoolMethod_MAX) {
+        return shared_ptr<Layer<Dtype> >(new PoolingLayer<Dtype>(param));
+    } else {
+        return shared_ptr<Layer<Dtype> >(new CuDNNPoolingLayer<Dtype>(param));
+    }
 #endif
   } else {
     LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index f6f14cd..4a4c68e 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -36,7 +36,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
         << "kernel_size must be specified once, or once per spatial dimension "
         << "(kernel_size specified " << num_kernel_dims << " times; "
-        << num_spatial_axes_ << " spatial dims);";
+        << num_spatial_axes_ << " spatial dims).";
       for (int i = 0; i < num_spatial_axes_; ++i) {
         kernel_shape_data[i] =
             conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
@@ -61,7 +61,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
           num_stride_dims == num_spatial_axes_)
         << "stride must be specified once, or once per spatial dimension "
         << "(stride specified " << num_stride_dims << " times; "
-        << num_spatial_axes_ << " spatial dims);";
+        << num_spatial_axes_ << " spatial dims).";
     const int kDefaultStride = 1;
     for (int i = 0; i < num_spatial_axes_; ++i) {
       stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
@@ -85,13 +85,27 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
           num_pad_dims == num_spatial_axes_)
         << "pad must be specified once, or once per spatial dimension "
         << "(pad specified " << num_pad_dims << " times; "
-        << num_spatial_axes_ << " spatial dims);";
+        << num_spatial_axes_ << " spatial dims).";
     const int kDefaultPad = 0;
     for (int i = 0; i < num_spatial_axes_; ++i) {
       pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
           conv_param.pad((num_pad_dims == 1) ? 0 : i);
     }
   }
+  // Setup dilation dimensions (dilation_).
+  dilation_.Reshape(spatial_dim_blob_shape);
+  int* dilation_data = dilation_.mutable_cpu_data();
+  const int num_dilation_dims = conv_param.dilation_size();
+  CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
+        num_dilation_dims == num_spatial_axes_)
+      << "dilation must be specified once, or once per spatial dimension "
+      << "(dilation specified " << num_dilation_dims << " times; "
+      << num_spatial_axes_ << " spatial dims).";
+  const int kDefaultDilation = 1;
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
+                       conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
+  }
   // Special case: im2col is the identity for 1x1 convolution with stride 1
   // and no padding, so flag for skipping the buffer and transformation.
   is_1x1_ = true;
diff --git a/src/caffe/layers/bias_layer.cpp b/src/caffe/layers/bias_layer.cpp
new file mode 100644
index 0000000..4726a72
--- /dev/null
+++ b/src/caffe/layers/bias_layer.cpp
@@ -0,0 +1,121 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void BiasLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  if (bottom.size() == 1 && this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else if (bottom.size() == 1) {
+    // bias is a learned parameter; initialize it
+    const BiasParameter& param = this->layer_param_.bias_param();
+    const int axis = bottom[0]->CanonicalAxisIndex(param.axis());
+    const int num_axes = param.num_axes();
+    CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
+                           << "or -1 to extend to the end of bottom[0]";
+    if (num_axes >= 0) {
+      CHECK_GE(bottom[0]->num_axes(), axis + num_axes)
+          << "bias blob's shape extends past bottom[0]'s shape when applied "
+          << "starting with bottom[0] axis = " << axis;
+    }
+    this->blobs_.resize(1);
+    const vector<int>::const_iterator& shape_start =
+        bottom[0]->shape().begin() + axis;
+    const vector<int>::const_iterator& shape_end =
+        (num_axes == -1) ? bottom[0]->shape().end() : (shape_start + num_axes);
+    vector<int> bias_shape(shape_start, shape_end);
+    this->blobs_[0].reset(new Blob<Dtype>(bias_shape));
+    shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(param.filler()));
+    filler->Fill(this->blobs_[0].get());
+  }
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+template <typename Dtype>
+void BiasLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const BiasParameter& param = this->layer_param_.bias_param();
+  Blob<Dtype>* bias = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
+  // Always set axis == 0 in special case where bias is a scalar
+  // (num_axes == 0). Mathematically equivalent for any choice of axis, so the
+  // actual setting can be safely ignored; and computation is most efficient
+  // with axis == 0 and (therefore) outer_dim_ == 1.
+  const int axis = (bias->num_axes() == 0) ?
+      0 : bottom[0]->CanonicalAxisIndex(param.axis());
+  CHECK_GE(bottom[0]->num_axes(), axis + bias->num_axes())
+      << "bias blob's shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis;
+  for (int i = 0; i < bias->num_axes(); ++i) {
+    CHECK_EQ(bottom[0]->shape(axis + i), bias->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << axis + i
+        << ") and bias->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis);
+  bias_dim_ = bias->count();
+  inner_dim_ = bottom[0]->count(axis + bias->num_axes());
+  dim_ = bias_dim_ * inner_dim_;
+  if (bottom[0] != top[0]) {
+    top[0]->ReshapeLike(*bottom[0]);
+  }
+  bias_multiplier_.Reshape(vector<int>(1, inner_dim_));
+  if (bias_multiplier_.cpu_data()[inner_dim_ - 1] != Dtype(1)) {
+    caffe_set(inner_dim_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void BiasLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const Dtype* bias_data =
+      ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  if (bottom[0] != top[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    caffe_copy(bottom[0]->count(), bottom_data, top_data);
+  }
+  for (int n = 0; n < outer_dim_; ++n) {
+    caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, bias_dim_,
+        inner_dim_, 1, Dtype(1), bias_data,
+        bias_multiplier_.cpu_data(), Dtype(1), top_data);
+    top_data += dim_;
+  }
+}
+
+template <typename Dtype>
+void BiasLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0] && bottom[0] != top[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    caffe_copy(bottom[0]->count(), top_diff, bottom_diff);
+  }
+  // in-place, we don't need to do anything with the data diff
+  const bool bias_param = (bottom.size() == 1);
+  if ((!bias_param && propagate_down[1]) ||
+      (bias_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bias_diff = (bias_param ? this->blobs_[0].get() : bottom[1])
+        ->mutable_cpu_diff();
+    bool accum = bias_param;
+    for (int n = 0; n < outer_dim_; ++n) {
+      caffe_cpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1),
+          top_diff, bias_multiplier_.cpu_data(), Dtype(accum), bias_diff);
+      top_diff += dim_;
+      accum = true;
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(BiasLayer);
+#endif
+
+INSTANTIATE_CLASS(BiasLayer);
+REGISTER_LAYER_CLASS(Bias);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/bias_layer.cu b/src/caffe/layers/bias_layer.cu
new file mode 100644
index 0000000..8ac913a
--- /dev/null
+++ b/src/caffe/layers/bias_layer.cu
@@ -0,0 +1,59 @@
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void BiasForward(const int n, const Dtype* in,
+    const Dtype* bias, const int bias_dim, const int inner_dim,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int bias_index = (index / inner_dim) % bias_dim;
+    out[index] = in[index] + bias[bias_index];
+  }
+}
+
+template <typename Dtype>
+void BiasLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* bias_data =
+      ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  BiasForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, bias_data, bias_dim_, inner_dim_, top_data);
+}
+
+template <typename Dtype>
+void BiasLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0] && bottom[0] != top[0]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    caffe_copy(bottom[0]->count(), top_diff, bottom_diff);
+  }
+  // in-place, we don't need to do anything with the data diff
+  const bool bias_param = (bottom.size() == 1);
+  if ((!bias_param && propagate_down[1]) ||
+      (bias_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bias_diff = (bias_param ? this->blobs_[0].get() : bottom[1])
+        ->mutable_gpu_diff();
+    bool accum = bias_param;
+    for (int n = 0; n < outer_dim_; ++n) {
+      caffe_gpu_gemv(CblasNoTrans, bias_dim_, inner_dim_, Dtype(1),
+          top_diff, bias_multiplier_.gpu_data(), Dtype(accum), bias_diff);
+      top_diff += dim_;
+      accum = true;
+    }
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(BiasLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index cff0978..5d522ab 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -9,11 +9,13 @@ void ConvolutionLayer<Dtype>::compute_output_shape() {
   const int* kernel_shape_data = this->kernel_shape_.cpu_data();
   const int* stride_data = this->stride_.cpu_data();
   const int* pad_data = this->pad_.cpu_data();
+  const int* dilation_data = this->dilation_.cpu_data();
   this->output_shape_.clear();
   for (int i = 0; i < this->num_spatial_axes_; ++i) {
     // i + 1 to skip channel axis
     const int input_dim = this->input_shape(i + 1);
-    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+    const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
+    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent)
         / stride_data[i] + 1;
     this->output_shape_.push_back(output_dim);
   }
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 275c056..20a460f 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -9,12 +9,14 @@ void DeconvolutionLayer<Dtype>::compute_output_shape() {
   const int* kernel_shape_data = this->kernel_shape_.cpu_data();
   const int* stride_data = this->stride_.cpu_data();
   const int* pad_data = this->pad_.cpu_data();
+  const int* dilation_data = this->dilation_.cpu_data();
   this->output_shape_.clear();
   for (int i = 0; i < this->num_spatial_axes_; ++i) {
     // i + 1 to skip channel axis
     const int input_dim = this->input_shape(i + 1);
+    const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
     const int output_dim = stride_data[i] * (input_dim - 1)
-        + kernel_shape_data[i] - 2 * pad_data[i];
+        + kernel_extent - 2 * pad_data[i];
     this->output_shape_.push_back(output_dim);
   }
 }
diff --git a/src/caffe/layers/elu_layer.cpp b/src/caffe/layers/elu_layer.cpp
new file mode 100644
index 0000000..a0f8763
--- /dev/null
+++ b/src/caffe/layers/elu_layer.cpp
@@ -0,0 +1,47 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/elu_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ELULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  Dtype alpha = this->layer_param_.elu_param().alpha();
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = std::max(bottom_data[i], Dtype(0))
+        + alpha * (exp(std::min(bottom_data[i], Dtype(0))) - Dtype(1));
+  }
+}
+
+template <typename Dtype>
+void ELULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* top_data = top[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    Dtype alpha = this->layer_param_.elu_param().alpha();
+    for (int i = 0; i < count; ++i) {
+      bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
+          + (alpha + top_data[i]) * (bottom_data[i] <= 0));
+    }
+  }
+}
+
+
+#ifdef CPU_ONLY
+STUB_GPU(ELULayer);
+#endif
+
+INSTANTIATE_CLASS(ELULayer);
+REGISTER_LAYER_CLASS(ELU);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/elu_layer.cu b/src/caffe/layers/elu_layer.cu
new file mode 100644
index 0000000..12545aa
--- /dev/null
+++ b/src/caffe/layers/elu_layer.cu
@@ -0,0 +1,62 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/elu_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void ELUForward(const int n, const Dtype* in, Dtype* out,
+    Dtype alpha) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out[index] = in[index] > 0 ? in[index] :
+        alpha * (exp(in[index]) - 1);
+  }
+}
+
+template <typename Dtype>
+void ELULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  Dtype alpha = this->layer_param_.elu_param().alpha();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ELUForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, top_data, alpha);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+template <typename Dtype>
+__global__ void ELUBackward(const int n, const Dtype* in_diff,
+    const Dtype* out_data, const Dtype* in_data,
+    Dtype* out_diff, Dtype alpha) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out_diff[index] = in_data[index] > 0 ? in_diff[index] :
+        in_diff[index] * (out_data[index] + alpha);
+  }
+}
+
+template <typename Dtype>
+void ELULayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* top_data = top[0]->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    Dtype alpha = this->layer_param_.elu_param().alpha();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    ELUBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, top_data, bottom_data, bottom_diff, alpha);
+    CUDA_POST_KERNEL_CHECK;
+  }
+}
+
+
+INSTANTIATE_LAYER_GPU_FUNCS(ELULayer);
+
+
+}  // namespace caffe
diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp
index 651507e..d4ab393 100644
--- a/src/caffe/layers/flatten_layer.cpp
+++ b/src/caffe/layers/flatten_layer.cpp
@@ -7,6 +7,8 @@ namespace caffe {
 template <typename Dtype>
 void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
+      "allow in-place computation.";
   const int start_axis = bottom[0]->CanonicalAxisIndex(
       this->layer_param_.flatten_param().axis());
   const int end_axis = bottom[0]->CanonicalAxisIndex(
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index c12e4f5..2fb9b3c 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -87,6 +87,20 @@ void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
           conv_param.pad((num_pad_dims == 1) ? 0 : i);
     }
   }
+  // Setup dilation dimensions (dilation_).
+  dilation_.Reshape(dim_blob_shape);
+  int* dilation_data = dilation_.mutable_cpu_data();
+  const int num_dilation_dims = conv_param.dilation_size();
+  CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
+        num_dilation_dims == num_spatial_axes_)
+      << "dilation must be specified once, or once per spatial dimension "
+      << "(dilation specified " << num_dilation_dims << " times; "
+      << num_spatial_axes_ << " spatial dims).";
+  const int kDefaultDilation = 1;
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
+                       conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
+  }
 }
 
 template <typename Dtype>
@@ -96,10 +110,12 @@ void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   const int* kernel_shape_data = kernel_shape_.cpu_data();
   const int* stride_data = stride_.cpu_data();
   const int* pad_data = pad_.cpu_data();
+  const int* dilation_data = dilation_.cpu_data();
   for (int i = 0; i < num_spatial_axes_; ++i) {
     top_shape[channel_axis_] *= kernel_shape_data[i];
     const int input_dim = bottom[0]->shape(channel_axis_ + i + 1);
-    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+    const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
+    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent)
         / stride_data[i] + 1;
     top_shape[channel_axis_ + i + 1] = output_dim;
   }
@@ -122,6 +138,7 @@ void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_);
     DCHECK_EQ(pad_.count(), num_spatial_axes_);
     DCHECK_EQ(stride_.count(), num_spatial_axes_);
+    DCHECK_EQ(dilation_.count(), num_spatial_axes_);
     if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
       im2col_cpu(bottom_data + n * bottom_dim_, channels_,
           bottom[0]->shape(channel_axis_ + 1),
@@ -129,13 +146,14 @@ void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1],
           top_data + n * top_dim_);
     } else {
       im2col_nd_cpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
           bottom[0]->shape().data() + channel_axis_,
           top[0]->shape().data() + channel_axis_,
           kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
-          top_data + n * top_dim_);
+          dilation_.cpu_data(), top_data + n * top_dim_);
     }
   }
 }
@@ -153,13 +171,14 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1],
           bottom_diff + n * bottom_dim_);
     } else {
       col2im_nd_cpu(top_diff + n * top_dim_, num_spatial_axes_,
           bottom[0]->shape().data() + channel_axis_,
           top[0]->shape().data() + channel_axis_,
           kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
-          bottom_diff + n * bottom_dim_);
+          dilation_.cpu_data(), bottom_diff + n * bottom_dim_);
     }
   }
 }
diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
index 517b422..792c97f 100644
--- a/src/caffe/layers/im2col_layer.cu
+++ b/src/caffe/layers/im2col_layer.cu
@@ -19,13 +19,14 @@ void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1],
           top_data + n * top_dim_);
     } else {
       im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
           num_kernels, bottom[0]->gpu_shape() + channel_axis_,
           top[0]->gpu_shape() + channel_axis_,
           kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
-          top_data + n * top_dim_);
+          dilation_.gpu_data(), top_data + n * top_dim_);
     }
   }
 }
@@ -43,13 +44,14 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
           kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
           pad_.cpu_data()[0], pad_.cpu_data()[1],
           stride_.cpu_data()[0], stride_.cpu_data()[1],
+          dilation_.cpu_data()[0], dilation_.cpu_data()[1],
           bottom_diff + n * bottom_dim_);
     } else {
       col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_,
           bottom[0]->gpu_shape() + channel_axis_,
           top[0]->gpu_shape() + channel_axis_,
           kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
-          bottom_diff + n * bottom_dim_);
+          dilation_.gpu_data(), bottom_diff + n * bottom_dim_);
     }
   }
 }
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp
index 82339f7..45dd090 100644
--- a/src/caffe/layers/reshape_layer.cpp
+++ b/src/caffe/layers/reshape_layer.cpp
@@ -7,6 +7,8 @@ namespace caffe {
 template <typename Dtype>
 void ReshapeLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
+  CHECK_NE(top[0], bottom[0]) << this->type() << " Layer does not "
+      "allow in-place computation.";
   inferred_axis_ = -1;
   copy_axes_.clear();
   const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape();
diff --git a/src/caffe/layers/scale_layer.cpp b/src/caffe/layers/scale_layer.cpp
new file mode 100644
index 0000000..ecdbb12
--- /dev/null
+++ b/src/caffe/layers/scale_layer.cpp
@@ -0,0 +1,219 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/filler.hpp"
+#include "caffe/layer_factory.hpp"
+#include "caffe/layers/scale_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const ScaleParameter& param = this->layer_param_.scale_param();
+  if (bottom.size() == 1 && this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else if (bottom.size() == 1) {
+    // scale is a learned parameter; initialize it
+    axis_ = bottom[0]->CanonicalAxisIndex(param.axis());
+    const int num_axes = param.num_axes();
+    CHECK_GE(num_axes, -1) << "num_axes must be non-negative, "
+                           << "or -1 to extend to the end of bottom[0]";
+    if (num_axes >= 0) {
+      CHECK_GE(bottom[0]->num_axes(), axis_ + num_axes)
+          << "scale blob's shape extends past bottom[0]'s shape when applied "
+          << "starting with bottom[0] axis = " << axis_;
+    }
+    this->blobs_.resize(1);
+    const vector<int>::const_iterator& shape_start =
+        bottom[0]->shape().begin() + axis_;
+    const vector<int>::const_iterator& shape_end =
+        (num_axes == -1) ? bottom[0]->shape().end() : (shape_start + num_axes);
+    vector<int> scale_shape(shape_start, shape_end);
+    this->blobs_[0].reset(new Blob<Dtype>(scale_shape));
+    FillerParameter filler_param(param.filler());
+    if (!param.has_filler()) {
+      // Default to unit (1) filler for identity operation.
+      filler_param.set_type("constant");
+      filler_param.set_value(1);
+    }
+    shared_ptr<Filler<Dtype> > filler(GetFiller<Dtype>(filler_param));
+    filler->Fill(this->blobs_[0].get());
+  }
+  if (param.bias_term()) {
+    LayerParameter layer_param(this->layer_param_);
+    layer_param.set_type("Bias");
+    BiasParameter* bias_param = layer_param.mutable_bias_param();
+    bias_param->set_axis(param.axis());
+    if (bottom.size() > 1) {
+      bias_param->set_num_axes(bottom[1]->num_axes());
+    } else {
+      bias_param->set_num_axes(param.num_axes());
+    }
+    bias_param->mutable_filler()->CopyFrom(param.bias_filler());
+    bias_layer_ = LayerRegistry<Dtype>::CreateLayer(layer_param);
+    bias_bottom_vec_.resize(1);
+    bias_bottom_vec_[0] = bottom[0];
+    bias_layer_->SetUp(bias_bottom_vec_, top);
+    bias_param_id_ = this->blobs_.size();
+    this->blobs_.resize(bias_param_id_ + 1);
+    this->blobs_[bias_param_id_] = bias_layer_->blobs()[0];
+    bias_propagate_down_.resize(1, false);
+  }
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  const ScaleParameter& param = this->layer_param_.scale_param();
+  Blob<Dtype>* scale = (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
+  // Always set axis_ == 0 in special case where scale is a scalar
+  // (num_axes == 0). Mathematically equivalent for any choice of axis_, so the
+  // actual setting can be safely ignored; and computation is most efficient
+  // with axis_ == 0 and (therefore) outer_dim_ == 1. (Setting axis_ to
+  // bottom[0]->num_axes() - 1, giving inner_dim_ == 1, would be equally
+  // performant.)
+  axis_ = (scale->num_axes() == 0) ?
+      0 : bottom[0]->CanonicalAxisIndex(param.axis());
+  CHECK_GE(bottom[0]->num_axes(), axis_ + scale->num_axes())
+      << "scale blob's shape extends past bottom[0]'s shape when applied "
+      << "starting with bottom[0] axis = " << axis_;
+  for (int i = 0; i < scale->num_axes(); ++i) {
+    CHECK_EQ(bottom[0]->shape(axis_ + i), scale->shape(i))
+        << "dimension mismatch between bottom[0]->shape(" << axis_ + i
+        << ") and scale->shape(" << i << ")";
+  }
+  outer_dim_ = bottom[0]->count(0, axis_);
+  scale_dim_ = scale->count();
+  inner_dim_ = bottom[0]->count(axis_ + scale->num_axes());
+  if (bottom[0] == top[0]) {  // in-place computation
+    temp_.ReshapeLike(*bottom[0]);
+  } else {
+    top[0]->ReshapeLike(*bottom[0]);
+  }
+  sum_result_.Reshape(vector<int>(1, outer_dim_ * scale_dim_));
+  const int sum_mult_size = std::max(outer_dim_, inner_dim_);
+  sum_multiplier_.Reshape(vector<int>(1, sum_mult_size));
+  if (sum_multiplier_.cpu_data()[sum_mult_size - 1] != Dtype(1)) {
+    caffe_set(sum_mult_size, Dtype(1), sum_multiplier_.mutable_cpu_data());
+  }
+  if (bias_layer_) {
+    bias_bottom_vec_[0] = top[0];
+    bias_layer_->Reshape(bias_bottom_vec_, top);
+  }
+}
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  if (bottom[0] == top[0]) {
+    // In-place computation; need to store bottom data before overwriting it.
+    // Note that this is only necessary for Backward; we could skip this if not
+    // doing Backward, but Caffe currently provides no way of knowing whether
+    // we'll need to do Backward at the time of the Forward call.
+    caffe_copy(bottom[0]->count(), bottom[0]->cpu_data(),
+               temp_.mutable_cpu_data());
+  }
+  const Dtype* scale_data =
+      ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int n = 0; n < outer_dim_; ++n) {
+    for (int d = 0; d < scale_dim_; ++d) {
+      const Dtype factor = scale_data[d];
+      caffe_cpu_scale(inner_dim_, factor, bottom_data, top_data);
+      bottom_data += inner_dim_;
+      top_data += inner_dim_;
+    }
+  }
+  if (bias_layer_) {
+    bias_layer_->Forward(bias_bottom_vec_, top);
+  }
+}
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bias_layer_ &&
+      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
+    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
+  }
+  const bool scale_param = (bottom.size() == 1);
+  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
+  if ((!scale_param && propagate_down[1]) ||
+      (scale_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const bool in_place = (bottom[0] == top[0]);
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->cpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scale diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
+    const bool is_eltwise = (bottom[0]->count() == scale->count());
+    Dtype* product = (is_eltwise ? scale->mutable_cpu_diff() :
+        (in_place ? temp_.mutable_cpu_data() : bottom[0]->mutable_cpu_diff()));
+    caffe_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_param) {
+          Dtype result = caffe_cpu_dot(inner_dim_, product, sum_mult);
+          *scale_diff += result;
+        } else {
+          *scale_diff = caffe_cpu_dot(inner_dim_, product, sum_mult);
+        }
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            scale->mutable_cpu_diff() : sum_result_.mutable_cpu_data();
+        caffe_cpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.cpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_dim_ == 1) {
+          if (scale_param) {
+            Dtype result = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+            *scale_diff += result;
+          } else {
+            *scale_diff = caffe_cpu_dot(outer_dim_, sum_mult, sum_result);
+          }
+        } else {
+          caffe_cpu_gemv(CblasTrans, outer_dim_, scale_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
+                         scale_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* scale_data = scale->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    for (int n = 0; n < outer_dim_; ++n) {
+      for (int d = 0; d < scale_dim_; ++d) {
+        const Dtype factor = scale_data[d];
+        caffe_cpu_scale(inner_dim_, factor, top_diff, bottom_diff);
+        bottom_diff += inner_dim_;
+        top_diff += inner_dim_;
+      }
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(ScaleLayer);
+#endif
+
+INSTANTIATE_CLASS(ScaleLayer);
+REGISTER_LAYER_CLASS(Scale);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/scale_layer.cu b/src/caffe/layers/scale_layer.cu
new file mode 100644
index 0000000..fc9a806
--- /dev/null
+++ b/src/caffe/layers/scale_layer.cu
@@ -0,0 +1,135 @@
+#include <cfloat>
+#include <vector>
+
+#include "caffe/layers/scale_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void ScaleForward(const int n, const Dtype* in,
+    const Dtype* scale, const int scale_dim, const int inner_dim,
+    Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int scale_index = (index / inner_dim) % scale_dim;
+    out[index] = in[index] * scale[scale_index];
+  }
+}
+
+template <typename Dtype>
+__global__ void ScaleBiasForward(const int n, const Dtype* in,
+    const Dtype* scale, const Dtype* bias,
+    const int scale_dim, const int inner_dim, Dtype* out) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const int scale_index = (index / inner_dim) % scale_dim;
+    out[index] = in[index] * scale[scale_index] + bias[scale_index];
+  }
+}
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int count = top[0]->count();
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  if (bottom[0] == top[0]) {
+    // in-place computation; need to store bottom data before overwriting it.
+    // Note that this is only necessary for Backward; we could skip this if not
+    // doing Backward, but Caffe currently provides no way of knowing whether
+    // we'll need to do Backward at the time of the Forward call.
+    caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(),
+               temp_.mutable_gpu_data());
+  }
+  const Dtype* scale_data =
+      ((bottom.size() > 1) ? bottom[1] : this->blobs_[0].get())->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  if (bias_layer_) {
+    const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data();
+    ScaleBiasForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, scale_data, bias_data, scale_dim_, inner_dim_,
+        top_data);
+  } else {
+    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data);
+  }
+}
+
+template <typename Dtype>
+void ScaleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bias_layer_ &&
+      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
+    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
+  }
+  const bool scale_param = (bottom.size() == 1);
+  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
+  if ((!scale_param && propagate_down[1]) ||
+      (scale_param && this->param_propagate_down_[0])) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const bool in_place = (bottom[0] == top[0]);
+    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data();
+    // Hack: store big eltwise product in bottom[0] diff, except in the special
+    // case where this layer itself does the eltwise product, in which case we
+    // can store it directly in the scale diff, and we're done.
+    // If we're computing in-place (and not doing eltwise computation), this
+    // hack doesn't work and we store the product in temp_.
+    const bool is_eltwise = (bottom[0]->count() == scale->count());
+    Dtype* product = (is_eltwise ? scale->mutable_gpu_diff() :
+        (in_place ? temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff()));
+    caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
+    if (!is_eltwise) {
+      Dtype* sum_result = NULL;
+      if (inner_dim_ == 1) {
+        sum_result = product;
+      } else if (sum_result_.count() == 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        Dtype* scale_diff = scale->mutable_cpu_diff();
+        if (scale_param) {
+          Dtype result;
+          caffe_gpu_dot(inner_dim_, product, sum_mult, &result);
+          *scale_diff += result;
+        } else {
+          caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff);
+        }
+      } else {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        sum_result = (outer_dim_ == 1) ?
+            scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data();
+        caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
+                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
+      }
+      if (outer_dim_ != 1) {
+        const Dtype* sum_mult = sum_multiplier_.gpu_data();
+        if (scale_dim_ == 1) {
+          Dtype* scale_diff = scale->mutable_cpu_diff();
+          if (scale_param) {
+            Dtype result;
+            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result);
+            *scale_diff += result;
+          } else {
+            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff);
+          }
+        } else {
+          Dtype* scale_diff = scale->mutable_gpu_diff();
+          caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_,
+                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
+                         scale_diff);
+        }
+      }
+    }
+  }
+  if (propagate_down[0]) {
+    const int count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* scale_data = scale->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, scale_data, scale_dim_, inner_dim_, bottom_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(ScaleLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 787369f..6493a72 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -306,7 +306,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 140 (last added: batch_norm_param)
+// LayerParameter next available layer-specific ID: 143 (last added: scale_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -356,6 +356,7 @@ message LayerParameter {
   optional AccuracyParameter accuracy_param = 102;
   optional ArgMaxParameter argmax_param = 103;
   optional BatchNormParameter batch_norm_param = 139;
+  optional BiasParameter bias_param = 141;
   optional ConcatParameter concat_param = 104;
   optional ContrastiveLossParameter contrastive_loss_param = 105;
   optional ConvolutionParameter convolution_param = 106;
@@ -363,6 +364,7 @@ message LayerParameter {
   optional DropoutParameter dropout_param = 108;
   optional DummyDataParameter dummy_data_param = 109;
   optional EltwiseParameter eltwise_param = 110;
+  optional ELUParameter elu_param = 140;
   optional EmbedParameter embed_param = 137;
   optional ExpParameter exp_param = 111;
   optional FlattenParameter flatten_param = 135;
@@ -383,6 +385,7 @@ message LayerParameter {
   optional ReductionParameter reduction_param = 136;
   optional ReLUParameter relu_param = 123;
   optional ReshapeParameter reshape_param = 133;
+  optional ScaleParameter scale_param = 142;
   optional SigmoidParameter sigmoid_param = 124;
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
@@ -497,6 +500,38 @@ message BatchNormParameter {
   optional float eps = 3 [default = 1e-5];
 }
 
+message BiasParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar bias.
+  optional int32 axis = 1 [default = 1];
+
+  // (num_axes is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the bias
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to add a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.)
+  // The initialization for the learned bias parameter.
+  // Default is the zero (0) initialization, resulting in the BiasLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+}
+
 message ContrastiveLossParameter {
   // margin for dissimilar pair
   optional float margin = 1 [default = 1.0];
@@ -518,6 +553,10 @@ message ConvolutionParameter {
   repeated uint32 pad = 3; // The padding size; defaults to 0
   repeated uint32 kernel_size = 4; // The kernel size
   repeated uint32 stride = 6; // The stride; defaults to 1
+  // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
+  // holes. (Kernel dilation is sometimes referred to by its use in the
+  // algorithme à trous from Holschneider et al. 1987.)
+  repeated uint32 dilation = 18; // The dilation; defaults to 1
 
   // For 2D convolution only, the *_h and *_w versions may also be used to
   // specify both spatial dimensions.
@@ -629,6 +668,14 @@ message EltwiseParameter {
   optional bool stable_prod_grad = 3 [default = true];
 }
 
+// Message that stores parameters used by ELULayer
+message ELUParameter {
+  // Described in:
+  // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate
+  // Deep Network Learning by Exponential Linear Units (ELUs). arXiv
+  optional float alpha = 1 [default = 1];
+}
+
 // Message that stores parameters used by EmbedLayer
 message EmbedParameter {
   optional uint32 num_output = 1; // The number of outputs for the layer
@@ -947,6 +994,43 @@ message ReshapeParameter {
   optional int32 num_axes = 3 [default = -1];
 }
 
+message ScaleParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar multiplier.
+  optional int32 axis = 1 [default = 1];
+
+  // (num_axes is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the scale
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [default = 1];
+
+  // (filler is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.)
+  // The initialization for the learned scale parameter.
+  // Default is the unit (1) initialization, resulting in the ScaleLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+
+  // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
+  // may be more efficient).  Initialized with bias_filler (defaults to 0).
+  optional bool bias_term = 4 [default = false];
+  optional FillerParameter bias_filler = 5;
+}
+
 message SigmoidParameter {
   enum Engine {
     DEFAULT = 0;
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 95d7506..a5ccf9c 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -196,8 +196,8 @@ void Solver<Dtype>::Step(int iters) {
   const int start_iter = iter_;
   const int stop_iter = iter_ + iters;
   int average_loss = this->param_.average_loss();
-  vector<Dtype> losses;
-  Dtype smoothed_loss = 0;
+  losses_.clear();
+  smoothed_loss_ = 0;
 
   while (iter_ < stop_iter) {
     // zero-init the params
@@ -224,18 +224,10 @@ void Solver<Dtype>::Step(int iters) {
     }
     loss /= param_.iter_size();
     // average the loss across iterations for smoothed reporting
-    if (losses.size() < average_loss) {
-      losses.push_back(loss);
-      int size = losses.size();
-      smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
-    } else {
-      int idx = (iter_ - start_iter) % average_loss;
-      smoothed_loss += (loss - losses[idx]) / average_loss;
-      losses[idx] = loss;
-    }
+    UpdateSmoothedLoss(loss, start_iter, average_loss);
     if (display) {
       LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
-          << ", loss = " << smoothed_loss;
+          << ", loss = " << smoothed_loss_;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
       int score_index = 0;
       for (int j = 0; j < result.size(); ++j) {
@@ -298,6 +290,7 @@ void Solver<Dtype>::Solve(const char* resume_file) {
 
   // For a network that is trained by the solver, no bottom or top vecs
   // should be given, and we will just provide dummy vecs.
+  int start_iter = iter_;
   Step(param_.max_iter() - iter_);
   // If we haven't already, save a snapshot after optimization, unless
   // overridden by setting snapshot_after_train := false
@@ -316,9 +309,13 @@ void Solver<Dtype>::Solve(const char* resume_file) {
   // updated the parameters "max_iter" times -- this final pass is only done to
   // display the loss, which is computed in the forward pass.
   if (param_.display() && iter_ % param_.display() == 0) {
+    int average_loss = this->param_.average_loss();
     Dtype loss;
     net_->ForwardPrefilled(&loss);
-    LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
+
+    UpdateSmoothedLoss(loss, start_iter, average_loss);
+
+    LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_;
   }
   if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
     TestAll();
@@ -483,6 +480,20 @@ void Solver<Dtype>::Restore(const char* state_file) {
   }
 }
 
+template <typename Dtype>
+void Solver<Dtype>::UpdateSmoothedLoss(Dtype loss, int start_iter,
+    int average_loss) {
+  if (losses_.size() < average_loss) {
+    losses_.push_back(loss);
+    int size = losses_.size();
+    smoothed_loss_ = (smoothed_loss_ * (size - 1) + loss) / size;
+  } else {
+    int idx = (iter_ - start_iter) % average_loss;
+    smoothed_loss_ += (loss - losses_[idx]) / average_loss;
+    losses_[idx] = loss;
+  }
+}
+
 INSTANTIATE_CLASS(Solver);
 
 }  // namespace caffe
diff --git a/src/caffe/solvers/adadelta_solver.cpp b/src/caffe/solvers/adadelta_solver.cpp
index a37899e..fd30f19 100644
--- a/src/caffe/solvers/adadelta_solver.cpp
+++ b/src/caffe/solvers/adadelta_solver.cpp
@@ -16,6 +16,12 @@ void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
   }
 }
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum,
+    Dtype delta, Dtype local_rate);
+#endif
+
 template <typename Dtype>
 void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
@@ -85,61 +91,11 @@ void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history of gradients
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->gpu_data(), momentum,
-        this->history_[param_id]->mutable_gpu_data());
-
-    // add delta to history to guard against dividing by zero later
-    caffe_gpu_set(net_params[param_id]->count(), delta,
-        this->temp_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->temp_[param_id]->gpu_data(),
-        this->history_[update_history_offset + param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->temp_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->temp_[param_id]->mutable_gpu_data());
-
-    // divide history of updates by history of gradients
-    caffe_gpu_div(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->temp_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // jointly compute the RMS of both for update and gradient history
-    caffe_gpu_powx(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // compute the update and copy to net_diff
-    caffe_gpu_mul(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
-
-    // compute square of update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history of updates
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-        this->update_[param_id]->gpu_data(), momentum,
-        this->history_[update_history_offset + param_id]->mutable_gpu_data());
-
-    // apply learning rate
-    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
-        net_params[param_id]->gpu_diff(),
-        net_params[param_id]->mutable_gpu_diff());
+    adadelta_update_gpu(net_params[param_id]->count(),
+        net_params[param_id]->mutable_gpu_diff(),
+        this->history_[param_id]->mutable_gpu_data(),
+        this->history_[update_history_offset + param_id]->mutable_gpu_data(),
+        momentum, delta, local_rate);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/adadelta_solver.cu b/src/caffe/solvers/adadelta_solver.cu
new file mode 100644
index 0000000..6c94585
--- /dev/null
+++ b/src/caffe/solvers/adadelta_solver.cu
@@ -0,0 +1,30 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void AdaDeltaUpdate(int N, Dtype* g, Dtype* h, Dtype* h2,
+    Dtype momentum, Dtype delta, Dtype local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    float gi = g[i];
+    float hi = h[i] = momentum * h[i] + (1-momentum) * gi * gi;
+    gi = gi * sqrt((h2[i] + delta) / (hi + delta));
+    h2[i] = momentum * h2[i] + (1-momentum) * gi * gi;
+    g[i] = local_rate * gi;
+  }
+}
+template <typename Dtype>
+void adadelta_update_gpu(int N, Dtype* g, Dtype* h, Dtype* h2, Dtype momentum,
+    Dtype delta, Dtype local_rate) {
+  AdaDeltaUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, h, h2, momentum, delta, local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void adadelta_update_gpu<float>(int , float*, float*, float*,
+    float, float, float);
+template void adadelta_update_gpu<double>(int, double*, double*, double*,
+    double, double, double);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp
index 5e40632..e78eadc 100644
--- a/src/caffe/solvers/adagrad_solver.cpp
+++ b/src/caffe/solvers/adagrad_solver.cpp
@@ -4,6 +4,12 @@
 
 namespace caffe {
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta,
+    Dtype local_rate);
+#endif
+
 template <typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   CHECK(Caffe::root_solver());
@@ -45,34 +51,9 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_add(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        this->history_[param_id]->gpu_data(),
-        this->history_[param_id]->mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
-              this->history_[param_id]->gpu_data(), Dtype(0.5),
-              this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-              delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-              net_params[param_id]->gpu_diff(),
-              this->update_[param_id]->gpu_data(),
-              this->update_[param_id]->mutable_gpu_data());
-
-    // scale and copy
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
+    adagrad_update_gpu(net_params[param_id]->count(),
+        net_params[param_id]->mutable_gpu_diff(),
+        this->history_[param_id]->mutable_gpu_data(), delta, local_rate);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/adagrad_solver.cu b/src/caffe/solvers/adagrad_solver.cu
new file mode 100644
index 0000000..adefd55
--- /dev/null
+++ b/src/caffe/solvers/adagrad_solver.cu
@@ -0,0 +1,26 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void AdaGradUpdate(int N, Dtype* g, Dtype* h, Dtype delta,
+    Dtype local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    float gi = g[i];
+    float hi = h[i] = h[i] + gi*gi;
+    g[i] = local_rate * gi / (sqrt(hi) + delta);
+  }
+}
+template <typename Dtype>
+void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta,
+    Dtype local_rate) {
+  AdaGradUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, h, delta, local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void adagrad_update_gpu<float>(int, float*, float*, float, float);
+template void adagrad_update_gpu<double>(int, double*, double*, double, double);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/adam_solver.cpp b/src/caffe/solvers/adam_solver.cpp
index cb0fbfe..4a91f00 100644
--- a/src/caffe/solvers/adam_solver.cpp
+++ b/src/caffe/solvers/adam_solver.cpp
@@ -16,6 +16,12 @@ void AdamSolver<Dtype>::AdamPreSolve() {
   }
 }
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1,
+    Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate);
+#endif
+
 template <typename Dtype>
 void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
@@ -30,7 +36,7 @@ void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
   Blob<Dtype>* val_t = this->temp_[param_id].get();
 
-  const int t = this->iter_  + 1;
+  const int t = this->iter_ + 1;
   const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
       (Dtype(1.) - pow(beta1, t));
   const int N = net_params[param_id]->count();
@@ -69,34 +75,9 @@ void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
-    caffe_gpu_axpby(N, Dtype(1)-beta1,
-        net_params[param_id]->gpu_diff(), beta1,
-        val_m->mutable_gpu_data());
-
-    // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2
-    caffe_gpu_mul(N,
-        net_params[param_id]->gpu_diff(),
-        net_params[param_id]->gpu_diff(),
-        val_t->mutable_gpu_data());
-    caffe_gpu_axpby(N, Dtype(1)-beta2,
-        val_t->gpu_data(), beta2,
-        val_v->mutable_gpu_data());
-
-    // set update
-    caffe_gpu_powx(N,
-        val_v->gpu_data(), Dtype(0.5),
-        val_t->mutable_gpu_data());
-    caffe_gpu_add_scalar(N, eps_hat,
-        val_t->mutable_gpu_data());
-    caffe_gpu_div(N,
-        val_m->gpu_data(),
-        val_t->gpu_data(),
-        val_t->mutable_gpu_data());
-
-    caffe_gpu_scale(N, local_rate*correction,
-        val_t->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+    adam_update_gpu(N, net_params[param_id]->mutable_gpu_diff(),
+        val_m->mutable_gpu_data(), val_v->mutable_gpu_data(), beta1, beta2,
+        eps_hat, local_rate*correction);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/adam_solver.cu b/src/caffe/solvers/adam_solver.cu
new file mode 100644
index 0000000..917ae10
--- /dev/null
+++ b/src/caffe/solvers/adam_solver.cu
@@ -0,0 +1,29 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void AdamUpdate(int N, Dtype* g, Dtype* m, Dtype* v,
+    Dtype beta1, Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    float gi = g[i];
+    float mi = m[i] = m[i]*beta1 + gi*(1-beta1);
+    float vi = v[i] = v[i]*beta2 + gi*gi*(1-beta2);
+    g[i] = corrected_local_rate * mi / (sqrt(vi) + eps_hat);
+  }
+}
+template <typename Dtype>
+void adam_update_gpu(int N, Dtype* g, Dtype* m, Dtype* v, Dtype beta1,
+    Dtype beta2, Dtype eps_hat, Dtype corrected_local_rate) {
+  AdamUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, m, v, beta1, beta2, eps_hat, corrected_local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void adam_update_gpu<float>(int, float*, float*, float*,
+    float, float, float, float);
+template void adam_update_gpu<double>(int, double*, double*, double*,
+    double, double, double, double);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp
index 34bf01e..23ab2d4 100644
--- a/src/caffe/solvers/nesterov_solver.cpp
+++ b/src/caffe/solvers/nesterov_solver.cpp
@@ -4,6 +4,12 @@
 
 namespace caffe {
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
+    Dtype local_rate);
+#endif
+
 template <typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   CHECK(Caffe::root_solver());
@@ -36,25 +42,10 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
-    // save history momentum for stepping back
-    caffe_copy(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              this->history_[param_id]->mutable_gpu_data());
-
-    // compute update: step back then over step
-    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum,
-        this->history_[param_id]->gpu_data(), -momentum,
-        this->update_[param_id]->mutable_gpu_data());
-
-    // copy
-    caffe_copy(net_params[param_id]->count(),
-        this->update_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+    nesterov_update_gpu(net_params[param_id]->count(),
+        net_params[param_id]->mutable_gpu_diff(),
+        this->history_[param_id]->mutable_gpu_data(),
+        momentum, local_rate);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/nesterov_solver.cu b/src/caffe/solvers/nesterov_solver.cu
new file mode 100644
index 0000000..57a456b
--- /dev/null
+++ b/src/caffe/solvers/nesterov_solver.cu
@@ -0,0 +1,27 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void NesterovUpdate(int N, Dtype* g, Dtype* h,
+    Dtype momentum, Dtype local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    float hi = h[i];
+    float hi_new = h[i] = momentum * hi + local_rate * g[i];
+    g[i] = (1+momentum) * hi_new - momentum * hi;
+  }
+}
+template <typename Dtype>
+void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
+    Dtype local_rate) {
+  NesterovUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, h, momentum, local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void nesterov_update_gpu<float>(int, float*, float*, float, float);
+template void nesterov_update_gpu<double>(int, double*, double*, double,
+    double);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/rmsprop_solver.cpp b/src/caffe/solvers/rmsprop_solver.cpp
index c624767..3251ee4 100644
--- a/src/caffe/solvers/rmsprop_solver.cpp
+++ b/src/caffe/solvers/rmsprop_solver.cpp
@@ -4,6 +4,12 @@
 
 namespace caffe {
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay,
+    Dtype delta, Dtype local_rate);
+#endif
+
 template <typename Dtype>
 void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
@@ -45,31 +51,10 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
     break;
   case Caffe::GPU:
 #ifndef CPU_ONLY
-    // compute square of gradient in update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), Dtype(2),
-        this->update_[param_id]->mutable_gpu_data());
-
-    // update history
-    caffe_gpu_axpby(net_params[param_id] -> count(),
-        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
-        rms_decay, this->history_[param_id]-> mutable_gpu_data());
-
-    // prepare update
-    caffe_gpu_powx(net_params[param_id]->count(),
-        this->history_[param_id]->gpu_data(), Dtype(0.5),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_add_scalar(net_params[param_id]->count(),
-        delta, this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_div(net_params[param_id]->count(),
-        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
-        this->update_[param_id]->mutable_gpu_data());
-
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-        this->update_[param_id]->gpu_data(), Dtype(0),
-        net_params[param_id]->mutable_gpu_diff());
+    rmsprop_update_gpu(net_params[param_id]->count(),
+        net_params[param_id]->mutable_gpu_diff(),
+        this->history_[param_id]->mutable_gpu_data(),
+        rms_decay, delta, local_rate);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/rmsprop_solver.cu b/src/caffe/solvers/rmsprop_solver.cu
new file mode 100644
index 0000000..c5ffd32
--- /dev/null
+++ b/src/caffe/solvers/rmsprop_solver.cu
@@ -0,0 +1,28 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void RMSPropUpdate(int N, Dtype* g, Dtype* h,
+    Dtype rms_decay, Dtype delta, Dtype local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    float gi = g[i];
+    float hi = h[i] = rms_decay*h[i] + (1-rms_decay)*gi*gi;
+    g[i] = local_rate * g[i] / (sqrt(hi) + delta);
+  }
+}
+template <typename Dtype>
+void rmsprop_update_gpu(int N, Dtype* g, Dtype* h, Dtype rms_decay,
+    Dtype delta, Dtype local_rate) {
+  RMSPropUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, h, rms_decay, delta, local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void rmsprop_update_gpu<float>(int, float*, float*, float, float,
+    float);
+template void rmsprop_update_gpu<double>(int, double*, double*, double, double,
+    double);
+
+}  // namespace caffe
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index 32bf19b..f30f316 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -203,6 +203,12 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
   }
 }
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void sgd_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
+    Dtype local_rate);
+#endif
+
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
@@ -222,12 +228,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
   case Caffe::GPU: {
 #ifndef CPU_ONLY
-    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
-              net_params[param_id]->gpu_diff(), momentum,
-              history_[param_id]->mutable_gpu_data());
-    caffe_copy(net_params[param_id]->count(),
-        history_[param_id]->gpu_data(),
-        net_params[param_id]->mutable_gpu_diff());
+    sgd_update_gpu(net_params[param_id]->count(),
+        net_params[param_id]->mutable_gpu_diff(),
+        history_[param_id]->mutable_gpu_data(),
+        momentum, local_rate);
 #else
     NO_GPU;
 #endif
diff --git a/src/caffe/solvers/sgd_solver.cu b/src/caffe/solvers/sgd_solver.cu
new file mode 100644
index 0000000..e541035
--- /dev/null
+++ b/src/caffe/solvers/sgd_solver.cu
@@ -0,0 +1,24 @@
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void SGDUpdate(int N, Dtype* g, Dtype* h,
+    Dtype momentum, Dtype local_rate) {
+  CUDA_KERNEL_LOOP(i, N) {
+    g[i] = h[i] = momentum*h[i] + local_rate*g[i];
+  }
+}
+template <typename Dtype>
+void sgd_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
+    Dtype local_rate) {
+  SGDUpdate<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, g, h, momentum, local_rate);
+  CUDA_POST_KERNEL_CHECK;
+}
+template void sgd_update_gpu<float>(int, float*, float*, float, float);
+template void sgd_update_gpu<double>(int, double*, double*, double, double);
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_bias_layer.cpp b/src/caffe/test/test_bias_layer.cpp
new file mode 100644
index 0000000..3862e76
--- /dev/null
+++ b/src/caffe/test/test_bias_layer.cpp
@@ -0,0 +1,467 @@
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/bias_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class BiasLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  BiasLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_eltwise_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_broadcast_0_(new Blob<Dtype>()),
+        blob_bottom_broadcast_1_(new Blob<Dtype>()),
+        blob_bottom_broadcast_2_(new Blob<Dtype>()),
+        blob_bottom_bias_(new Blob<Dtype>(vector<int>())),
+        blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);
+    vector<int> broadcast_shape(2);
+    broadcast_shape[0] = 2; broadcast_shape[1] = 3;
+    this->blob_bottom_broadcast_0_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 3; broadcast_shape[1] = 4;
+    this->blob_bottom_broadcast_1_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 4; broadcast_shape[1] = 5;
+    this->blob_bottom_broadcast_2_->Reshape(broadcast_shape);
+    FillerParameter filler_param;
+    filler_param.set_min(1);
+    filler_param.set_max(10);
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_bottom_eltwise_);
+    filler.Fill(this->blob_bottom_broadcast_0_);
+    filler.Fill(this->blob_bottom_broadcast_1_);
+    filler.Fill(this->blob_bottom_broadcast_2_);
+    filler.Fill(this->blob_bottom_bias_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~BiasLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_eltwise_;
+    delete blob_bottom_broadcast_0_;
+    delete blob_bottom_broadcast_1_;
+    delete blob_bottom_broadcast_2_;
+    delete blob_bottom_bias_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_eltwise_;
+  Blob<Dtype>* const blob_bottom_broadcast_0_;
+  Blob<Dtype>* const blob_bottom_broadcast_1_;
+  Blob<Dtype>* const blob_bottom_broadcast_2_;
+  Blob<Dtype>* const blob_bottom_bias_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(BiasLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(BiasLayerTest, TestForwardEltwise) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data_a = this->blob_bottom_->cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_bottom_->cpu_data();
+  const int count = this->blob_bottom_->count();
+  const Dtype* in_data_a = orig_bottom.cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestBackwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_bias_diff;
+  orig_bias_diff.CopyFrom(*this->blob_bottom_eltwise_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) {
+    EXPECT_NEAR(orig_bias_diff.cpu_diff()[i],
+                this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardEltwiseWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BiasParameter* bias_param = layer_param.mutable_bias_param();
+  bias_param->set_axis(0);
+  bias_param->set_num_axes(-1);
+  bias_param->mutable_filler()->set_type("gaussian");
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data_a = this->blob_bottom_->cpu_data();
+  const Dtype* in_data_b = layer->blobs()[0]->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) +
+                      this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) +
+                      this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w),
+                      orig_bottom.data_at(n, c, h, w) +
+                      this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestBackwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_bias_diff;
+  orig_bias_diff.CopyFrom(*this->blob_bottom_broadcast_1_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) {
+    EXPECT_NEAR(orig_bias_diff.cpu_diff()[i],
+                this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BiasParameter* bias_param = layer_param.mutable_bias_param();
+  bias_param->set_axis(1);
+  bias_param->set_num_axes(2);
+  bias_param->mutable_filler()->set_type("gaussian");
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) +
+                      layer->blobs()[0]->data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBroadcastEnd) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(2);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) +
+                      this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter layer_param;
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data = this->blob_bottom_->cpu_data();
+  const Dtype bias = *this->blob_bottom_bias_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data[i] + bias, 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestForwardBiasAxis2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(2);
+  shared_ptr<BiasLayer<Dtype> > layer(new BiasLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data = this->blob_bottom_->cpu_data();
+  const Dtype bias = *this->blob_bottom_bias_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data[i] + bias, 1e-5);
+  }
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientEltwise) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientEltwiseWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BiasParameter* bias_param = layer_param.mutable_bias_param();
+  bias_param->set_axis(0);
+  bias_param->set_num_axes(-1);
+  bias_param->mutable_filler()->set_type("gaussian");
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(0);
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(1);
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  BiasParameter* bias_param = layer_param.mutable_bias_param();
+  bias_param->set_axis(1);
+  bias_param->set_num_axes(2);
+  bias_param->mutable_filler()->set_type("gaussian");
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBroadcastEnd) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(2);
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter layer_param;
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(BiasLayerTest, TestGradientBiasAxis2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_bias_);
+  LayerParameter layer_param;
+  layer_param.mutable_bias_param()->set_axis(2);
+  BiasLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index e2d43f3..9bb19d1 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -46,13 +46,17 @@ void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
   } else {
     stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
   }
-  int kernel_d, pad_d, stride_d;
+  int dilation_h, dilation_w;
+  dilation_h = dilation_w = conv_param->dilation_size() ?
+                            conv_param->dilation(0) : 1;
+  int kernel_d, pad_d, stride_d, dilation_d;
   if (has_depth) {
     kernel_d = kernel_h;
     stride_d = stride_h;
     pad_d = pad_h;
+    dilation_d = dilation_h;
   } else {
-    kernel_d = stride_d = 1;
+    kernel_d = stride_d = dilation_d = 1;
     pad_d = 0;
   }
   // Groups
@@ -77,9 +81,9 @@ void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
                 for (int r = 0; r < kernel_d; r++) {
                   for (int p = 0; p < kernel_h; p++) {
                     for (int q = 0; q < kernel_w; q++) {
-                      int in_z = z * stride_d - pad_d + r;
-                      int in_y = y * stride_h - pad_h + p;
-                      int in_x = x * stride_w - pad_w + q;
+                      int in_z = z * stride_d - pad_d + r * dilation_d;
+                      int in_y = y * stride_h - pad_h + p * dilation_h;
+                      int in_x = x * stride_w - pad_w + q * dilation_w;
                       if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
                           && in_y >= 0 && in_y < in->shape(2 + has_depth)
                           && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
@@ -260,6 +264,50 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
   }
 }
 
+TYPED_TEST(ConvolutionLayerTest, TestDilatedConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(8);
+  bottom_shape.push_back(7);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new ConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
 TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -345,6 +393,53 @@ TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) {
   }
 }
 
+TYPED_TEST(ConvolutionLayerTest, TestDilated3DConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 6;
+  bottom_shape[3] = 7;
+  bottom_shape[4] = 8;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new ConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+             this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
 TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -629,6 +724,30 @@ TYPED_TEST(ConvolutionLayerTest, TestGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(ConvolutionLayerTest, TestDilatedGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(5);
+  bottom_shape.push_back(6);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+  }
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_dilation(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  ConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+                                  this->blob_top_vec_);
+}
+
 TYPED_TEST(ConvolutionLayerTest, TestGradient3D) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp
index 8a10137..6103918 100644
--- a/src/caffe/test/test_data_transformer.cpp
+++ b/src/caffe/test/test_data_transformer.cpp
@@ -40,23 +40,21 @@ class DataTransformTest : public ::testing::Test {
   int NumSequenceMatches(const TransformationParameter transform_param,
       const Datum& datum, Phase phase) {
     // Get crop sequence with Caffe seed 1701.
-    DataTransformer<Dtype>* transformer =
-        new DataTransformer<Dtype>(transform_param, phase);
+    DataTransformer<Dtype> transformer(transform_param, phase);
     const int crop_size = transform_param.crop_size();
     Caffe::set_random_seed(seed_);
-    transformer->InitRand();
-    Blob<Dtype>* blob =
-        new Blob<Dtype>(1, datum.channels(), datum.height(), datum.width());
+    transformer.InitRand();
+    Blob<Dtype> blob(1, datum.channels(), datum.height(), datum.width());
     if (transform_param.crop_size() > 0) {
-      blob->Reshape(1, datum.channels(), crop_size, crop_size);
+      blob.Reshape(1, datum.channels(), crop_size, crop_size);
     }
 
     vector<vector<Dtype> > crop_sequence;
     for (int iter = 0; iter < this->num_iter_; ++iter) {
       vector<Dtype> iter_crop_sequence;
-      transformer->Transform(datum, blob);
-      for (int j = 0; j < blob->count(); ++j) {
-        iter_crop_sequence.push_back(blob->cpu_data()[j]);
+      transformer.Transform(datum, &blob);
+      for (int j = 0; j < blob.count(); ++j) {
+        iter_crop_sequence.push_back(blob.cpu_data()[j]);
       }
       crop_sequence.push_back(iter_crop_sequence);
     }
@@ -64,17 +62,14 @@ class DataTransformTest : public ::testing::Test {
     int num_sequence_matches = 0;
     for (int iter = 0; iter < this->num_iter_; ++iter) {
       vector<Dtype> iter_crop_sequence = crop_sequence[iter];
-      transformer->Transform(datum, blob);
-      for (int j = 0; j < blob->count(); ++j) {
-        num_sequence_matches +=
-            (crop_sequence[iter][j] == blob->cpu_data()[j]);
+      transformer.Transform(datum, &blob);
+      for (int j = 0; j < blob.count(); ++j) {
+        num_sequence_matches += (crop_sequence[iter][j] == blob.cpu_data()[j]);
       }
     }
     return num_sequence_matches;
   }
 
-  virtual ~DataTransformTest() { }
-
   int seed_;
   int num_iter_;
 };
@@ -91,17 +86,16 @@ TYPED_TEST(DataTransformTest, TestEmptyTransform) {
 
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  Blob<TypeParam>* blob = new Blob<TypeParam>(1, channels, height, width);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  transformer->Transform(datum, blob);
-  EXPECT_EQ(blob->num(), 1);
-  EXPECT_EQ(blob->channels(), datum.channels());
-  EXPECT_EQ(blob->height(), datum.height());
-  EXPECT_EQ(blob->width(), datum.width());
-  for (int j = 0; j < blob->count(); ++j) {
-    EXPECT_EQ(blob->cpu_data()[j], label);
+  Blob<TypeParam> blob(1, channels, height, width);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  transformer.Transform(datum, &blob);
+  EXPECT_EQ(blob.num(), 1);
+  EXPECT_EQ(blob.channels(), datum.channels());
+  EXPECT_EQ(blob.height(), datum.height());
+  EXPECT_EQ(blob.width(), datum.width());
+  for (int j = 0; j < blob.count(); ++j) {
+    EXPECT_EQ(blob.cpu_data()[j], label);
   }
 }
 
@@ -115,17 +109,16 @@ TYPED_TEST(DataTransformTest, TestEmptyTransformUniquePixels) {
 
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  Blob<TypeParam>* blob = new Blob<TypeParam>(1, 3, 4, 5);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  transformer->Transform(datum, blob);
-  EXPECT_EQ(blob->num(), 1);
-  EXPECT_EQ(blob->channels(), datum.channels());
-  EXPECT_EQ(blob->height(), datum.height());
-  EXPECT_EQ(blob->width(), datum.width());
-  for (int j = 0; j < blob->count(); ++j) {
-    EXPECT_EQ(blob->cpu_data()[j], j);
+  Blob<TypeParam> blob(1, 3, 4, 5);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  transformer.Transform(datum, &blob);
+  EXPECT_EQ(blob.num(), 1);
+  EXPECT_EQ(blob.channels(), datum.channels());
+  EXPECT_EQ(blob.height(), datum.height());
+  EXPECT_EQ(blob.width(), datum.width());
+  for (int j = 0; j < blob.count(); ++j) {
+    EXPECT_EQ(blob.cpu_data()[j], j);
   }
 }
 
@@ -141,19 +134,17 @@ TYPED_TEST(DataTransformTest, TestCropSize) {
   transform_param.set_crop_size(crop_size);
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  Blob<TypeParam>* blob =
-      new Blob<TypeParam>(1, channels, crop_size, crop_size);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  Blob<TypeParam> blob(1, channels, crop_size, crop_size);
   for (int iter = 0; iter < this->num_iter_; ++iter) {
-    transformer->Transform(datum, blob);
-    EXPECT_EQ(blob->num(), 1);
-    EXPECT_EQ(blob->channels(), datum.channels());
-    EXPECT_EQ(blob->height(), crop_size);
-    EXPECT_EQ(blob->width(), crop_size);
-    for (int j = 0; j < blob->count(); ++j) {
-      EXPECT_EQ(blob->cpu_data()[j], label);
+    transformer.Transform(datum, &blob);
+    EXPECT_EQ(blob.num(), 1);
+    EXPECT_EQ(blob.channels(), datum.channels());
+    EXPECT_EQ(blob.height(), crop_size);
+    EXPECT_EQ(blob.width(), crop_size);
+    for (int j = 0; j < blob.count(); ++j) {
+      EXPECT_EQ(blob.cpu_data()[j], label);
     }
   }
 }
@@ -280,13 +271,12 @@ TYPED_TEST(DataTransformTest, TestMeanValue) {
   transform_param.add_mean_value(mean_value);
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  Blob<TypeParam>* blob = new Blob<TypeParam>(1, channels, height, width);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  transformer->Transform(datum, blob);
-  for (int j = 0; j < blob->count(); ++j) {
-    EXPECT_EQ(blob->cpu_data()[j], label - mean_value);
+  Blob<TypeParam> blob(1, channels, height, width);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  transformer.Transform(datum, &blob);
+  for (int j = 0; j < blob.count(); ++j) {
+    EXPECT_EQ(blob.cpu_data()[j], label - mean_value);
   }
 }
 
@@ -303,14 +293,13 @@ TYPED_TEST(DataTransformTest, TestMeanValues) {
   transform_param.add_mean_value(2);
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  Blob<TypeParam>* blob = new Blob<TypeParam>(1, channels, height, width);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  transformer->Transform(datum, blob);
+  Blob<TypeParam> blob(1, channels, height, width);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  transformer.Transform(datum, &blob);
   for (int c = 0; c < channels; ++c) {
     for (int j = 0; j < height * width; ++j) {
-      EXPECT_EQ(blob->cpu_data()[blob->offset(0, c) + j], label - c);
+      EXPECT_EQ(blob.cpu_data()[blob.offset(0, c) + j], label - c);
     }
   }
 }
@@ -325,8 +314,8 @@ TYPED_TEST(DataTransformTest, TestMeanFile) {
   const int size = channels * height * width;
 
   // Create a mean file
-  string* mean_file = new string();
-  MakeTempFilename(mean_file);
+  string mean_file;
+  MakeTempFilename(&mean_file);
   BlobProto blob_mean;
   blob_mean.set_num(1);
   blob_mean.set_channels(channels);
@@ -337,19 +326,18 @@ TYPED_TEST(DataTransformTest, TestMeanFile) {
       blob_mean.add_data(j);
   }
 
-  LOG(INFO) << "Using temporary mean_file " << *mean_file;
-  WriteProtoToBinaryFile(blob_mean, *mean_file);
+  LOG(INFO) << "Using temporary mean_file " << mean_file;
+  WriteProtoToBinaryFile(blob_mean, mean_file);
 
-  transform_param.set_mean_file(*mean_file);
+  transform_param.set_mean_file(mean_file);
   Datum datum;
   FillDatum(label, channels, height, width, unique_pixels, &datum);
-  Blob<TypeParam>* blob = new Blob<TypeParam>(1, channels, height, width);
-  DataTransformer<TypeParam>* transformer =
-      new DataTransformer<TypeParam>(transform_param, TEST);
-  transformer->InitRand();
-  transformer->Transform(datum, blob);
-  for (int j = 0; j < blob->count(); ++j) {
-      EXPECT_EQ(blob->cpu_data()[j], 0);
+  Blob<TypeParam> blob(1, channels, height, width);
+  DataTransformer<TypeParam> transformer(transform_param, TEST);
+  transformer.InitRand();
+  transformer.Transform(datum, &blob);
+  for (int j = 0; j < blob.count(); ++j) {
+    EXPECT_EQ(blob.cpu_data()[j], 0);
   }
 }
 
diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp
index acd4b0f..dc7f5c4 100644
--- a/src/caffe/test/test_embed_layer.cpp
+++ b/src/caffe/test/test_embed_layer.cpp
@@ -12,10 +12,6 @@
 
 namespace caffe {
 
-#ifndef CPU_ONLY
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
-#endif
-
 template <typename TypeParam>
 class EmbedLayerTest : public MultiDeviceTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu
index 3f97cf6..e3a9791 100644
--- a/src/caffe/test/test_im2col_kernel.cu
+++ b/src/caffe/test/test_im2col_kernel.cu
@@ -18,6 +18,7 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     const int height_col, const int width_col,
     Dtype* data_col);
 
@@ -25,19 +26,18 @@ template <typename Dtype, int num_axes>
 __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col);
-
-extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+    const int* dilation, Dtype* data_col);
 
 template <typename Dtype>
 class Im2colKernelTest : public GPUDeviceTest<Dtype> {
  protected:
   Im2colKernelTest()
         // big so launches > 1024 threads
-      : blob_bottom_(new Blob<Dtype>(5, 500, 10, 10)),
+      : blob_bottom_(new Blob<Dtype>(5, 500, 15, 15)),
         blob_kernel_shape_(new Blob<int>()),
         blob_stride_(new Blob<int>()),
         blob_pad_(new Blob<int>()),
+        blob_dilation_(new Blob<int>()),
         blob_top_(new Blob<Dtype>()),
         blob_top_cpu_(new Blob<Dtype>()) {
     FillerParameter filler_param;
@@ -47,20 +47,25 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
     blob_kernel_shape_->Reshape(dim_blob_shape);
     blob_stride_->Reshape(dim_blob_shape);
     blob_pad_->Reshape(dim_blob_shape);
+    blob_dilation_->Reshape(dim_blob_shape);
 
     height_ = blob_bottom_->height();
     width_ = blob_bottom_->width();
     channels_ = blob_bottom_->channels();
     pad_ = 0;
     stride_ = 2;
+    dilation_ = 3;
     kernel_size_ = 3;
-    height_col_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
-    width_col_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+    height_col_ = (height_ + 2 * pad_ -
+        (dilation_ * (kernel_size_ - 1) + 1)) / stride_ + 1;
+    width_col_ = (width_ + 2 * pad_ -
+        (dilation_ * (kernel_size_ - 1) + 1)) / stride_ + 1;
 
     for (int i = 0; i < 2; ++i) {
       blob_kernel_shape_->mutable_cpu_data()[i] = kernel_size_;
       blob_stride_->mutable_cpu_data()[i] = stride_;
       blob_pad_->mutable_cpu_data()[i] = pad_;
+      blob_dilation_->mutable_cpu_data()[i] = dilation_;
     }
   }
 
@@ -71,11 +76,13 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
     delete blob_kernel_shape_;
     delete blob_stride_;
     delete blob_pad_;
+    delete blob_dilation_;
   }
 
   Blob<int>* const blob_kernel_shape_;
   Blob<int>* const blob_stride_;
   Blob<int>* const blob_pad_;
+  Blob<int>* const blob_dilation_;
   Blob<Dtype>* const blob_bottom_;
   Blob<Dtype>* const blob_top_;
   Blob<Dtype>* const blob_top_cpu_;
@@ -84,6 +91,7 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
   int channels_;
   int pad_;
   int stride_;
+  int dilation_;
   int kernel_size_;
   int height_col_;
   int width_col_;
@@ -112,7 +120,7 @@ TYPED_TEST(Im2colKernelTest, Test2D) {
     im2col_cpu(this->blob_bottom_->cpu_data() + this->blob_bottom_->offset(n),
       this->channels_, this->height_, this->width_,
       this->kernel_size_, this->kernel_size_, this->pad_, this->pad_,
-      this->stride_, this->stride_,
+      this->stride_, this->stride_, this->dilation_, this->dilation_,
       cpu_data + this->blob_top_cpu_->offset(n));
   }
 
@@ -129,6 +137,7 @@ TYPED_TEST(Im2colKernelTest, Test2D) {
         num_kernels, bottom_data + this->blob_bottom_->offset(n),
         this->height_, this->width_, this->kernel_size_, this->kernel_size_,
         this->pad_, this->pad_, this->stride_, this->stride_,
+        this->dilation_, this->dilation_,
         this->height_col_, this->width_col_,
         top_data + this->blob_top_->offset(n));
       CUDA_POST_KERNEL_CHECK;
@@ -165,6 +174,7 @@ TYPED_TEST(Im2colKernelTest, TestND) {
         this->blob_top_cpu_->shape().data() + 1,
         this->blob_kernel_shape_->cpu_data(),
         this->blob_pad_->cpu_data(), this->blob_stride_->cpu_data(),
+        this->blob_dilation_->cpu_data(),
         top_data_cpu + this->blob_top_cpu_->offset(n));
   }
 
@@ -183,7 +193,7 @@ TYPED_TEST(Im2colKernelTest, TestND) {
           num_kernels, bottom_data_gpu + this->blob_bottom_->offset(n),
           this->blob_bottom_->gpu_shape() + 1, this->blob_top_->gpu_shape() + 1,
           this->blob_kernel_shape_->gpu_data(), this->blob_pad_->gpu_data(),
-          this->blob_stride_->gpu_data(),
+          this->blob_stride_->gpu_data(), this->blob_dilation_->gpu_data(),
           top_data_gpu + this->blob_top_->offset(n));
       CUDA_POST_KERNEL_CHECK;
     }
diff --git a/src/caffe/test/test_im2col_layer.cpp b/src/caffe/test/test_im2col_layer.cpp
index 8274dd4..a7faf18 100644
--- a/src/caffe/test/test_im2col_layer.cpp
+++ b/src/caffe/test/test_im2col_layer.cpp
@@ -41,14 +41,21 @@ TYPED_TEST(Im2colLayerTest, TestSetup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(10);
+  bottom_shape.push_back(11);
+  this->blob_bottom_->Reshape(bottom_shape);
   convolution_param->add_kernel_size(3);
   convolution_param->add_stride(2);
+  convolution_param->add_dilation(3);
   Im2colLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 2);
   EXPECT_EQ(this->blob_top_->channels(), 27);
   EXPECT_EQ(this->blob_top_->height(), 2);
-  EXPECT_EQ(this->blob_top_->width(), 2);
+  EXPECT_EQ(this->blob_top_->width(), 3);
 }
 
 TYPED_TEST(Im2colLayerTest, TestForward) {
@@ -81,6 +88,26 @@ TYPED_TEST(Im2colLayerTest, TestGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(Im2colLayerTest, TestDilatedGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(10);
+  bottom_shape.push_back(9);
+  this->blob_bottom_->Reshape(bottom_shape);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->add_dilation(3);
+  Im2colLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+                                  this->blob_top_vec_);
+}
+
 TYPED_TEST(Im2colLayerTest, TestGradientForceND) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -95,6 +122,27 @@ TYPED_TEST(Im2colLayerTest, TestGradientForceND) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(Im2colLayerTest, TestDilatedGradientForceND) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape;
+  bottom_shape.push_back(2);
+  bottom_shape.push_back(3);
+  bottom_shape.push_back(10);
+  bottom_shape.push_back(9);
+  this->blob_bottom_->Reshape(bottom_shape);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->add_dilation(3);
+  convolution_param->set_force_nd_im2col(true);
+  Im2colLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+                                  this->blob_top_vec_);
+}
+
 TYPED_TEST(Im2colLayerTest, TestRect) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -113,7 +161,6 @@ TYPED_TEST(Im2colLayerTest, TestRect) {
   }
 }
 
-
 TYPED_TEST(Im2colLayerTest, TestRectGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp
index 21441b4..dd591f7 100644
--- a/src/caffe/test/test_neuron_layer.cpp
+++ b/src/caffe/test/test_neuron_layer.cpp
@@ -11,6 +11,7 @@
 #include "caffe/layers/absval_layer.hpp"
 #include "caffe/layers/bnll_layer.hpp"
 #include "caffe/layers/dropout_layer.hpp"
+#include "caffe/layers/elu_layer.hpp"
 #include "caffe/layers/exp_layer.hpp"
 #include "caffe/layers/inner_product_layer.hpp"
 #include "caffe/layers/log_layer.hpp"
@@ -259,6 +260,64 @@ TYPED_TEST(NeuronLayerTest, TestReLUGradientWithNegativeSlope) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(NeuronLayerTest, TestELU) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "elu_param { alpha: 0.5 }", &layer_param));
+  ELULayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype kDelta = 2e-4;
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    if (bottom_data[i] > 0) {
+      EXPECT_FLOAT_EQ(top_data[i], bottom_data[i]);
+    } else {
+      EXPECT_NEAR(top_data[i], 0.5 * (exp(bottom_data[i]) - 1), kDelta);
+    }
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestELUasReLU) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "elu_param { alpha: 0 }", &layer_param));
+  ELULayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_GE(top_data[i], 0.);
+    EXPECT_TRUE(top_data[i] == 0 || top_data[i] == bottom_data[i]);
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestELUGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ELULayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(NeuronLayerTest, TestELUasReLUGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "elu_param { alpha: 0 }", &layer_param));
+  ELULayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
 TYPED_TEST(NeuronLayerTest, TestSigmoid) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
diff --git a/src/caffe/test/test_scale_layer.cpp b/src/caffe/test/test_scale_layer.cpp
new file mode 100644
index 0000000..ad11679
--- /dev/null
+++ b/src/caffe/test/test_scale_layer.cpp
@@ -0,0 +1,507 @@
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/scale_layer.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+// Fixture for ScaleLayer tests: a 2x3x4x5 reference bottom plus extra
+// bottoms shaped for elementwise scaling (same shape), broadcast over the
+// axis pairs (0,1), (1,2), (2,3), and a 0-axis (scalar) scale factor.
+template <typename TypeParam>
+class ScaleLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  ScaleLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_eltwise_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_broadcast_0_(new Blob<Dtype>()),
+        blob_bottom_broadcast_1_(new Blob<Dtype>()),
+        blob_bottom_broadcast_2_(new Blob<Dtype>()),
+        blob_bottom_scale_(new Blob<Dtype>(vector<int>())),
+        blob_top_(new Blob<Dtype>()) {
+    Caffe::set_random_seed(1701);
+    // Shape each broadcast bottom to a consecutive axis pair of the bottom.
+    vector<int> broadcast_shape(2);
+    broadcast_shape[0] = 2; broadcast_shape[1] = 3;
+    this->blob_bottom_broadcast_0_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 3; broadcast_shape[1] = 4;
+    this->blob_bottom_broadcast_1_->Reshape(broadcast_shape);
+    broadcast_shape[0] = 4; broadcast_shape[1] = 5;
+    this->blob_bottom_broadcast_2_->Reshape(broadcast_shape);
+    // Fill every bottom uniformly in [1, 10] so products stay away from 0.
+    FillerParameter filler_param;
+    filler_param.set_min(1);
+    filler_param.set_max(10);
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_bottom_eltwise_);
+    filler.Fill(this->blob_bottom_broadcast_0_);
+    filler.Fill(this->blob_bottom_broadcast_1_);
+    filler.Fill(this->blob_bottom_broadcast_2_);
+    filler.Fill(this->blob_bottom_scale_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~ScaleLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_eltwise_;
+    delete blob_bottom_broadcast_0_;
+    delete blob_bottom_broadcast_1_;
+    delete blob_bottom_broadcast_2_;
+    delete blob_bottom_scale_;
+    delete blob_top_;
+  }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_eltwise_;
+  Blob<Dtype>* const blob_bottom_broadcast_0_;
+  Blob<Dtype>* const blob_bottom_broadcast_1_;
+  Blob<Dtype>* const blob_bottom_broadcast_2_;
+  Blob<Dtype>* const blob_bottom_scale_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(ScaleLayerTest, TestDtypesAndDevices);
+
+// Two same-shape bottoms with axis 0: output should be the elementwise
+// product of the two inputs.
+TYPED_TEST(ScaleLayerTest, TestForwardEltwise) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data_a = this->blob_bottom_->cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5);
+  }
+}
+
+// Same elementwise product, but computed in place (top aliases bottom);
+// the original bottom is saved first so the result can still be checked.
+TYPED_TEST(ScaleLayerTest, TestForwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_bottom_->cpu_data();
+  const int count = this->blob_bottom_->count();
+  const Dtype* in_data_a = orig_bottom.cpu_data();
+  const Dtype* in_data_b = this->blob_bottom_eltwise_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5);
+  }
+}
+
+// In-place backward must match out-of-place backward: run forward/backward
+// normally, save both bottoms' diffs, then repeat with top aliased to the
+// bottom and require identical diffs.
+TYPED_TEST(ScaleLayerTest, TestBackwardEltwiseInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  // Use one fixed random top diff for both runs so they are comparable.
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scale_diff;
+  orig_scale_diff.CopyFrom(*this->blob_bottom_eltwise_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_eltwise_->count(); ++i) {
+    EXPECT_NEAR(orig_scale_diff.cpu_diff()[i],
+                this->blob_bottom_eltwise_->cpu_diff()[i], 1e-5);
+  }
+}
+
+// Single bottom with num_axes == -1: the layer owns a learned scale blob
+// spanning all axes; output is bottom * blobs()[0], elementwise.
+TYPED_TEST(ScaleLayerTest, TestForwardEltwiseWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_axis(0);
+  scale_param->set_num_axes(-1);
+  scale_param->mutable_filler()->set_type("gaussian");
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data_a = this->blob_bottom_->cpu_data();
+  const Dtype* in_data_b = layer->blobs()[0]->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i], 1e-5);
+  }
+}
+
+// Second bottom of shape 2x3 broadcast from axis 0: each (n, c) slice of
+// the 2x3x4x5 bottom is multiplied by one scalar from the scale bottom.
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) *
+                      this->blob_bottom_broadcast_0_->data_at(n, c, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Second bottom of shape 3x4 broadcast from axis 1: each (c, h) slice is
+// scaled by one factor; the leading num axis and trailing width axis repeat.
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(1);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) *
+                      this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Same middle-axis broadcast, computed in place; compare against a saved
+// copy of the original bottom.
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(1);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_bottom_->data_at(n, c, h, w),
+                      orig_bottom.data_at(n, c, h, w) *
+                      this->blob_bottom_broadcast_1_->data_at(c, h, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Broadcast analogue of TestBackwardEltwiseInPlace: verify that in-place
+// backward produces the same bottom and scale diffs as out-of-place.
+TYPED_TEST(ScaleLayerTest, TestBackwardBroadcastMiddleInPlace) {
+  typedef typename TypeParam::Dtype Dtype;
+  Blob<Dtype> orig_bottom(this->blob_bottom_->shape());
+  orig_bottom.CopyFrom(*this->blob_bottom_);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(1);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  // One fixed random top diff shared by both runs.
+  Blob<Dtype> top_diff(this->blob_bottom_->shape());
+  FillerParameter filler_param;
+  filler_param.set_type("gaussian");
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> filler(filler_param);
+  filler.Fill(&top_diff);
+  vector<bool> propagate_down(2, true);
+  // Run forward + backward without in-place computation;
+  // save resulting bottom diffs.
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_top_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  const bool kReshape = true;
+  const bool kCopyDiff = true;
+  Blob<Dtype> orig_bottom_diff;
+  orig_bottom_diff.CopyFrom(*this->blob_bottom_, kCopyDiff, kReshape);
+  Blob<Dtype> orig_scale_diff;
+  orig_scale_diff.CopyFrom(*this->blob_bottom_broadcast_1_,
+                            kCopyDiff, kReshape);
+  // Rerun forward + backward with in-place computation;
+  // check that resulting bottom diffs are the same.
+  this->blob_top_vec_[0] = this->blob_bottom_;  // in-place computation
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  caffe_copy(top_diff.count(), top_diff.cpu_data(),
+             this->blob_bottom_->mutable_cpu_diff());
+  layer->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_NEAR(orig_bottom_diff.cpu_diff()[i],
+                this->blob_bottom_->cpu_diff()[i], 1e-5);
+  }
+  for (int i = 0; i < this->blob_bottom_broadcast_1_->count(); ++i) {
+    EXPECT_NEAR(orig_scale_diff.cpu_diff()[i],
+                this->blob_bottom_broadcast_1_->cpu_diff()[i], 1e-5);
+  }
+}
+
+// Learned 3x4 scale parameter over axes (1,2): output(n,c,h,w) ==
+// bottom(n,c,h,w) * param(c,h).
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_axis(1);
+  scale_param->set_num_axes(2);
+  scale_param->mutable_filler()->set_type("gaussian");
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) *
+                      layer->blobs()[0]->data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+// As above, plus a learned bias term: output == bottom * scale + bias,
+// where blobs()[0] is the scale and blobs()[1] the bias.
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastMiddleWithParamAndBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_axis(1);
+  scale_param->set_num_axes(2);
+  scale_param->mutable_filler()->set_type("gaussian");
+  scale_param->set_bias_term(true);
+  scale_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) *
+                      layer->blobs()[0]->data_at(c, h, 0, 0) +
+                      layer->blobs()[1]->data_at(c, h, 0, 0), 1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Second bottom of shape 4x5 broadcast from axis 2: the trailing (h, w)
+// slice of factors repeats over every (n, c).
+TYPED_TEST(ScaleLayerTest, TestForwardBroadcastEnd) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(2);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    for (int c = 0; c < this->blob_bottom_->channels(); ++c) {
+      for (int h = 0; h < this->blob_bottom_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_->width(); ++w) {
+          EXPECT_NEAR(this->blob_top_->data_at(n, c, h, w),
+                      this->blob_bottom_->data_at(n, c, h, w) *
+                      this->blob_bottom_broadcast_2_->data_at(h, w, 0, 0),
+                      1e-5);
+        }
+      }
+    }
+  }
+}
+
+// Zero-axis (scalar) scale bottom with the default axis: every element of
+// the bottom is multiplied by the single scale value.
+TYPED_TEST(ScaleLayerTest, TestForwardScale) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter layer_param;
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data = this->blob_bottom_->cpu_data();
+  const Dtype scale = *this->blob_bottom_scale_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data[i] * scale, 1e-5);
+  }
+}
+
+// Scalar scale with a non-default axis (2): result must be identical to
+// the default-axis case since a 0-axis blob broadcasts everywhere.
+TYPED_TEST(ScaleLayerTest, TestForwardScaleAxis2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(2);
+  shared_ptr<ScaleLayer<Dtype> > layer(new ScaleLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* data = this->blob_top_->cpu_data();
+  const int count = this->blob_top_->count();
+  const Dtype* in_data = this->blob_bottom_->cpu_data();
+  const Dtype scale = *this->blob_bottom_scale_->cpu_data();
+  for (int i = 0; i < count; ++i) {
+    EXPECT_NEAR(data[i], in_data[i] * scale, 1e-5);
+  }
+}
+
+// Gradient checks mirroring the forward configurations above. Eltwise uses
+// CheckGradientEltwise; every other case uses CheckGradientExhaustive to
+// also cover the layer's learned parameter blobs (if any).
+TYPED_TEST(ScaleLayerTest, TestGradientEltwise) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_eltwise_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Learned full-shape (num_axes == -1) scale parameter.
+TYPED_TEST(ScaleLayerTest, TestGradientEltwiseWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_axis(0);
+  scale_param->set_num_axes(-1);
+  scale_param->mutable_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Second-bottom broadcast over leading axes (0,1).
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastBegin) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_0_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(0);
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Second-bottom broadcast over middle axes (1,2).
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddle) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(1);
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Learned 3x4 scale parameter over middle axes; the scale bottom is also
+// supplied, exercising the mixed bottom-plus-param path.
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastMiddleWithParam) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_1_);
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_axis(1);
+  scale_param->set_num_axes(2);
+  scale_param->mutable_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Second-bottom broadcast over trailing axes (2,3).
+TYPED_TEST(ScaleLayerTest, TestGradientBroadcastEnd) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_broadcast_2_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(2);
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Scalar scale bottom, default axis.
+TYPED_TEST(ScaleLayerTest, TestGradientScale) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter layer_param;
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Scalar scale bottom with an added learned bias term.
+TYPED_TEST(ScaleLayerTest, TestGradientScaleAndBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter layer_param;
+  ScaleParameter* scale_param = layer_param.mutable_scale_param();
+  scale_param->set_bias_term(true);
+  scale_param->mutable_bias_filler()->set_type("gaussian");
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+// Scalar scale bottom with explicit axis 2 (broadcasts identically).
+TYPED_TEST(ScaleLayerTest, TestGradientScaleAxis2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_scale_);
+  LayerParameter layer_param;
+  layer_param.mutable_scale_param()->set_axis(2);
+  ScaleLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index 27e5b7c..114a86c 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -5,26 +5,50 @@
 
 namespace caffe {
 
+// Returns true iff 0 <= a < b, using a single unsigned comparison.
+// Casting a negative a to unsigned maps it above 0x80000000, while b
+// (signed and always non-negative here) stays below that value after the
+// cast, so the two range checks (a >= 0 and a < b) collapse into one.
+inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
+  return static_cast<unsigned>(a) < static_cast<unsigned>(b);
+}
+
 template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     Dtype* data_col) {
-  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  const int channels_col = channels * kernel_h * kernel_w;
-  for (int c_col = 0; c_col < channels_col; ++c_col) {
-    int w_offset = c_col % kernel_w;
-    int h_offset = (c_col / kernel_w) % kernel_h;
-    int c_im = c_col / kernel_h / kernel_w;
-    for (int h_col = 0; h_col < height_col; ++h_col) {
-      for (int w_col = 0; w_col < width_col; ++w_col) {
-        int h_im = h_col * stride_h - pad_h + h_offset;
-        int w_im = w_col * stride_w - pad_w + w_offset;
-        data_col[(c_col * height_col + h_col) * width_col + w_col] =
-            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
-            data_im[(c_im * height + h_im) * width + w_im] : 0;
+  // Output spatial size; the effective (dilated) kernel extent is
+  // dilation * (kernel - 1) + 1.
+  const int output_h = (height + 2 * pad_h -
+    (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int output_w = (width + 2 * pad_w -
+    (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+  const int channel_size = height * width;
+  // data_col is written strictly sequentially; data_im advances one
+  // height*width plane per channel.
+  for (int channel = channels; channel--; data_im += channel_size) {
+    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+        int input_row = -pad_h + kernel_row * dilation_h;
+        for (int output_rows = output_h; output_rows; output_rows--) {
+          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
+            // Whole row lies in the vertical padding: emit zeros.
+            for (int output_cols = output_w; output_cols; output_cols--) {
+              *(data_col++) = 0;
+            }
+          } else {
+            int input_col = -pad_w + kernel_col * dilation_w;
+            for (int output_col = output_w; output_col; output_col--) {
+              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
+                *(data_col++) = data_im[input_row * width + input_col];
+              } else {
+                // Horizontal padding position.
+                *(data_col++) = 0;
+              }
+              input_col += stride_w;
+            }
+          }
+          input_row += stride_h;
+        }
       }
     }
   }
@@ -34,17 +58,19 @@ void im2col_cpu(const Dtype* data_im, const int channels,
 template void im2col_cpu<float>(const float* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_col);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    float* data_col);
 template void im2col_cpu<double>(const double* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_col);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    double* data_col);
 
 template <typename Dtype>
 inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
     const int num_spatial_axes, const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_output) {
+    const int* dilation, Dtype* data_output) {
   if (!im2col) {
     int im_size = im_shape[0];
     for (int i = 0; i < num_spatial_axes; ++i) {
@@ -76,7 +102,8 @@ inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
       bool is_padding = false;
       for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
         const int d = d_iter[d_i];
-        const int d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i];
+        const int d_im = d * stride[d_i] - pad[d_i] +
+            d_offset[d_i] * dilation[d_i];
         is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
         index_col *= col_shape[d_i + 1];
         index_col += d;
@@ -114,10 +141,10 @@ template <typename Dtype>
+// Thin wrapper: forwards to the shared N-D kernel in im2col (gather) mode.
 void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col) {
+    const int* dilation, Dtype* data_col) {
   const bool kIm2Col = true;
   im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,
-                  kernel_shape, pad, stride, data_col);
+                  kernel_shape, pad, stride, dilation, data_col);
 }
 
 // Explicit instantiation
@@ -125,34 +152,45 @@ template void im2col_nd_cpu<float>(const float* data_im,
     const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    float* data_col);
+    const int* dilation, float* data_col);
 template void im2col_nd_cpu<double>(const double* data_im,
     const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    double* data_col);
+    const int* dilation, double* data_col);
 
 template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     Dtype* data_im) {
   caffe_set(height * width * channels, Dtype(0), data_im);
-  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  const int channels_col = channels * kernel_h * kernel_w;
-  for (int c_col = 0; c_col < channels_col; ++c_col) {
-    int w_offset = c_col % kernel_w;
-    int h_offset = (c_col / kernel_w) % kernel_h;
-    int c_im = c_col / kernel_h / kernel_w;
-    for (int h_col = 0; h_col < height_col; ++h_col) {
-      for (int w_col = 0; w_col < width_col; ++w_col) {
-        int h_im = h_col * stride_h - pad_h + h_offset;
-        int w_im = w_col * stride_w - pad_w + w_offset;
-        if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
-          data_im[(c_im * height + h_im) * width + w_im] +=
-              data_col[(c_col * height_col + h_col) * width_col + w_col];
+  // Inverse traversal of im2col_cpu: read data_col sequentially and
+  // scatter-accumulate into the image, skipping padding positions.
+  const int output_h = (height + 2 * pad_h -
+    (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int output_w = (width + 2 * pad_w -
+    (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+  const int channel_size = height * width;
+  for (int channel = channels; channel--; data_im += channel_size) {
+    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
+      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
+        int input_row = -pad_h + kernel_row * dilation_h;
+        for (int output_rows = output_h; output_rows; output_rows--) {
+          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
+            // Whole row lies in the vertical padding: skip its columns.
+            data_col += output_w;
+          } else {
+            int input_col = -pad_w + kernel_col * dilation_w;
+            for (int output_col = output_w; output_col; output_col--) {
+              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
+                data_im[input_row * width + input_col] += *data_col;
+              }
+              data_col++;
+              input_col += stride_w;
+            }
+          }
+          input_row += stride_h;
+        }
       }
     }
   }
@@ -162,20 +200,22 @@ void col2im_cpu(const Dtype* data_col, const int channels,
 template void col2im_cpu<float>(const float* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    float* data_im);
 template void col2im_cpu<double>(const double* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    double* data_im);
 
 template <typename Dtype>
+// Thin wrapper: forwards to the shared N-D kernel in col2im
+// (scatter-accumulate) mode.
 void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_im) {
+    const int* dilation, Dtype* data_im) {
   const bool kIm2Col = false;
   im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,
-                     kernel_shape, pad, stride, data_im);
+                     kernel_shape, pad, stride, dilation, data_im);
 }
 
 // Explicit instantiation
@@ -183,12 +223,12 @@ template void col2im_nd_cpu<float>(const float* data_col,
     const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    float* data_im);
+    const int* dilation, float* data_im);
 template void col2im_nd_cpu<double>(const double* data_col,
     const int num_spatial_axes,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    double* data_im);
+    const int* dilation, double* data_im);
 
 
 }  // namespace caffe
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index 49354ab..a8f30a0 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -10,6 +10,7 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     const int height_col, const int width_col,
     Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
@@ -26,11 +27,11 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
     data_im_ptr += (c_im * height + h_offset) * width + w_offset;
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
-        int h_im = h_offset + i;
-        int w_im = w_offset + j;
+        int h_im = h_offset + i * dilation_h;
+        int w_im = w_offset + j * dilation_w;
         *data_col_ptr =
             (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
-            data_im_ptr[i * width + j] : 0;
+            data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
         data_col_ptr += height_col * width_col;
       }
     }
@@ -42,17 +43,20 @@ void im2col_gpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     Dtype* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int height_col = (height + 2 * pad_h -
+      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w -
+      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
   int num_kernels = channels * height_col * width_col;
   // NOLINT_NEXT_LINE(whitespace/operators)
   im2col_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
       num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
-      pad_w, stride_h, stride_w, height_col,
+      pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
       width_col, data_col);
   CUDA_POST_KERNEL_CHECK;
 }
@@ -61,19 +65,39 @@ void im2col_gpu(const Dtype* data_im, const int channels,
 template void im2col_gpu<float>(const float* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col);
+    const int dilation_h, const int dilation_w, float* data_col);
 template void im2col_gpu<double>(const double* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    double* data_col);
+    const int dilation_h, const int dilation_w, double* data_col);
 
 template <typename Dtype, int num_axes>
 __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col) {
+    const int* dilation, Dtype* data_col) {
   int d_temp[num_axes];  // NOLINT(runtime/arrays)
   int d_iter[num_axes];  // NOLINT(runtime/arrays)
+
+  __shared__ int shared_dilation[num_axes];
+  __shared__ int shared_kernel_shape[num_axes];
+  __shared__ int shared_pad[num_axes];
+  __shared__ int shared_stride[num_axes];
+  __shared__ int shared_col_shape[num_axes + 1];
+  __shared__ int shared_im_shape[num_axes + 1];
+
+  if (threadIdx.x < num_axes) {
+    shared_dilation[threadIdx.x] = dilation[threadIdx.x];
+    shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x];
+    shared_pad[threadIdx.x] = pad[threadIdx.x];
+    shared_stride[threadIdx.x] = stride[threadIdx.x];
+  }
+  if (threadIdx.x < num_axes + 1) {
+    shared_col_shape[threadIdx.x] = col_shape[threadIdx.x];
+    shared_im_shape[threadIdx.x] = im_shape[threadIdx.x];
+  }
+  __syncthreads();
+
   int i;
   CUDA_KERNEL_LOOP(index, n) {
     // Initialize channel_in, computed in the loop below, with intermediate
@@ -81,19 +105,19 @@ __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
     int channel_in = index;
     int channel_out = 1;
     for (i = num_axes - 1; i >= 0; --i) {
-      d_temp[i] = channel_in % col_shape[i + 1];
-      channel_in /= col_shape[i + 1];
-      channel_out *= kernel_shape[i];
+      d_temp[i] = channel_in % shared_col_shape[i + 1];
+      channel_in /= shared_col_shape[i + 1];
+      channel_out *= shared_kernel_shape[i];
     }
     channel_out *= channel_in;
     int data_col_inc = 1;
     for (i = 0; i < num_axes; ++i) {
-      channel_out *= col_shape[i + 1];
+      channel_out *= shared_col_shape[i + 1];
       channel_out += d_temp[i];
-      d_temp[i] = d_temp[i] * stride[i] - pad[i];
-      channel_in *= im_shape[i + 1];
+      d_temp[i] = d_temp[i] * shared_stride[i] - shared_pad[i];
+      channel_in *= shared_im_shape[i + 1];
       channel_in += d_temp[i];
-      data_col_inc *= col_shape[i + 1];
+      data_col_inc *= shared_col_shape[i + 1];
       d_iter[i] = 0;
     }
     Dtype* data_col_ptr = data_col + channel_out;
@@ -102,15 +126,15 @@ __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
     do {
       bool in_range = true;
       for (i = 0; i < num_axes; ++i) {
-        const int d_iter_im = d_iter[i] + d_temp[i];
-        in_range &= d_iter_im >= 0 && d_iter_im < im_shape[i + 1];
+        const int d_iter_im = d_iter[i] * shared_dilation[i] + d_temp[i];
+        in_range &= d_iter_im >= 0 && d_iter_im < shared_im_shape[i + 1];
         if (!in_range) { break; }
       }
       if (in_range) {
-        int data_im_offset = d_iter[0];
+        int data_im_offset = d_iter[0] * shared_dilation[0];
         for (i = 1; i < num_axes; ++i) {
-          data_im_offset *= im_shape[i + 1];
-          data_im_offset += d_iter[i];
+          data_im_offset *= shared_im_shape[i + 1];
+          data_im_offset += d_iter[i] * shared_dilation[i];
         }
         *data_col_ptr = data_im_ptr[data_im_offset];
       } else {
@@ -119,7 +143,7 @@ __global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
       data_col_ptr += data_col_inc;
       incremented = false;
       for (i = num_axes - 1; i >= 0; --i) {
-        const int d_max = kernel_shape[i];
+        const int d_max = shared_kernel_shape[i];
         if (d_iter[i] == d_max - 1) {
           d_iter[i] = 0;
         } else {  // d_iter[i] < d_max - 1
@@ -136,67 +160,69 @@ template <typename Dtype>
 void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
     const int num_kernels, const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_col) {
+    const int* dilation, Dtype* data_col) {
+  // num_axes should be smaller than block size
+  DCHECK_LT(num_spatial_axes, CAFFE_CUDA_NUM_THREADS);
   switch (num_spatial_axes) {
   case 1:
     im2col_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 2:
     im2col_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 3:
     im2col_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 4:
     im2col_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 5:
     im2col_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 6:
     im2col_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 7:
     im2col_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 8:
     im2col_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 9:
     im2col_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   case 10:
     im2col_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
         <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
         num_kernels, data_im, im_shape, col_shape,
-        kernel_shape, pad, stride, data_col);
+        kernel_shape, pad, stride, dilation, data_col);
     break;
   default:
     LOG(FATAL) << "im2col_nd_gpu does not support computation with "
@@ -210,12 +236,12 @@ template void im2col_nd_gpu<float>(const float* data_im,
     const int num_spatial_axes, const int col_size,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    float* data_col);
+    const int* dilation, float* data_col);
 template void im2col_nd_gpu<double>(const double* data_im,
     const int num_spatial_axes, const int col_size,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    double* data_col);
+    const int* dilation, double* data_col);
 
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
@@ -223,6 +249,7 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
     const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
     const int height_col, const int width_col,
     Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
@@ -230,33 +257,27 @@ __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
     const int w_im = index % width + pad_w;
     const int h_im = (index / width) % height + pad_h;
     const int c_im = index / (width * height);
+    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
     // compute the start and end of the output
     const int w_col_start =
-        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
-    const int w_col_end =
-        min(w_im / stride_w + 1, width_col);
+        (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
+    const int w_col_end = min(w_im / stride_w + 1, width_col);
     const int h_col_start =
-        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
-    const int h_col_end =
-        min(h_im / stride_h + 1, height_col);
-    /*
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        // the col location: [c * width * height + h_out, w_out]
-        int c_col = c_im * kernel_h * kernel_w
-            + (h_im - h_col * stride_h) * kernel_w + (w_im - w_col * stride_w);
-        val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-      }
-    }
-    */
-    // equivalent implementation
-    int offset = (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im)
-        * height_col * width_col;
-    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
-    int coeff_w_col = (1 - stride_w * height_col * width_col);
-    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
+        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
+    const int h_col_end = min(h_im / stride_h + 1, height_col);
+    // TODO: use LCM of stride and dilation to avoid unnecessary loops
+    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
+      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
+        int h_k = (h_im - h_col * stride_h);
+        int w_k = (w_im - w_col * stride_w);
+        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
+          h_k /= dilation_h;
+          w_k /= dilation_w;
+          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
+                                height_col + h_col) * width_col + w_col;
+          val += data_col[data_col_index];
+        }
       }
     }
     data_im[index] = val;
@@ -267,9 +288,12 @@ template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, Dtype* data_im) {
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+    const int stride_w, const int dilation_h, const int dilation_w,
+    Dtype* data_im) {
+  int height_col = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) /
+      stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) /
+      stride_w + 1;
   int num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
@@ -277,7 +301,7 @@ void col2im_gpu(const Dtype* data_col, const int channels,
   col2im_gpu_kernel<Dtype><<<CAFFE_GET_BLOCKS(num_kernels),
                              CAFFE_CUDA_NUM_THREADS>>>(
       num_kernels, data_col, height, width, channels, kernel_h, kernel_w,
-      pad_h, pad_w, stride_h, stride_w,
+      pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
       height_col, width_col, data_im);
   CUDA_POST_KERNEL_CHECK;
 }
@@ -286,37 +310,62 @@ void col2im_gpu(const Dtype* data_col, const int channels,
 template void col2im_gpu<float>(const float* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, float* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    float* data_im);
 template void col2im_gpu<double>(const double* data_col, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
-    const int stride_w, double* data_im);
+    const int stride_w, const int dilation_h, const int dilation_w,
+    double* data_im);
 
 template <typename Dtype, int num_axes>
 __global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_im) {
+    const int* dilation, Dtype* data_im) {
   int d_im[num_axes];  // NOLINT(runtime/arrays)
   int d_col_iter[num_axes];  // NOLINT(runtime/arrays)
   int d_col_start[num_axes];  // NOLINT(runtime/arrays)
   int d_col_end[num_axes];  // NOLINT(runtime/arrays)
+
+  __shared__ int shared_dilation[num_axes];
+  __shared__ int shared_kernel_shape[num_axes];
+  __shared__ int shared_pad[num_axes];
+  __shared__ int shared_stride[num_axes];
+  __shared__ int shared_col_shape[num_axes + 1];
+  __shared__ int shared_im_shape[num_axes + 1];
+
+  if (threadIdx.x < num_axes) {
+    shared_dilation[threadIdx.x] = dilation[threadIdx.x];
+    shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x];
+    shared_pad[threadIdx.x] = pad[threadIdx.x];
+    shared_stride[threadIdx.x] = stride[threadIdx.x];
+  }
+  if (threadIdx.x < num_axes + 1) {
+    shared_col_shape[threadIdx.x] = col_shape[threadIdx.x];
+    shared_im_shape[threadIdx.x] = im_shape[threadIdx.x];
+  }
+  __syncthreads();
+
   CUDA_KERNEL_LOOP(index, n) {
     // Initialize channel_in, computed in the loop below, with intermediate
     // computations used to compute the spatial indices.
     int c_im = index;
     // Calculate d_im (image dimensions).
     for (int i = num_axes - 1; i >= 0; --i) {
-      d_im[i] = c_im % im_shape[i + 1] + pad[i];
-      c_im /= im_shape[i + 1];
+      d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i];
+      c_im /= shared_im_shape[i + 1];
     }
     // Calculate col start/end indices.
     bool done = false;
     for (int i = 0; i < num_axes; ++i) {
+      const int kernel_extent =
+          shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1;
       d_col_start[i] = d_col_iter[i] =
-          (d_im[i] < kernel_shape[i]) ?
-          0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
-      d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
+          (d_im[i] < kernel_extent) ? 0 :
+          (d_im[i] - kernel_extent) / shared_stride[i] + 1;
+      d_col_end[i] =
+          min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]);
       if (d_col_start[i] >= d_col_end[i]) {
         // Skip computation if the dimension is 0 at any spatial axis --
         // final val will be 0.
@@ -331,21 +380,32 @@ __global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
     // Loop over the col to compute the output val.
     Dtype val = 0;
     bool incremented = true;
+    bool skip = false;
     do {
       // Compute the final offset.
       int final_offset = 0;
       int kernel_shape_prod = 1;
+      int kernel_index;
       for (int i = num_axes - 1; i >= 0; --i) {
-        final_offset +=
-            (d_im[i] - d_col_iter[i] * stride[i]) * kernel_shape_prod;
-        kernel_shape_prod *= kernel_shape[i];
+        kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i];
+        if (kernel_index % shared_dilation[i]) {
+          skip = true;
+          break;
+        } else {
+          kernel_index /= shared_dilation[i];
+          final_offset += kernel_index * kernel_shape_prod;
+          kernel_shape_prod *= shared_kernel_shape[i];
+        }
       }
-      final_offset += kernel_shape_prod * c_im;
-      for (int i = 0; i < num_axes; ++i) {
-        final_offset *= col_shape[i + 1];
-        final_offset += d_col_iter[i];
+      if (!skip) {
+        final_offset += kernel_shape_prod * c_im;
+        for (int i = 0; i < num_axes; ++i) {
+          final_offset *= shared_col_shape[i + 1];
+          final_offset += d_col_iter[i];
+        }
+        val += data_col[final_offset];
       }
-      val += data_col[final_offset];
+      skip = false;
       incremented = false;
       for (int i = num_axes - 1; i >= 0; --i) {
         const int d_max = d_col_end[i];
@@ -366,67 +426,69 @@ template <typename Dtype>
 void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
     const int im_size, const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    Dtype* data_im) {
+    const int* dilation, Dtype* data_im) {
+  // num_axes should be smaller than block size
+  DCHECK_LT(num_spatial_axes, CAFFE_CUDA_NUM_THREADS);
   switch (num_spatial_axes) {
   case 1:
     col2im_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 2:
     col2im_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 3:
     col2im_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 4:
     col2im_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 5:
     col2im_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 6:
     col2im_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 7:
     col2im_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 8:
     col2im_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 9:
     col2im_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   case 10:
     col2im_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
           <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
           im_size, data_col, im_shape, col_shape,
-          kernel_shape, pad, stride, data_im);
+          kernel_shape, pad, stride, dilation, data_im);
     break;
   default:
     LOG(FATAL) << "col2im_nd_gpu does not support computation with "
@@ -440,11 +502,11 @@ template void col2im_nd_gpu<float>(const float* data_col,
     const int num_spatial_axes, const int im_size,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    float* data_im);
+    const int* dilation, float* data_im);
 template void col2im_nd_gpu<double>(const double* data_col,
     const int num_spatial_axes, const int im_size,
     const int* im_shape, const int* col_shape,
     const int* kernel_shape, const int* pad, const int* stride,
-    double* data_im);
+    const int* dilation, double* data_im);
 
 }  // namespace caffe
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 305cfc3..470165a 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -3,6 +3,7 @@
 namespace bp = boost::python;
 #endif
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 
 #include <cstring>
@@ -164,7 +165,7 @@ int train() {
   if (FLAGS_gpu.size() == 0
       && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
       if (solver_param.has_device_id()) {
-          FLAGS_gpu = ""  +
+          FLAGS_gpu = "" +
               boost::lexical_cast<string>(solver_param.device_id());
       } else {  // Set default GPU if unspecified
           FLAGS_gpu = "" + boost::lexical_cast<string>(0);
@@ -378,6 +379,8 @@ RegisterBrewFunction(time);
 int main(int argc, char** argv) {
   // Print output to stderr (while still logging).
   FLAGS_alsologtostderr = 1;
+  // Set version
+  gflags::SetVersionString(AS_STRING(CAFFE_VERSION));
   // Usage message.
   gflags::SetUsageMessage("command line brew\n"
       "usage: caffe <command> <args>\n\n"

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/caffe-contrib.git



More information about the debian-science-commits mailing list