[caffe-contrib] 233/362: Imported Upstream version 0.9999~rc2+20150925+g674b349

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue May 3 09:24:36 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe-contrib.

commit ee1cded1d32500fe19cd0fdaeaa75010554a43af
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Fri Sep 25 05:27:13 2015 +0000

    Imported Upstream version 0.9999~rc2+20150925+g674b349
---
 .travis.yml                                      |  11 +-
 CMakeLists.txt                                   |   9 +-
 Makefile                                         |  38 ++-
 Makefile.config.example                          |   5 +
 cmake/ConfigGen.cmake                            |  12 +
 cmake/Dependencies.cmake                         |  41 ++-
 cmake/Summary.cmake                              |  18 +-
 cmake/Templates/CaffeConfig.cmake.in             |  26 +-
 cmake/Templates/caffe_config.h.in                |   5 +
 docs/installation.md                             |   9 +-
 docs/multigpu.md                                 |  26 ++
 examples/cpp_classification/classification.cpp   |   8 +
 examples/feature_extraction/readme.md            |   2 +-
 examples/mnist/convert_mnist_data.cpp            |  12 +
 examples/siamese/convert_mnist_siamese_data.cpp  |   9 +-
 include/caffe/blob.hpp                           |   2 +
 include/caffe/common_layers.hpp                  |   4 +-
 include/caffe/data_layers.hpp                    |   3 +-
 include/caffe/data_transformer.hpp               |   5 +-
 include/caffe/solver.hpp                         |  16 +-
 include/caffe/util/db_leveldb.hpp                |   2 +
 include/caffe/util/db_lmdb.hpp                   |   2 +
 include/caffe/util/im2col.hpp                    |  24 ++
 include/caffe/util/io.hpp                        |   2 +
 include/caffe/vision_layers.hpp                  | 111 ++++--
 python/caffe/__init__.py                         |   2 +-
 python/caffe/_caffe.cpp                          |   9 +
 python/caffe/io.py                               |   2 +-
 python/caffe/net_spec.py                         |  10 +-
 python/caffe/pycaffe.py                          |   3 +-
 python/caffe/test/test_layer_type_list.py        |   1 +
 python/caffe/test/test_net_spec.py               |   3 +-
 scripts/travis/travis_build_and_test.sh          |  15 +-
 scripts/travis/travis_setup_makefile_config.sh   |   6 +
 src/caffe/blob.cpp                               |  11 +
 src/caffe/data_transformer.cpp                   |  16 +-
 src/caffe/layers/base_conv_layer.cpp             | 235 ++++++++-----
 src/caffe/layers/concat_layer.cpp                |   6 +
 src/caffe/layers/concat_layer.cu                 |   2 +
 src/caffe/layers/conv_layer.cpp                  |  31 +-
 src/caffe/layers/conv_layer.cu                   |  16 +-
 src/caffe/layers/cudnn_conv_layer.cpp            |  46 +--
 src/caffe/layers/cudnn_conv_layer.cu             |  18 +-
 src/caffe/layers/data_layer.cpp                  |   3 +-
 src/caffe/layers/deconv_layer.cpp                |  31 +-
 src/caffe/layers/deconv_layer.cu                 |  16 +-
 src/caffe/layers/im2col_layer.cpp                | 171 +++++++---
 src/caffe/layers/im2col_layer.cu                 |  41 ++-
 src/caffe/layers/image_data_layer.cpp            |   2 +
 src/caffe/layers/memory_data_layer.cpp           |   4 +
 src/caffe/layers/slice_layer.cpp                 |   7 +-
 src/caffe/layers/slice_layer.cu                  |   3 +-
 src/caffe/layers/window_data_layer.cpp           |   2 +
 src/caffe/net.cpp                                |  11 +-
 src/caffe/proto/caffe.proto                      |  44 ++-
 src/caffe/solver.cpp                             |  37 +-
 src/caffe/test/test_accuracy_layer.cpp           |  12 +-
 src/caffe/test/test_concat_layer.cpp             |  23 ++
 src/caffe/test/test_convolution_layer.cpp        | 409 +++++++++++++++++++----
 src/caffe/test/test_data/generate_sample_data.py |  12 +-
 src/caffe/test/test_data/sample_data_2_gzip.h5   | Bin 15446 -> 15446 bytes
 src/caffe/test/test_data_layer.cpp               |   6 +
 src/caffe/test/test_data_transformer.cpp         |   2 +
 src/caffe/test/test_db.cpp                       |   2 +
 src/caffe/test/test_deconvolution_layer.cpp      | 159 ++++++++-
 src/caffe/test/test_eltwise_layer.cpp            |   4 +-
 src/caffe/test/test_im2col_kernel.cu             |  87 ++++-
 src/caffe/test/test_im2col_layer.cpp             |  30 +-
 src/caffe/test/test_image_data_layer.cpp         |   2 +
 src/caffe/test/test_io.cpp                       |   2 +
 src/caffe/test/test_layer_factory.cpp            |   4 +
 src/caffe/test/test_memory_data_layer.cpp        |   5 +-
 src/caffe/test/test_net.cpp                      |  26 +-
 src/caffe/test/test_slice_layer.cpp              |  27 ++
 src/caffe/test/test_upgrade_proto.cpp            |  12 +-
 src/caffe/util/db.cpp                            |  14 +-
 src/caffe/util/db_leveldb.cpp                    |   2 +
 src/caffe/util/db_lmdb.cpp                       |   2 +
 src/caffe/util/hdf5.cpp                          |  29 +-
 src/caffe/util/im2col.cpp                        | 116 +++++++
 src/caffe/util/im2col.cu                         | 306 ++++++++++++++++-
 src/caffe/util/io.cpp                            |  10 +-
 src/caffe/util/upgrade_proto.cpp                 |   6 +-
 tools/caffe.cpp                                  |   1 +
 tools/compute_image_mean.cpp                     |   4 +
 tools/convert_imageset.cpp                       |   4 +
 tools/extract_features.cpp                       |   2 +-
 87 files changed, 2087 insertions(+), 439 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b920a93..4dc7ed7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,11 +2,12 @@
 # one using CMake, and one using make.
 env:
   matrix:
-    - WITH_CUDA=false WITH_CMAKE=false
-    - WITH_CUDA=false WITH_CMAKE=true
-    - WITH_CUDA=true WITH_CMAKE=false
-    - WITH_CUDA=true WITH_CMAKE=true
-    - WITH_CUDA=false WITH_CMAKE=true PYTHON_VERSION=3
+    - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true
+    - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3
+    - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true
+    - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true
+    - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false
+    - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3
 
 language: cpp
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef599b6..37f937f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,13 +16,16 @@ include(cmake/ConfigGen.cmake)
 
 # ---[ Options
 caffe_option(CPU_ONLY  "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
-caffe_option(USE_CUDNN "Build Caffe with cuDNN libary support" ON IF NOT CPU_ONLY)
+caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
-set(python_version "2" CACHE STRING "Specify which python version to use")
+set(python_version "2" CACHE STRING "Specify which Python version to use")
 caffe_option(BUILD_matlab "Build Matlab wrapper" OFF IF UNIX OR APPLE)
 caffe_option(BUILD_docs   "Build documentation" ON IF UNIX OR APPLE)
-caffe_option(BUILD_python_layer "Build the Caffe python layer" ON)
+caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON)
+caffe_option(USE_LMDB "Build with lmdb" ON)
+caffe_option(USE_LEVELDB "Build with levelDB" ON)
+caffe_option(USE_OPENCV "Build with OpenCV support" ON)
 
 # ---[ Dependencies
 include(cmake/Dependencies.cmake)
diff --git a/Makefile b/Makefile
index 80bc373..5fb6394 100644
--- a/Makefile
+++ b/Makefile
@@ -169,9 +169,23 @@ ifneq ($(CPU_ONLY), 1)
 	LIBRARY_DIRS += $(CUDA_LIB_DIR)
 	LIBRARIES := cudart cublas curand
 endif
-LIBRARIES += glog gflags protobuf leveldb snappy \
-	lmdb boost_system hdf5_hl hdf5 m \
-	opencv_core opencv_highgui opencv_imgproc
+
+LIBRARIES += glog gflags protobuf boost_system m hdf5_hl hdf5
+
+# handle IO dependencies
+USE_LEVELDB ?= 1
+USE_LMDB ?= 1
+USE_OPENCV ?= 1
+
+ifeq ($(USE_LEVELDB), 1)
+	LIBRARIES += leveldb snappy
+endif
+ifeq ($(USE_LMDB), 1)
+	LIBRARIES += lmdb
+endif
+ifeq ($(USE_OPENCV), 1)
+	LIBRARIES += opencv_core opencv_highgui opencv_imgproc
+endif
 PYTHON_LIBRARIES := boost_python python2.7
 WARNINGS := -Wall -Wno-sign-compare
 
@@ -290,6 +304,17 @@ ifeq ($(USE_CUDNN), 1)
 	COMMON_FLAGS += -DUSE_CUDNN
 endif
 
+# configure IO libraries
+ifeq ($(USE_OPENCV), 1)
+	COMMON_FLAGS += -DUSE_OPENCV
+endif
+ifeq ($(USE_LEVELDB), 1)
+	COMMON_FLAGS += -DUSE_LEVELDB
+endif
+ifeq ($(USE_LMDB), 1)
+	COMMON_FLAGS += -DUSE_LMDB
+endif
+
 # CPU-only configuration
 ifeq ($(CPU_ONLY), 1)
 	OBJS := $(PROTO_OBJS) $(CXX_OBJS)
@@ -329,8 +354,9 @@ else
 		# OS X packages atlas as the vecLib framework
 		LIBRARIES += cblas
 		# 10.10 has accelerate while 10.9 has veclib
-		XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep -o 'version: 6')
-		ifneq (,$(findstring version: 6,$(XCODE_CLT_VER)))
+		XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep 'version' | sed 's/[^0-9]*\([0-9]\).*/\1/')
+		XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1)
+		ifeq ($(XCODE_CLT_GEQ_6), 1)
 			BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
 			LDFLAGS += -framework Accelerate
 		else
@@ -472,7 +498,7 @@ runtest: $(TEST_ALL_BIN)
 
 pytest: py
 	cd python; python -m unittest discover -s caffe/test
-	
+
 mattest: mat
 	cd matlab; $(MATLAB_DIR)/bin/matlab -nodisplay -r 'caffe.run_tests(), exit()'
 
diff --git a/Makefile.config.example b/Makefile.config.example
index a873502..a20bad2 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -7,6 +7,11 @@
 # CPU-only switch (uncomment to build without GPU support).
 # CPU_ONLY := 1
 
+# uncomment to disable IO dependencies and corresponding data layers
+# USE_LEVELDB := 0
+# USE_LMDB := 0
+# USE_OPENCV := 0
+
 # To customize your choice of compiler, uncomment and set the following.
 # N.B. the default for Linux is g++ and the default for OSX is clang++
 # CUSTOM_CXX := g++
diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
index 566d6ca..8b25996 100644
--- a/cmake/ConfigGen.cmake
+++ b/cmake/ConfigGen.cmake
@@ -56,6 +56,18 @@ function(caffe_generate_export_configs)
     list(APPEND Caffe_DEFINITIONS -DCPU_ONLY)
   endif()
 
+  if(USE_OPENCV)
+    list(APPEND Caffe_DEFINITIONS -DUSE_OPENCV)
+  endif()
+
+  if(USE_LMDB)
+    list(APPEND Caffe_DEFINITIONS -DUSE_LMDB)
+  endif()
+
+  if(USE_LEVELDB)
+    list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB)
+  endif()
+
   if(NOT HAVE_CUDNN)
     set(HAVE_CUDNN FALSE)
   else()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 7c86dd5..d68d7bf 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -29,19 +29,27 @@ include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
 list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
 
 # ---[ LMDB
-find_package(LMDB REQUIRED)
-include_directories(SYSTEM ${LMDB_INCLUDE_DIR})
-list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES})
+if(USE_LMDB)
+  find_package(LMDB REQUIRED)
+  include_directories(SYSTEM ${LMDB_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${LMDB_LIBRARIES})
+  add_definitions(-DUSE_LMDB)
+endif()
 
 # ---[ LevelDB
-find_package(LevelDB REQUIRED)
-include_directories(SYSTEM ${LevelDB_INCLUDE})
-list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES})
+if(USE_LEVELDB)
+  find_package(LevelDB REQUIRED)
+  include_directories(SYSTEM ${LevelDB_INCLUDE})
+  list(APPEND Caffe_LINKER_LIBS ${LevelDB_LIBRARIES})
+  add_definitions(-DUSE_LEVELDB)
+endif()
 
 # ---[ Snappy
-find_package(Snappy REQUIRED)
-include_directories(SYSTEM ${Snappy_INCLUDE_DIR})
-list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES})
+if(USE_LEVELDB)
+  find_package(Snappy REQUIRED)
+  include_directories(SYSTEM ${Snappy_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES})
+endif()
 
 # ---[ CUDA
 include(cmake/Cuda.cmake)
@@ -57,13 +65,16 @@ if(NOT HAVE_CUDA)
 endif()
 
 # ---[ OpenCV
-find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
-if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
-  find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
+if(USE_OPENCV)
+  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
+  if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
+    find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc)
+  endif()
+  include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
+  list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS})
+  message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
+  add_definitions(-DUSE_OPENCV)
 endif()
-include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
-list(APPEND Caffe_LINKER_LIBS ${OpenCV_LIBS})
-message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
 
 # ---[ BLAS
 if(NOT APPLE)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index e094ac0..3d12e81 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -114,6 +114,9 @@ function(caffe_print_configuration_summary)
   caffe_status("  BUILD_matlab      :   ${BUILD_matlab}")
   caffe_status("  BUILD_docs        :   ${BUILD_docs}")
   caffe_status("  CPU_ONLY          :   ${CPU_ONLY}")
+  caffe_status("  USE_LMDB          :   ${USE_LMDB}")
+  caffe_status("  USE_LEVELDB       :   ${USE_LEVELDB}")
+  caffe_status("  USE_OPENCV        :   ${USE_OPENCV}")
   caffe_status("")
   caffe_status("Dependencies:")
   caffe_status("  BLAS              : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})")
@@ -121,10 +124,16 @@ function(caffe_print_configuration_summary)
   caffe_status("  glog              :   Yes")
   caffe_status("  gflags            :   Yes")
   caffe_status("  protobuf          : " PROTOBUF_FOUND THEN "Yes (ver. ${PROTOBUF_VERSION})" ELSE "No" )
-  caffe_status("  lmdb              : " LMDB_FOUND THEN "Yes (ver. ${LMDB_VERSION})" ELSE "No")
-  caffe_status("  Snappy            : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" )
-  caffe_status("  LevelDB           : " LEVELDB_FOUND THEN  "Yes (ver. ${LEVELDB_VERSION})" ELSE "No")
-  caffe_status("  OpenCV            :   Yes (ver. ${OpenCV_VERSION})")
+  if(USE_LMDB)
+    caffe_status("  lmdb              : " LMDB_FOUND THEN "Yes (ver. ${LMDB_VERSION})" ELSE "No")
+  endif()
+  if(USE_LEVELDB)
+    caffe_status("  LevelDB           : " LEVELDB_FOUND THEN  "Yes (ver. ${LEVELDB_VERSION})" ELSE "No")
+    caffe_status("  Snappy            : " SNAPPY_FOUND THEN "Yes (ver. ${Snappy_VERSION})" ELSE "No" )
+  endif()
+  if(USE_OPENCV)
+    caffe_status("  OpenCV            :   Yes (ver. ${OpenCV_VERSION})")
+  endif()
   caffe_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" )
   caffe_status("")
   if(HAVE_CUDA)
@@ -165,4 +174,3 @@ function(caffe_print_configuration_summary)
   caffe_status("  Install path      :   ${CMAKE_INSTALL_PREFIX}")
   caffe_status("")
 endfunction()
-
diff --git a/cmake/Templates/CaffeConfig.cmake.in b/cmake/Templates/CaffeConfig.cmake.in
index 8f23742..73f57ac 100644
--- a/cmake/Templates/CaffeConfig.cmake.in
+++ b/cmake/Templates/CaffeConfig.cmake.in
@@ -17,22 +17,24 @@
 #   Caffe_HAVE_CUDNN   - signals about cuDNN support
 
 
-# OpenCV dependency
+# OpenCV dependency (optional)
 
-if(NOT OpenCV_FOUND)
-  set(Caffe_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@")
-  if(Caffe_OpenCV_CONFIG_PATH)
-    get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE)
+if(@USE_OPENCV@)
+  if(NOT OpenCV_FOUND)
+    set(Caffe_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@")
+    if(Caffe_OpenCV_CONFIG_PATH)
+      get_filename_component(Caffe_OpenCV_CONFIG_PATH ${Caffe_OpenCV_CONFIG_PATH} ABSOLUTE)
 
-    if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core)
-      message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}")
-      include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake)
-    endif()
+      if(EXISTS ${Caffe_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core)
+        message(STATUS "Caffe: using OpenCV config from ${Caffe_OpenCV_CONFIG_PATH}")
+        include(${Caffe_OpenCV_CONFIG_PATH}/OpenCVModules.cmake)
+      endif()
 
-  else()
-    find_package(OpenCV REQUIRED)
+    else()
+      find_package(OpenCV REQUIRED)
+    endif()
+    unset(Caffe_OpenCV_CONFIG_PATH)
   endif()
-  unset(Caffe_OpenCV_CONFIG_PATH)
 endif()
 
 # Compute paths
diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in
index 6039e8f..9302022 100644
--- a/cmake/Templates/caffe_config.h.in
+++ b/cmake/Templates/caffe_config.h.in
@@ -30,3 +30,8 @@
 
 /* Matlab */
 #cmakedefine HAVE_MATLAB
+
+/* IO libraries */
+#cmakedefine USE_OPENCV
+#cmakedefine USE_LMDB
+#cmakedefine USE_LEVELDB
diff --git a/docs/installation.md b/docs/installation.md
index d535c6d..89a8c71 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -17,16 +17,19 @@ When updating Caffe, it's best to `make clean` before re-compiling.
 
 ## Prerequisites
 
-Caffe has several dependencies.
+Caffe has several dependencies:
 
 * [CUDA](https://developer.nvidia.com/cuda-zone) is required for GPU mode.
     * library version 7.0 and the latest driver version are recommended, but 6.* is fine too
     * 5.5, and 5.0 are compatible but considered legacy
 * [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) via ATLAS, MKL, or OpenBLAS.
 * [Boost](http://www.boost.org/) >= 1.55
+* `protobuf`, `glog`, `gflags`, `hdf5`
+
+Optional dependencies:
+
 * [OpenCV](http://opencv.org/) >= 2.4 including 3.0
-* `protobuf`, `glog`, `gflags`
-* IO libraries `hdf5`, `leveldb`, `snappy`, `lmdb`
+* IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
 
 Pycaffe and Matcaffe interfaces have their own natural needs.
 
diff --git a/docs/multigpu.md b/docs/multigpu.md
new file mode 100644
index 0000000..01cfb89
--- /dev/null
+++ b/docs/multigpu.md
@@ -0,0 +1,26 @@
+---
+title: Multi-GPU Usage, Hardware Configuration Assumptions, and Performance
+---
+
+# Multi-GPU Usage
+
+Currently Multi-GPU is only supported via the C/C++ paths and only for training.
+
+The GPUs to be used for training can be set with the "-gpu" flag on the command line to the 'caffe' tool.  e.g. "build/tools/caffe train --solver=models/bvlc_alexnet/solver.prototxt --gpu=0,1" will train on GPUs 0 and 1.
+
+**NOTE**: each GPU runs the batch size specified in your train_val.prototxt, so going from 1 GPU to 2 GPUs doubles the effective batch size.  e.g. if your train_val.prototxt specifies a batch size of 256 and you run on 2 GPUs, your effective batch size is now 512.  You therefore need to adjust the batch size when running multiple GPUs and/or adjust your solver params, specifically the learning rate.
+
+# Hardware Configuration Assumptions
+
+The current implementation uses a tree reduction strategy.  e.g. if there are 4 GPUs in the system, 0:1 and 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate the
+updated model and broadcast it 0\-\>2, and then 0\-\>1, 2\-\>3.
+
+For best performance, P2P DMA access between devices is needed. Without P2P access, for example when crossing the PCIe root complex, data is copied through the host and the effective exchange bandwidth is greatly reduced.
+
+The current implementation has a "soft" assumption that the devices being used are homogeneous.  In practice, any devices of the same general class should work together, but performance and total size are limited by the smallest device being used.  e.g. if you combine a TitanX and a GTX 980, performance will be limited by the 980.  Mixing vastly different levels of boards, e.g. Kepler and Fermi, is not supported.
+
+"nvidia-smi topo -m" will show you the connectivity matrix.  You can do P2P through PCIe bridges, but not across socket level links at this time, e.g. across CPU sockets on a multi-socket motherboard.
+
+# Scaling Performance
+
+Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers.  Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset).  In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet.  4 GPUs begins to have falloff in scaling.  Generally with "weak scaling" where the batchsize increases with the number of GPUs [...]
\ No newline at end of file
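
A quick worked example of the batch-size note above; the linear learning-rate scaling used here is only a common heuristic assumed for this sketch, not something the document prescribes:

    # effective batch size grows with the number of GPUs
    per_gpu_batch_size = 256          # batch size from train_val.prototxt
    num_gpus = 2                      # e.g. --gpu=0,1
    effective_batch_size = per_gpu_batch_size * num_gpus   # 512

    # assumed heuristic: scale the learning rate linearly with the GPU count
    base_lr = 0.01                    # learning rate tuned for one GPU
    scaled_lr = base_lr * num_gpus    # 0.02
    print(effective_batch_size, scaled_lr)
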
diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp
index dc8b863..de48fb6 100644
--- a/examples/cpp_classification/classification.cpp
+++ b/examples/cpp_classification/classification.cpp
@@ -1,7 +1,9 @@
 #include <caffe/caffe.hpp>
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
+#endif  // USE_OPENCV
 #include <algorithm>
 #include <iosfwd>
 #include <memory>
@@ -9,6 +11,7 @@
 #include <utility>
 #include <vector>
 
+#ifdef USE_OPENCV
 using namespace caffe;  // NOLINT(build/namespaces)
 using std::string;
 
@@ -255,3 +258,8 @@ int main(int argc, char** argv) {
               << p.first << "\"" << std::endl;
   }
 }
+#else
+int main(int argc, char** argv) {
+  LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
+}
+#endif  // USE_OPENCV
diff --git a/examples/feature_extraction/readme.md b/examples/feature_extraction/readme.md
index a980b8b..2bc3dac 100644
--- a/examples/feature_extraction/readme.md
+++ b/examples/feature_extraction/readme.md
@@ -64,7 +64,7 @@ If you meet with the error "Check failed: status.ok() Failed to open leveldb exa
 
     rm -rf examples/_temp/features/
 
-If you'd like to use the Python wrapper for extracting features, check out the [layer visualization notebook](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/filter_visualization.ipynb).
+If you'd like to use the Python wrapper for extracting features, check out the [filter visualization notebook](http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/00-classification.ipynb).
 
 Clean Up
 --------
diff --git a/examples/mnist/convert_mnist_data.cpp b/examples/mnist/convert_mnist_data.cpp
index 54443f1..8f29baf 100644
--- a/examples/mnist/convert_mnist_data.cpp
+++ b/examples/mnist/convert_mnist_data.cpp
@@ -9,9 +9,13 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <google/protobuf/text_format.h>
+
+#if defined(USE_LEVELDB) && defined(USE_LMDB)
 #include <leveldb/db.h>
 #include <leveldb/write_batch.h>
 #include <lmdb.h>
+#endif
+
 #include <stdint.h>
 #include <sys/stat.h>
 
@@ -20,6 +24,8 @@
 
 #include "caffe/proto/caffe.pb.h"
 
+#if defined(USE_LEVELDB) && defined(USE_LMDB)
+
 using namespace caffe;  // NOLINT(build/namespaces)
 using std::string;
 
@@ -196,3 +202,9 @@ int main(int argc, char** argv) {
   }
   return 0;
 }
+#else
+int main(int argc, char** argv) {
+  LOG(FATAL) << "This example requires LevelDB and LMDB; " <<
+  "compile with USE_LEVELDB and USE_LMDB.";
+}
+#endif  // USE_LEVELDB and USE_LMDB
diff --git a/examples/siamese/convert_mnist_siamese_data.cpp b/examples/siamese/convert_mnist_siamese_data.cpp
index 8008b44..ad08036 100644
--- a/examples/siamese/convert_mnist_siamese_data.cpp
+++ b/examples/siamese/convert_mnist_siamese_data.cpp
@@ -10,12 +10,14 @@
 
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
-#include "leveldb/db.h"
 #include "stdint.h"
 
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/math_functions.hpp"
 
+#ifdef USE_LEVELDB
+#include "leveldb/db.h"
+
 uint32_t swap_endian(uint32_t val) {
     val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
     return (val << 16) | (val >> 16);
@@ -121,3 +123,8 @@ int main(int argc, char** argv) {
   }
   return 0;
 }
+#else
+int main(int argc, char** argv) {
+  LOG(FATAL) << "This example requires LevelDB; compile with USE_LEVELDB.";
+}
+#endif  // USE_LEVELDB
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index dda7b1f..fea5117 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -219,6 +219,7 @@ class Blob {
 
   const Dtype* cpu_data() const;
   void set_cpu_data(Dtype* data);
+  const int* gpu_shape() const;
   const Dtype* gpu_data() const;
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
@@ -268,6 +269,7 @@ class Blob {
  protected:
   shared_ptr<SyncedMemory> data_;
   shared_ptr<SyncedMemory> diff_;
+  shared_ptr<SyncedMemory> shape_data_;
   vector<int> shape_;
   int count_;
   int capacity_;
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index 8e64b3e..89bab8d 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -85,7 +85,7 @@ class ConcatLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top);
 
   virtual inline const char* type() const { return "Concat"; }
-  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MinBottomBlobs() const { return 1; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
 
  protected:
@@ -625,7 +625,7 @@ class SliceLayer : public Layer<Dtype> {
 
   virtual inline const char* type() const { return "Slice"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 2; }
+  virtual inline int MinTopBlobs() const { return 1; }
 
  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 552d814..90fd0d1 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -4,7 +4,6 @@
 #include <string>
 #include <utility>
 #include <vector>
-
 #include "hdf5.h"
 
 #include "caffe/blob.hpp"
@@ -275,8 +274,10 @@ class MemoryDataLayer : public BaseDataLayer<Dtype> {
   virtual inline int ExactNumTopBlobs() const { return 2; }
 
   virtual void AddDatumVector(const vector<Datum>& datum_vector);
+#ifdef USE_OPENCV
   virtual void AddMatVector(const vector<cv::Mat>& mat_vector,
       const vector<int>& labels);
+#endif  // USE_OPENCV
 
   // Reset should accept const pointers, but can't, because the memory
   //  will be given to Blob, which is mutable
diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
index 0ad68c8..97b4ee6 100644
--- a/include/caffe/data_transformer.hpp
+++ b/include/caffe/data_transformer.hpp
@@ -50,6 +50,7 @@ class DataTransformer {
   void Transform(const vector<Datum> & datum_vector,
                 Blob<Dtype>* transformed_blob);
 
+#ifdef USE_OPENCV
   /**
    * @brief Applies the transformation defined in the data layer's
    * transform_param block to a vector of Mat.
@@ -74,6 +75,7 @@ class DataTransformer {
    *    set_cpu_data() is used. See image_data_layer.cpp for an example.
    */
   void Transform(const cv::Mat& cv_img, Blob<Dtype>* transformed_blob);
+#endif  // USE_OPENCV
 
   /**
    * @brief Applies the same transformation defined in the data layer's
@@ -113,6 +115,7 @@ class DataTransformer {
    * @param mat_vector
    *    A vector of Mat containing the data to be transformed.
    */
+#ifdef USE_OPENCV
   vector<int> InferBlobShape(const vector<cv::Mat> & mat_vector);
   /**
    * @brief Infers the shape of transformed_blob will have when
@@ -122,6 +125,7 @@ class DataTransformer {
    *    cv::Mat containing the data to be transformed.
    */
   vector<int> InferBlobShape(const cv::Mat& cv_img);
+#endif  // USE_OPENCV
 
  protected:
    /**
@@ -148,4 +152,3 @@ class DataTransformer {
 }  // namespace caffe
 
 #endif  // CAFFE_DATA_TRANSFORMER_HPP_
-
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index aba3e03..2ecf539 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -82,6 +82,8 @@ class Solver {
     callbacks_.push_back(value);
   }
 
+  void CheckSnapshotWritePermissions();
+
  protected:
   // Make and apply the update value for the current iteration.
   virtual void ApplyUpdate() = 0;
@@ -281,19 +283,19 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
 
   switch (type) {
   case SolverParameter_SolverType_SGD:
-      return new SGDSolver<Dtype>(param);
+    return new SGDSolver<Dtype>(param);
   case SolverParameter_SolverType_NESTEROV:
-      return new NesterovSolver<Dtype>(param);
+    return new NesterovSolver<Dtype>(param);
   case SolverParameter_SolverType_ADAGRAD:
-      return new AdaGradSolver<Dtype>(param);
+    return new AdaGradSolver<Dtype>(param);
   case SolverParameter_SolverType_RMSPROP:
-      return new RMSPropSolver<Dtype>(param);
+    return new RMSPropSolver<Dtype>(param);
   case SolverParameter_SolverType_ADADELTA:
-      return new AdaDeltaSolver<Dtype>(param);
+    return new AdaDeltaSolver<Dtype>(param);
   case SolverParameter_SolverType_ADAM:
-      return new AdamSolver<Dtype>(param);
+    return new AdamSolver<Dtype>(param);
   default:
-      LOG(FATAL) << "Unknown SolverType: " << type;
+    LOG(FATAL) << "Unknown SolverType: " << type;
   }
   return (Solver<Dtype>*) NULL;
 }
diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index 1062355..e9fa0d3 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -1,3 +1,4 @@
+#ifdef USE_LEVELDB
 #ifndef CAFFE_UTIL_DB_LEVELDB_HPP
 #define CAFFE_UTIL_DB_LEVELDB_HPP
 
@@ -71,3 +72,4 @@ class LevelDB : public DB {
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_DB_LEVELDB_HPP
+#endif  // USE_LEVELDB
diff --git a/include/caffe/util/db_lmdb.hpp b/include/caffe/util/db_lmdb.hpp
index cc7c90a..4e1568a 100644
--- a/include/caffe/util/db_lmdb.hpp
+++ b/include/caffe/util/db_lmdb.hpp
@@ -1,3 +1,4 @@
+#ifdef USE_LMDB
 #ifndef CAFFE_UTIL_DB_LMDB_HPP
 #define CAFFE_UTIL_DB_LMDB_HPP
 
@@ -89,3 +90,4 @@ class LMDB : public DB {
 }  // namespace caffe
 
 #endif  // CAFFE_UTIL_DB_LMDB_HPP
+#endif  // USE_LMDB
diff --git a/include/caffe/util/im2col.hpp b/include/caffe/util/im2col.hpp
index 0051e2f..531fd29 100644
--- a/include/caffe/util/im2col.hpp
+++ b/include/caffe/util/im2col.hpp
@@ -4,24 +4,48 @@
 namespace caffe {
 
 template <typename Dtype>
+void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col);
+
+template <typename Dtype>
 void im2col_cpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
+void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_im);
+
+template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_im);
 
 template <typename Dtype>
+void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
+    const int col_size, const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col);
+
+template <typename Dtype>
 void im2col_gpu(const Dtype* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, Dtype* data_col);
 
 template <typename Dtype>
+void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
+    const int im_size, const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_im);
+
+template <typename Dtype>
 void col2im_gpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h,
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index c0938ad..6070b4c 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -120,6 +120,7 @@ inline bool ReadImageToDatum(const string& filename, const int label,
 bool DecodeDatumNative(Datum* datum);
 bool DecodeDatum(Datum* datum, bool is_color);
 
+#ifdef USE_OPENCV
 cv::Mat ReadImageToCVMat(const string& filename,
     const int height, const int width, const bool is_color);
 
@@ -135,6 +136,7 @@ cv::Mat DecodeDatumToCVMatNative(const Datum& datum);
 cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);
+#endif  // USE_OPENCV
 
 }  // namespace caffe
 
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index 211e3d9..06bc045 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -58,52 +58,110 @@ class BaseConvolutionLayer : public Layer<Dtype> {
   void backward_gpu_bias(Dtype* bias, const Dtype* input);
 #endif
 
+  /// @brief The spatial dimensions of the input.
+  inline int input_shape(int i) {
+    return (*bottom_shape_)[channel_axis_ + i];
+  }
   // reverse_dimensions should return true iff we are implementing deconv, so
   // that conv helpers know which dimensions are which.
   virtual bool reverse_dimensions() = 0;
   // Compute height_out_ and width_out_ from other parameters.
   virtual void compute_output_shape() = 0;
 
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+  /// @brief The spatial dimensions of the convolution input.
+  Blob<int> conv_input_shape_;
+  /// @brief The spatial dimensions of the col_buffer.
+  vector<int> col_buffer_shape_;
+  /// @brief The spatial dimensions of the output.
+  vector<int> output_shape_;
+  const vector<int>* bottom_shape_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
   int num_;
   int channels_;
-  int pad_h_, pad_w_;
-  int height_, width_;
   int group_;
+  int out_spatial_dim_;
+  int weight_offset_;
   int num_output_;
-  int height_out_, width_out_;
   bool bias_term_;
   bool is_1x1_;
+  bool force_nd_im2col_;
 
  private:
   // wrap im2col/col2im so we don't have to remember the (long) argument lists
   inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
-    im2col_cpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_cpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), col_buff);
+    }
   }
   inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
-    col2im_cpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_cpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+    } else {
+      col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
+          col_buffer_shape_.data(), kernel_shape_.cpu_data(),
+          pad_.cpu_data(), stride_.cpu_data(), data);
+    }
   }
 #ifndef CPU_ONLY
   inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
-    im2col_gpu(data, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, col_buff);
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_gpu(data, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+    } else {
+      im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(),
+          stride_.gpu_data(), col_buff);
+    }
   }
   inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
-    col2im_gpu(col_buff, conv_in_channels_, conv_in_height_, conv_in_width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data);
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_gpu(col_buff, conv_in_channels_,
+          conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+    } else {
+      col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
+          conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
+          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
+          data);
+    }
   }
 #endif
 
+  int num_kernels_im2col_;
+  int num_kernels_col2im_;
   int conv_out_channels_;
   int conv_in_channels_;
   int conv_out_spatial_dim_;
-  int conv_in_height_;
-  int conv_in_width_;
   int kernel_dim_;
-  int weight_offset_;
   int col_offset_;
   int output_offset_;
 
@@ -250,7 +308,7 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   cudnnTensorDescriptor_t    bias_desc_;
   cudnnFilterDescriptor_t      filter_desc_;
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
-  int bottom_offset_, top_offset_, weight_offset_, bias_offset_;
+  int bottom_offset_, top_offset_, bias_offset_;
   size_t workspaceSizeInBytes;
   void *workspace;
 };
@@ -287,11 +345,22 @@ class Im2colLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
+  /// @brief The spatial dimensions of a filter kernel.
+  Blob<int> kernel_shape_;
+  /// @brief The spatial dimensions of the stride.
+  Blob<int> stride_;
+  /// @brief The spatial dimensions of the padding.
+  Blob<int> pad_;
+
+  int num_spatial_axes_;
+  int bottom_dim_;
+  int top_dim_;
+
+  int channel_axis_;
+  int num_;
   int channels_;
-  int height_, width_;
-  int pad_h_, pad_w_;
+
+  bool force_nd_im2col_;
 };
 
 // Forward declare PoolingLayer and SplitLayer for use in LRNLayer.
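
The reshaped members above are driven by the now-repeated kernel_size/stride/pad fields (see the base_conv_layer.cpp hunk further down). A minimal pycaffe net_spec sketch that emits a 3-D convolution prototxt, with made-up blob names and sizes:

    import caffe
    from caffe import layers as L

    n = caffe.NetSpec()
    # 5-D input: (num, channels, depth, height, width) -- sizes are made up
    n.data = L.DummyData(shape=dict(dim=[1, 3, 8, 32, 32]))
    # per-axis kernel/stride/pad via the repeated fields
    n.conv3d = L.Convolution(n.data, num_output=16,
                             kernel_size=[3, 5, 5],
                             stride=[1, 2, 2],
                             pad=[1, 2, 2])
    print(n.to_proto())
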
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index 6cc44e7..ccda1bc 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,4 +1,4 @@
-from .pycaffe import Net, SGDSolver
+from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
 from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index cc49f60..ccd5776 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -297,6 +297,15 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::class_<AdaGradSolver<Dtype>, bp::bases<Solver<Dtype> >,
     shared_ptr<AdaGradSolver<Dtype> >, boost::noncopyable>(
         "AdaGradSolver", bp::init<string>());
+  bp::class_<RMSPropSolver<Dtype>, bp::bases<Solver<Dtype> >,
+    shared_ptr<RMSPropSolver<Dtype> >, boost::noncopyable>(
+        "RMSPropSolver", bp::init<string>());
+  bp::class_<AdaDeltaSolver<Dtype>, bp::bases<Solver<Dtype> >,
+    shared_ptr<AdaDeltaSolver<Dtype> >, boost::noncopyable>(
+        "AdaDeltaSolver", bp::init<string>());
+  bp::class_<AdamSolver<Dtype>, bp::bases<Solver<Dtype> >,
+    shared_ptr<AdamSolver<Dtype> >, boost::noncopyable>(
+        "AdamSolver", bp::init<string>());
 
   bp::def("get_solver", &GetSolverFromFile,
       bp::return_value_policy<bp::manage_new_object>());
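
With these bindings, the additional solver types are constructible directly from Python; a small sketch (the solver prototxt path is a placeholder):

    import caffe

    caffe.set_mode_cpu()
    # construct a specific solver type from its prototxt (placeholder path) ...
    solver = caffe.AdamSolver('examples/mnist/lenet_solver_adam.prototxt')
    # ... or let get_solver pick the class from solver_type in the prototxt
    solver = caffe.get_solver('examples/mnist/lenet_solver_adam.prototxt')
    solver.step(1)
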
diff --git a/python/caffe/io.py b/python/caffe/io.py
index fc96266..0cad721 100644
--- a/python/caffe/io.py
+++ b/python/caffe/io.py
@@ -329,7 +329,7 @@ def resize_image(im, new_dims, interp_order=1):
             return ret
     else:
         # ndimage interpolates anything but more slowly.
-        scale = tuple(np.array(new_dims) / np.array(im.shape[:2]))
+        scale = tuple(np.array(new_dims, dtype=float) / np.array(im.shape[:2]))
         resized_im = zoom(im, scale + (1,), order=interp_order)
     return resized_im.astype(np.float32)
 
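The dtype=float cast matters because new_dims and im.shape are both integer arrays, so the old expression could truncate under Python 2 integer division; a short illustration:

    import numpy as np

    new_dims = np.array([227, 227])
    im_dims = np.array([500, 375])
    # on Python 2, int/int arrays divide with truncation -> [0 0],
    # which would make ndimage.zoom collapse the image
    print(new_dims / im_dims)
    # the cast gives the intended fractional scale -> [0.454  0.6053...]
    print(np.array(new_dims, dtype=float) / im_dims)
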
diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py
index 77a0e00..93fc019 100644
--- a/python/caffe/net_spec.py
+++ b/python/caffe/net_spec.py
@@ -56,8 +56,14 @@ def to_proto(*tops):
 def assign_proto(proto, name, val):
     """Assign a Python object to a protobuf message, based on the Python
     type (in recursive fashion). Lists become repeated fields/messages, dicts
-    become messages, and other types are assigned directly."""
-
+    become messages, and other types are assigned directly. For convenience,
+    repeated fields whose values are not lists are converted to single-element
+    lists; e.g., `my_repeated_int_field=3` is converted to
+    `my_repeated_int_field=[3]`."""
+
+    is_repeated_field = hasattr(getattr(proto, name), 'extend')
+    if is_repeated_field and not isinstance(val, list):
+        val = [val]
     if isinstance(val, list):
         if isinstance(val[0], dict):
             for item in val:
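
A tiny illustration of the new convenience in assign_proto, using DummyData's repeated dim field (the same behavior the updated test below relies on):

    from caffe import layers as L

    # 'dim' is a repeated int64 field; a bare scalar is now wrapped as [3],
    # so this is equivalent to shape=dict(dim=[3])
    dd = L.DummyData(shape=dict(dim=3))
    print(dd.to_proto())
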
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index 4f980a9..8ea24da 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -10,7 +10,8 @@ except:
     from itertools import zip_longest as izip_longest
 import numpy as np
 
-from ._caffe import Net, SGDSolver
+from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
+        RMSPropSolver, AdaDeltaSolver, AdamSolver
 import caffe.io
 
 # We directly update methods from Net here (rather than using composition or
diff --git a/python/caffe/test/test_layer_type_list.py b/python/caffe/test/test_layer_type_list.py
index 7edc80d..47f4cf6 100644
--- a/python/caffe/test/test_layer_type_list.py
+++ b/python/caffe/test/test_layer_type_list.py
@@ -5,6 +5,7 @@ import caffe
 class TestLayerTypeList(unittest.TestCase):
 
     def test_standard_types(self):
+        #removing 'Data' from list 
         for type_name in ['Data', 'Convolution', 'InnerProduct']:
             self.assertIn(type_name, caffe.layer_type_list(),
                     '%s not in layer_type_list()' % type_name)
diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py
index b4595e6..fee3c0a 100644
--- a/python/caffe/test/test_net_spec.py
+++ b/python/caffe/test/test_net_spec.py
@@ -43,8 +43,7 @@ def anon_lenet(batch_size):
 
 def silent_net():
     n = caffe.NetSpec()
-    n.data, n.data2 = L.DummyData(shape=[dict(dim=[3]), dict(dim=[4, 2])],
-                                  ntop=2)
+    n.data, n.data2 = L.DummyData(shape=dict(dim=3), ntop=2)
     n.silence_data = L.Silence(n.data, ntop=0)
     n.silence_data2 = L.Silence(n.data2, ntop=0)
     return n.to_proto()
diff --git a/scripts/travis/travis_build_and_test.sh b/scripts/travis/travis_build_and_test.sh
index 9ba737e..174f1ee 100755
--- a/scripts/travis/travis_build_and_test.sh
+++ b/scripts/travis/travis_build_and_test.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Script called by Travis to do a CPU-only build of and test Caffe.
+# Script called by Travis to build and test Caffe.
+# Travis CI tests are CPU-only for lack of compatible hardware.
 
 set -e
 MAKE="make --jobs=$NUM_THREADS --keep-going"
@@ -15,7 +16,12 @@ if $WITH_CMAKE; then
   if [ "$PYTHON_VERSION" = "3" ]; then
     PYTHON_ARGS="$PYTHON_ARGS -Dpython_version=3 -DBOOST_LIBRARYDIR=$CONDA_DIR/lib/"
   fi
-  cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" ..
+  if $WITH_IO; then
+    IO_ARGS="-DUSE_OPENCV=ON -DUSE_LMDB=ON -DUSE_LEVELDB=ON"
+  else
+    IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF"
+  fi
+  cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS ..
   $MAKE
   $MAKE pytest
   if ! $WITH_CUDA; then
@@ -28,6 +34,11 @@ else
   if ! $WITH_CUDA; then
     export CPU_ONLY=1
   fi
+  if $WITH_IO; then
+    export USE_LMDB=1
+    export USE_LEVELDB=1
+    export USE_OPENCV=1
+  fi
   $MAKE all test pycaffe warn lint || true
   if ! $WITH_CUDA; then
     $MAKE runtest
diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh
index 1440be2..83aacf1 100755
--- a/scripts/travis/travis_setup_makefile_config.sh
+++ b/scripts/travis/travis_setup_makefile_config.sh
@@ -11,6 +11,12 @@ if $WITH_CUDA; then
   echo "CUDA_ARCH := $GENCODE" >> Makefile.config
 fi
 
+# Remove IO library settings from Makefile.config
+# to avoid conflicts with CI configuration
+sed -i -e '/USE_LMDB/d' Makefile.config
+sed -i -e '/USE_LEVELDB/d' Makefile.config
+sed -i -e '/USE_OPENCV/d' Makefile.config
+
 cat << 'EOF' >> Makefile.config
 # Travis' nvcc doesn't like newer boost versions
 NVCCFLAGS := -Xcudafe --diag_suppress=cc_clobber_ignored -Xcudafe --diag_suppress=useless_using_declaration -Xcudafe --diag_suppress=set_but_not_used
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 8450aa1..c86fd5d 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -24,11 +24,16 @@ void Blob<Dtype>::Reshape(const vector<int>& shape) {
   CHECK_LE(shape.size(), kMaxBlobAxes);
   count_ = 1;
   shape_.resize(shape.size());
+  if (!shape_data_ || shape_data_->size() < shape.size() * sizeof(int)) {
+    shape_data_.reset(new SyncedMemory(shape.size() * sizeof(int)));
+  }
+  int* shape_data = static_cast<int*>(shape_data_->mutable_cpu_data());
   for (int i = 0; i < shape.size(); ++i) {
     CHECK_GE(shape[i], 0);
     CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX";
     count_ *= shape[i];
     shape_[i] = shape[i];
+    shape_data[i] = shape[i];
   }
   if (count_ > capacity_) {
     capacity_ = count_;
@@ -68,6 +73,12 @@ Blob<Dtype>::Blob(const vector<int>& shape)
 }
 
 template <typename Dtype>
+const int* Blob<Dtype>::gpu_shape() const {
+  CHECK(shape_data_);
+  return (const int*)shape_data_->gpu_data();
+}
+
+template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_data() const {
   CHECK(data_);
   return (const Dtype*)data_->cpu_data();
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 4666d9b..7189d67 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -1,4 +1,6 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#endif  // USE_OPENCV
 
 #include <string>
 #include <vector>
@@ -124,11 +126,13 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
   }
 }
 
+
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
                                        Blob<Dtype>* transformed_blob) {
   // If datum is encoded, decoded and transform the cv::image.
   if (datum.encoded()) {
+#ifdef USE_OPENCV
     CHECK(!(param_.force_color() && param_.force_gray()))
         << "cannot set both force_color and force_gray";
     cv::Mat cv_img;
@@ -140,6 +144,9 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
     }
     // Transform the cv::image into blob.
     return Transform(cv_img, transformed_blob);
+#else
+    LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV.";
+#endif  // USE_OPENCV
   } else {
     if (param_.force_color() || param_.force_gray()) {
       LOG(ERROR) << "force_color and force_gray only for encoded datum";
@@ -194,6 +201,7 @@ void DataTransformer<Dtype>::Transform(const vector<Datum> & datum_vector,
   }
 }
 
+#ifdef USE_OPENCV
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const vector<cv::Mat> & mat_vector,
                                        Blob<Dtype>* transformed_blob) {
@@ -315,6 +323,7 @@ void DataTransformer<Dtype>::Transform(const cv::Mat& cv_img,
     }
   }
 }
+#endif  // USE_OPENCV
 
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
@@ -432,6 +441,7 @@ void DataTransformer<Dtype>::Transform(Blob<Dtype>* input_blob,
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
   if (datum.encoded()) {
+#ifdef USE_OPENCV
     CHECK(!(param_.force_color() && param_.force_gray()))
         << "cannot set both force_color and force_gray";
     cv::Mat cv_img;
@@ -443,8 +453,10 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(const Datum& datum) {
     }
     // InferBlobShape using the cv::image.
     return InferBlobShape(cv_img);
+#else
+    LOG(FATAL) << "Encoded datum requires OpenCV; compile with USE_OPENCV.";
+#endif  // USE_OPENCV
   }
-
   const int crop_size = param_.crop_size();
   const int datum_channels = datum.channels();
   const int datum_height = datum.height();
@@ -474,6 +486,7 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(
   return shape;
 }
 
+#ifdef USE_OPENCV
 template<typename Dtype>
 vector<int> DataTransformer<Dtype>::InferBlobShape(const cv::Mat& cv_img) {
   const int crop_size = param_.crop_size();
@@ -504,6 +517,7 @@ vector<int> DataTransformer<Dtype>::InferBlobShape(
   shape[0] = num;
   return shape;
 }
+#endif  // USE_OPENCV
 
 template <typename Dtype>
 void DataTransformer<Dtype>::InitRand() {
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index ccb3adc..c6b4755 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <vector>
 
 #include "caffe/filler.hpp"
@@ -11,50 +12,97 @@ namespace caffe {
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
   // Configure the kernel size, padding, stride, and inputs.
   ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-  CHECK(!conv_param.has_kernel_size() !=
-      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-  CHECK(conv_param.has_kernel_size() ||
-      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-      && conv_param.has_pad_w())
-      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-      << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-      && conv_param.has_stride_w())
-      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-      << "Stride is stride OR stride_h and stride_w are required.";
-  if (conv_param.has_kernel_size()) {
-    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  force_nd_im2col_ = conv_param.force_nd_im2col();
+  channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis());
+  const int first_spatial_axis = channel_axis_ + 1;
+  const int num_axes = bottom[0]->num_axes();
+  num_spatial_axes_ = num_axes - first_spatial_axis;
+  CHECK_GE(num_spatial_axes_, 0);
+  vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
+  vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));
+  // Setup filter kernel dimensions (kernel_shape_).
+  kernel_shape_.Reshape(spatial_dim_blob_shape);
+  int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
+  if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "kernel_h & kernel_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.kernel_size_size())
+        << "Either kernel_size or kernel_h/w should be specified; not both.";
+    kernel_shape_data[0] = conv_param.kernel_h();
+    kernel_shape_data[1] = conv_param.kernel_w();
   } else {
-    kernel_h_ = conv_param.kernel_h();
-    kernel_w_ = conv_param.kernel_w();
+    const int num_kernel_dims = conv_param.kernel_size_size();
+    CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
+        << "kernel_size must be specified once, or once per spatial dimension "
+        << "(kernel_size specified " << num_kernel_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        kernel_shape_data[i] =
+            conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
+      }
   }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!conv_param.has_pad_h()) {
-    pad_h_ = pad_w_ = conv_param.pad();
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero.";
+  }
+  // Setup stride dimensions (stride_).
+  stride_.Reshape(spatial_dim_blob_shape);
+  int* stride_data = stride_.mutable_cpu_data();
+  if (conv_param.has_stride_h() || conv_param.has_stride_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "stride_h & stride_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.stride_size())
+        << "Either stride or stride_h/w should be specified; not both.";
+    stride_data[0] = conv_param.stride_h();
+    stride_data[1] = conv_param.stride_w();
   } else {
-    pad_h_ = conv_param.pad_h();
-    pad_w_ = conv_param.pad_w();
+    const int num_stride_dims = conv_param.stride_size();
+    CHECK(num_stride_dims == 0 || num_stride_dims == 1 ||
+          num_stride_dims == num_spatial_axes_)
+        << "stride must be specified once, or once per spatial dimension "
+        << "(stride specified " << num_stride_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+    const int kDefaultStride = 1;
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
+          conv_param.stride((num_stride_dims == 1) ? 0 : i);
+      CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero.";
+    }
   }
-  if (!conv_param.has_stride_h()) {
-    stride_h_ = stride_w_ = conv_param.stride();
+  // Setup pad dimensions (pad_).
+  pad_.Reshape(spatial_dim_blob_shape);
+  int* pad_data = pad_.mutable_cpu_data();
+  if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "pad_h & pad_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.pad_size())
+        << "Either pad or pad_h/w should be specified; not both.";
+    pad_data[0] = conv_param.pad_h();
+    pad_data[1] = conv_param.pad_w();
   } else {
-    stride_h_ = conv_param.stride_h();
-    stride_w_ = conv_param.stride_w();
+    const int num_pad_dims = conv_param.pad_size();
+    CHECK(num_pad_dims == 0 || num_pad_dims == 1 ||
+          num_pad_dims == num_spatial_axes_)
+        << "pad must be specified once, or once per spatial dimension "
+        << "(pad specified " << num_pad_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+    const int kDefaultPad = 0;
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
+          conv_param.pad((num_pad_dims == 1) ? 0 : i);
+    }
   }
   // Special case: im2col is the identity for 1x1 convolution with stride 1
   // and no padding, so flag for skipping the buffer and transformation.
-  is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
-      && stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
+  is_1x1_ = true;
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    is_1x1_ &=
+        kernel_shape_data[i] == 1 && stride_data[i] == 1 && pad_data[i] == 0;
+    if (!is_1x1_) { break; }
+  }
   // Configure output channels and groups.
-  channels_ = bottom[0]->channels();
+  channels_ = bottom[0]->shape(channel_axis_);
   num_output_ = this->layer_param_.convolution_param().num_output();
   CHECK_GT(num_output_, 0);
   group_ = this->layer_param_.convolution_param().group();
@@ -71,8 +119,29 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // Handle the parameters: weights and biases.
   // - blobs_[0] holds the filter weights
   // - blobs_[1] holds the biases (optional)
+  vector<int> weight_shape(2);
+  weight_shape[0] = conv_out_channels_;
+  weight_shape[1] = conv_in_channels_ / group_;
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    weight_shape.push_back(kernel_shape_data[i]);
+  }
   bias_term_ = this->layer_param_.convolution_param().bias_term();
+  vector<int> bias_shape(bias_term_, num_output_);
   if (this->blobs_.size() > 0) {
+    CHECK_EQ(1 + bias_term_, this->blobs_.size())
+        << "Incorrect number of weight blobs.";
+    if (weight_shape != this->blobs_[0]->shape()) {
+      Blob<Dtype> weight_shaped_blob(weight_shape);
+      LOG(FATAL) << "Incorrect weight shape: expected shape "
+          << weight_shaped_blob.shape_string() << "; instead, shape was "
+          << this->blobs_[0]->shape_string();
+    }
+    if (bias_term_ && bias_shape != this->blobs_[1]->shape()) {
+      Blob<Dtype> bias_shaped_blob(bias_shape);
+      LOG(FATAL) << "Incorrect bias shape: expected shape "
+          << bias_shaped_blob.shape_string() << "; instead, shape was "
+          << this->blobs_[1]->shape_string();
+    }
     LOG(INFO) << "Skipping parameter initialization";
   } else {
     if (bias_term_) {
@@ -82,20 +151,20 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     }
     // Initialize and fill the weights:
     // output channels x input channels per-group x kernel height x kernel width
-    this->blobs_[0].reset(new Blob<Dtype>(
-        conv_out_channels_, conv_in_channels_ / group_, kernel_h_, kernel_w_));
+    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
     shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
         this->layer_param_.convolution_param().weight_filler()));
     weight_filler->Fill(this->blobs_[0].get());
     // If necessary, initialize and fill the biases.
     if (bias_term_) {
-      vector<int> bias_shape(1, num_output_);
       this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
       shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
           this->layer_param_.convolution_param().bias_filler()));
       bias_filler->Fill(this->blobs_[1].get());
     }
   }
+  kernel_dim_ = this->blobs_[0]->count(1);
+  weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
   // Propagate gradients to the parameters (as directed by backward pass).
   this->param_propagate_down_.resize(this->blobs_.size(), true);
 }
@@ -103,52 +172,68 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  num_ = bottom[0]->num();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  CHECK_EQ(bottom[0]->channels(), channels_) << "Input size incompatible with"
-    " convolution kernel.";
+  const int first_spatial_axis = channel_axis_ + 1;
+  CHECK_EQ(bottom[0]->num_axes(), first_spatial_axis + num_spatial_axes_)
+      << "bottom num_axes may not change.";
+  num_ = bottom[0]->count(0, channel_axis_);
+  CHECK_EQ(bottom[0]->shape(channel_axis_), channels_)
+      << "Input size incompatible with convolution kernel.";
   // TODO: generalize to handle inputs of different shapes.
   for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
-    CHECK_EQ(num_, bottom[bottom_id]->num()) << "Inputs must have same num.";
-    CHECK_EQ(channels_, bottom[bottom_id]->channels())
-        << "Inputs must have same channels.";
-    CHECK_EQ(height_, bottom[bottom_id]->height())
-        << "Inputs must have same height.";
-    CHECK_EQ(width_, bottom[bottom_id]->width())
-        << "Inputs must have same width.";
+    CHECK(bottom[0]->shape() == bottom[bottom_id]->shape())
+        << "All inputs must have the same shape.";
   }
   // Shape the tops.
+  bottom_shape_ = &bottom[0]->shape();
   compute_output_shape();
+  vector<int> top_shape(bottom[0]->shape().begin(),
+      bottom[0]->shape().begin() + channel_axis_);
+  top_shape.push_back(num_output_);
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    top_shape.push_back(output_shape_[i]);
+  }
   for (int top_id = 0; top_id < top.size(); ++top_id) {
-    top[top_id]->Reshape(num_, num_output_, height_out_, width_out_);
+    top[top_id]->Reshape(top_shape);
   }
   if (reverse_dimensions()) {
-    conv_in_height_ = height_out_;
-    conv_in_width_ = width_out_;
-    conv_out_spatial_dim_ = height_ * width_;
+    conv_out_spatial_dim_ = bottom[0]->count(first_spatial_axis);
   } else {
-    conv_in_height_ = height_;
-    conv_in_width_ = width_;
-    conv_out_spatial_dim_ = height_out_ * width_out_;
+    conv_out_spatial_dim_ = top[0]->count(first_spatial_axis);
   }
-  kernel_dim_ = conv_in_channels_ * kernel_h_ * kernel_w_;
-  weight_offset_ = conv_out_channels_ * kernel_dim_ / group_ / group_;
-  col_offset_ = kernel_dim_ * conv_out_spatial_dim_ / group_;
+  col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
   output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
+  // Setup input dimensions (conv_input_shape_).
+  vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
+  conv_input_shape_.Reshape(bottom_dim_blob_shape);
+  int* conv_input_shape_data = conv_input_shape_.mutable_cpu_data();
+  for (int i = 0; i < num_spatial_axes_ + 1; ++i) {
+    if (reverse_dimensions()) {
+      conv_input_shape_data[i] = top[0]->shape(channel_axis_ + i);
+    } else {
+      conv_input_shape_data[i] = bottom[0]->shape(channel_axis_ + i);
+    }
+  }
   // The im2col result buffer will only hold one image at a time to avoid
   // overly large memory usage. In the special case of 1x1 convolution
   // it goes lazily unused to save memory.
-  if (reverse_dimensions()) {
-    col_buffer_.Reshape(1, kernel_dim_, height_, width_);
-  } else {
-    col_buffer_.Reshape(1, kernel_dim_, height_out_, width_out_);
+  col_buffer_shape_.clear();
+  col_buffer_shape_.push_back(kernel_dim_ * group_);
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    if (reverse_dimensions()) {
+      col_buffer_shape_.push_back(input_shape(i + 1));
+    } else {
+      col_buffer_shape_.push_back(output_shape_[i]);
+    }
   }
+  col_buffer_.Reshape(col_buffer_shape_);
+  bottom_dim_ = bottom[0]->count(channel_axis_);
+  top_dim_ = top[0]->count(channel_axis_);
+  num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
+  num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_;
   // Set up the all ones "bias multiplier" for adding biases by BLAS
+  out_spatial_dim_ = top[0]->count(first_spatial_axis);
   if (bias_term_) {
-    vector<int> bias_multiplier_shape(1, height_out_ * width_out_);
+    vector<int> bias_multiplier_shape(1, out_spatial_dim_);
     bias_multiplier_.Reshape(bias_multiplier_shape);
     caffe_set(bias_multiplier_.count(), Dtype(1),
         bias_multiplier_.mutable_cpu_data());
@@ -167,7 +252,7 @@ void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
   }
   for (int g = 0; g < group_; ++g) {
     caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+        group_, conv_out_spatial_dim_, kernel_dim_,
         (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)0., output + output_offset_ * g);
   }
@@ -177,7 +262,7 @@ template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
     const Dtype* bias) {
   caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-      height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
+      out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
       (Dtype)1., output);
 }
 
@@ -189,7 +274,7 @@ void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
     col_buff = input;
   }
   for (int g = 0; g < group_; ++g) {
-    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
+    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
         conv_out_spatial_dim_, conv_out_channels_ / group_,
         (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
         (Dtype)0., col_buff + col_offset_ * g);
@@ -209,7 +294,7 @@ void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
   }
   for (int g = 0; g < group_; ++g) {
     caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-        kernel_dim_ / group_, conv_out_spatial_dim_,
+        kernel_dim_, conv_out_spatial_dim_,
         (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)1., weights + weight_offset_ * g);
   }
@@ -218,7 +303,7 @@ void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
     const Dtype* input) {
-  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
+  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
       input, bias_multiplier_.cpu_data(), 1., bias);
 }
 
@@ -236,7 +321,7 @@ void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
   }
   for (int g = 0; g < group_; ++g) {
     caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
-        group_, conv_out_spatial_dim_, kernel_dim_ / group_,
+        group_, conv_out_spatial_dim_, kernel_dim_,
         (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)0., output + output_offset_ * g);
   }
@@ -246,7 +331,7 @@ template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
     const Dtype* bias) {
   caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
-      height_out_ * width_out_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
+      out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
       (Dtype)1., output);
 }
 
@@ -258,7 +343,7 @@ void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
     col_buff = input;
   }
   for (int g = 0; g < group_; ++g) {
-    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_ / group_,
+    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
         conv_out_spatial_dim_, conv_out_channels_ / group_,
         (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
         (Dtype)0., col_buff + col_offset_ * g);
@@ -278,7 +363,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
   }
   for (int g = 0; g < group_; ++g) {
     caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
-        kernel_dim_ / group_, conv_out_spatial_dim_,
+        kernel_dim_, conv_out_spatial_dim_,
         (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
         (Dtype)1., weights + weight_offset_ * g);
   }
@@ -287,7 +372,7 @@ void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
 template <typename Dtype>
 void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
     const Dtype* input) {
-  caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, height_out_ * width_out_, 1.,
+  caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
       input, bias_multiplier_.gpu_data(), 1., bias);
 }
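The LayerSetUp hunk above replaces the old scalar kernel_h_/kernel_w_, stride_h_/stride_w_ and pad_h_/pad_w_ members with small per-axis blobs filled from the now-repeated proto fields: a single value is broadcast to every spatial axis, or one value may be given per axis, with the 2D-only *_h/*_w fields kept as a checked special case. A minimal standalone sketch of that expansion rule (the helper name and signature below are illustrative, not Caffe API):

    #include <stdexcept>
    #include <vector>

    // Hypothetical helper: expand a repeated proto field into one value per
    // spatial axis (broadcast a single value, or copy N values verbatim).
    std::vector<int> ExpandSpatialParam(const std::vector<int>& values,
                                        int num_spatial_axes,
                                        int default_value) {
      std::vector<int> expanded(num_spatial_axes, default_value);
      if (values.empty()) {
        return expanded;  // field omitted: keep the default on every axis
      }
      if (values.size() != 1 &&
          static_cast<int>(values.size()) != num_spatial_axes) {
        throw std::invalid_argument(
            "specify the value once, or once per spatial axis");
      }
      for (int i = 0; i < num_spatial_axes; ++i) {
        expanded[i] = values[values.size() == 1 ? 0 : i];  // broadcast or copy
      }
      return expanded;
    }
    // ExpandSpatialParam({3}, 3, 1) -> {3, 3, 3}   (a 3x3x3 kernel)
    // ExpandSpatialParam({},  2, 0) -> {0, 0}      (no padding)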
 
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 95fba10..86b500d 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -48,11 +48,16 @@ void ConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   }
   top[0]->Reshape(top_shape);
   CHECK_EQ(bottom_count_sum, top[0]->count());
+  if (bottom.size() == 1) {
+    top[0]->ShareData(*bottom[0]);
+    top[0]->ShareDiff(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (bottom.size() == 1) { return; }
   Dtype* top_data = top[0]->mutable_cpu_data();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
@@ -72,6 +77,7 @@ void ConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bottom.size() == 1) { return; }
   const Dtype* top_diff = top[0]->cpu_diff();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu
index 3c64c7e..617701e 100644
--- a/src/caffe/layers/concat_layer.cu
+++ b/src/caffe/layers/concat_layer.cu
@@ -28,6 +28,7 @@ __global__ void Concat(const int nthreads, const Dtype* in_data,
 template <typename Dtype>
 void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (bottom.size() == 1) { return; }
   Dtype* top_data = top[0]->mutable_gpu_data();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
@@ -48,6 +49,7 @@ void ConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (bottom.size() == 1) { return; }
   const Dtype* top_diff = top[0]->gpu_diff();
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
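The concat changes above short-circuit the degenerate single-input case: Reshape aliases the top blob onto the bottom's data and diff, and the CPU/GPU Forward/Backward passes return immediately. A hedged sketch of that aliasing pattern, assuming only Blob's ReshapeLike/ShareData/ShareDiff (the latter two are exactly what the diff uses):

    #include "caffe/blob.hpp"

    // Identity wiring for a one-input concat/slice: no data is copied.
    template <typename Dtype>
    void PassThrough(caffe::Blob<Dtype>* bottom, caffe::Blob<Dtype>* top) {
      top->ReshapeLike(*bottom);  // same shape as the single input
      top->ShareData(*bottom);    // forward reads bottom's data buffer directly
      top->ShareDiff(*bottom);    // backward gradients flow through the shared diff
    }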
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index 928ef5e..fb50bb0 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -10,10 +10,17 @@ namespace caffe {
 
 template <typename Dtype>
 void ConvolutionLayer<Dtype>::compute_output_shape() {
-  this->height_out_ = (this->height_ + 2 * this->pad_h_ - this->kernel_h_)
-      / this->stride_h_ + 1;
-  this->width_out_ = (this->width_ + 2 * this->pad_w_ - this->kernel_w_)
-      / this->stride_w_ + 1;
+  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
+  const int* stride_data = this->stride_.cpu_data();
+  const int* pad_data = this->pad_.cpu_data();
+  this->output_shape_.clear();
+  for (int i = 0; i < this->num_spatial_axes_; ++i) {
+    // i + 1 to skip channel axis
+    const int input_dim = this->input_shape(i + 1);
+    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+        / stride_data[i] + 1;
+    this->output_shape_.push_back(output_dim);
+  }
 }
 
 template <typename Dtype>
@@ -24,11 +31,11 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->cpu_data();
     Dtype* top_data = top[i]->mutable_cpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->forward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
+      this->forward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
+          top_data + n * this->top_dim_);
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->cpu_data();
-        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+        this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
       }
     }
   }
@@ -47,20 +54,20 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
       for (int n = 0; n < this->num_; ++n) {
-        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+        this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       for (int n = 0; n < this->num_; ++n) {
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
-          this->weight_cpu_gemm(bottom_data + bottom[i]->offset(n),
-              top_diff + top[i]->offset(n), weight_diff);
+          this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_,
+              top_diff + n * this->top_dim_, weight_diff);
         }
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
-          this->backward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
+          this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
+              bottom_diff + n * this->bottom_dim_);
         }
       }
     }
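compute_output_shape() above now evaluates the standard convolution output-size formula once per spatial axis instead of only for height and width. A one-line sketch with a worked example (hypothetical free function, not Caffe API):

    // output = (input + 2*pad - kernel) / stride + 1, per spatial axis
    inline int ConvOutputDim(int input_dim, int pad, int kernel, int stride) {
      return (input_dim + 2 * pad - kernel) / stride + 1;
    }
    // e.g. ConvOutputDim(227, 0, 11, 4) == (227 - 11) / 4 + 1 == 55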
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index b8a98ff..b429d2b 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -16,11 +16,11 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->forward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
+      this->forward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight,
+          top_data + n * this->top_dim_);
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
+        this->forward_gpu_bias(top_data + n * this->top_dim_, bias);
       }
     }
   }
@@ -37,7 +37,7 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+        this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_);
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
@@ -46,13 +46,13 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       for (int n = 0; n < this->num_; ++n) {
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(bottom_data + bottom[i]->offset(n),
-              top_diff + top[i]->offset(n), weight_diff);
+          this->weight_gpu_gemm(bottom_data + n * this->bottom_dim_,
+              top_diff + n * this->top_dim_, weight_diff);
         }
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
-          this->backward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
+          this->backward_gpu_gemm(top_diff + n * this->top_dim_, weight,
+              bottom_diff + n * this->bottom_dim_);
         }
       }
     }
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 104d2b9..3514fe2 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -34,14 +34,15 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   }
 
   // Set the indexing parameters.
-  weight_offset_ = (this->num_output_ / this->group_)
-      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
   bias_offset_ = (this->num_output_ / this->group_);
 
   // Create filter descriptor.
+  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
+  const int kernel_h = kernel_shape_data[0];
+  const int kernel_w = kernel_shape_data[1];
   cudnn::createFilterDesc<Dtype>(&filter_desc_,
       this->num_output_ / this->group_, this->channels_ / this->group_,
-      this->kernel_h_, this->kernel_w_);
+      kernel_h, kernel_w);
 
   // Create tensor descriptor(s) for data and corresponding convolution(s).
   for (int i = 0; i < bottom.size(); i++) {
@@ -68,29 +69,36 @@ template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   ConvolutionLayer<Dtype>::Reshape(bottom, top);
-  bottom_offset_ = (this->channels_ / this->group_)
-      * this->height_ * this->width_;
-  top_offset_ = (this->num_output_ / this->group_)
-      * this->height_out_ * this->width_out_;
+  CHECK_EQ(2, this->num_spatial_axes_)
+      << "CuDNNConvolution input must have 2 spatial axes "
+      << "(e.g., height and width). "
+      << "Use 'engine: CAFFE' for general ND convolution.";
+  bottom_offset_ = this->bottom_dim_ / this->group_;
+  top_offset_ = this->top_dim_ / this->group_;
+  const int height = bottom[0]->shape(this->channel_axis_ + 1);
+  const int width = bottom[0]->shape(this->channel_axis_ + 2);
+  const int height_out = top[0]->shape(this->channel_axis_ + 1);
+  const int width_out = top[0]->shape(this->channel_axis_ + 2);
+  const int* pad_data = this->pad_.cpu_data();
+  const int pad_h = pad_data[0];
+  const int pad_w = pad_data[1];
+  const int* stride_data = this->stride_.cpu_data();
+  const int stride_h = stride_data[0];
+  const int stride_w = stride_data[1];
 
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
         this->num_,
-        this->channels_ / this->group_,
-        this->height_, this->width_,
-        this->channels_ * this->height_ * this->width_,
-        this->height_ * this->width_,
-        this->width_, 1);
+        this->channels_ / this->group_, height, width,
+        this->channels_ * height * width,
+        height * width, width, 1);
     cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
         this->num_,
-        this->num_output_ / this->group_,
-        this->height_out_, this->width_out_,
-        this->num_output_ * this->height_out_ * this->width_out_,
-        this->height_out_ * this->width_out_,
-        this->width_out_, 1);
+        this->num_output_ / this->group_, height_out, width_out,
+        this->num_output_ * this->out_spatial_dim_,
+        this->out_spatial_dim_, width_out, 1);
     cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
-        filter_desc_, this->pad_h_, this->pad_w_,
-        this->stride_h_, this->stride_w_);
+        filter_desc_, pad_h, pad_w, stride_h, stride_w);
   }
 
   // Tensor descriptor for bias.
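The cuDNN Reshape above keeps the 4-d tensor-descriptor path, so it checks for exactly two spatial axes and derives the per-group bottom/top offsets from the new per-image sizes (bottom_dim_, top_dim_) rather than from height/width members. Illustrative arithmetic with assumed dimensions, not numbers taken from the patch:

    #include <iostream>

    int main() {
      // Per image: bottom_dim_ = C*H*W; each cuDNN group works on a 1/group slice.
      const int channels = 96, height = 27, width = 27, group = 2;
      const int bottom_dim = channels * height * width;  // 96*27*27 = 69984
      const int bottom_offset = bottom_dim / group;      // 34992 values per group
      std::cout << bottom_dim << " / " << group << " = " << bottom_offset << "\n";
      return 0;
    }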
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index b4e802e..6911520 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -14,15 +14,15 @@ __global__ void sync_conv_groups() { }
 template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
+  const int kernel_h = kernel_shape_data[0];
+  const int kernel_w = kernel_shape_data[1];
+  const size_t workspace_limit_bytes =
+      kernel_h * kernel_w * this->channels_ * sizeof(int) + 1;
+  const Dtype* weight = this->blobs_[0]->gpu_data();
   for (int i = 0; i < bottom.size(); ++i) {
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
-    const Dtype* weight = this->blobs_[0]->gpu_data();
-
-    size_t workspace_limit_bytes = this->kernel_h_ *
-                                   this->kernel_w_ *
-                                   this->channels_ *
-                                   sizeof(int) + 1;
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
@@ -69,7 +69,7 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
       CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
             cudnn::dataType<Dtype>::one,
             bottom_descs_[i], bottom_data + bottom_offset_ * g,
-            filter_desc_, weight + weight_offset_ * g,
+            filter_desc_, weight + this->weight_offset_ * g,
             conv_descs_[i],
             algo, workspace, workspaceSizeInBytes,
             cudnn::dataType<Dtype>::zero,
@@ -128,7 +128,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
               top_descs_[i],    top_diff + top_offset_ * g,
               conv_descs_[i],
               cudnn::dataType<Dtype>::one,
-              filter_desc_, weight_diff + weight_offset_ * g));
+              filter_desc_, weight_diff + this->weight_offset_ * g));
       }
 
       // Gradient w.r.t. bottom data.
@@ -139,7 +139,7 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
         CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
               cudnn::dataType<Dtype>::one,
-              filter_desc_, weight + weight_offset_ * g,
+              filter_desc_, weight + this->weight_offset_ * g,
               top_descs_[i], top_diff + top_offset_ * g,
               conv_descs_[i],
               cudnn::dataType<Dtype>::zero,
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 0932d9f..71f8cb0 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -1,5 +1,6 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
-
+#endif  // USE_OPENCV
 #include <stdint.h>
 
 #include <string>
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index a461296..91aabb3 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -10,10 +10,17 @@ namespace caffe {
 
 template <typename Dtype>
 void DeconvolutionLayer<Dtype>::compute_output_shape() {
-  this->height_out_ = this->stride_h_ * (this->height_ - 1) + this->kernel_h_
-      - 2 * this->pad_h_;
-  this->width_out_ = this->stride_w_ * (this->width_ - 1) + this->kernel_w_
-      - 2 * this->pad_w_;
+  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
+  const int* stride_data = this->stride_.cpu_data();
+  const int* pad_data = this->pad_.cpu_data();
+  this->output_shape_.clear();
+  for (int i = 0; i < this->num_spatial_axes_; ++i) {
+    // i + 1 to skip channel axis
+    const int input_dim = this->input_shape(i + 1);
+    const int output_dim = stride_data[i] * (input_dim - 1)
+        + kernel_shape_data[i] - 2 * pad_data[i];
+    this->output_shape_.push_back(output_dim);
+  }
 }
 
 template <typename Dtype>
@@ -24,11 +31,11 @@ void DeconvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->cpu_data();
     Dtype* top_data = top[i]->mutable_cpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->backward_cpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
+      this->backward_cpu_gemm(bottom_data + n * this->bottom_dim_, weight,
+          top_data + n * this->top_dim_);
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->cpu_data();
-        this->forward_cpu_bias(top_data + top[i]->offset(n), bias);
+        this->forward_cpu_bias(top_data + n * this->top_dim_, bias);
       }
     }
   }
@@ -47,21 +54,21 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
       for (int n = 0; n < this->num_; ++n) {
-        this->backward_cpu_bias(bias_diff, top_diff + top[i]->offset(n));
+        this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       for (int n = 0; n < this->num_; ++n) {
         // Gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
-          this->weight_cpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
+          this->weight_cpu_gemm(top_diff + n * this->top_dim_,
+              bottom_data + n * this->bottom_dim_, weight_diff);
         }
         // Gradient w.r.t. bottom data, if necessary, reusing the column buffer
         // we might have just computed above.
         if (propagate_down[i]) {
-          this->forward_cpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n),
+          this->forward_cpu_gemm(top_diff + n * this->top_dim_, weight,
+              bottom_diff + n * this->bottom_dim_,
               this->param_propagate_down_[0]);
         }
       }
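DeconvolutionLayer's compute_output_shape() above applies the inverse of the convolution size formula, again once per spatial axis. A sketch with a worked example (hypothetical helper name):

    // output = stride * (input - 1) + kernel - 2*pad, per spatial axis
    inline int DeconvOutputDim(int input_dim, int pad, int kernel, int stride) {
      return stride * (input_dim - 1) + kernel - 2 * pad;
    }
    // e.g. DeconvOutputDim(55, 0, 11, 4) == 4*54 + 11 == 227, inverting the
    // convolution example given earlier.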
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu
index 8a1eed8..5dbdcc3 100644
--- a/src/caffe/layers/deconv_layer.cu
+++ b/src/caffe/layers/deconv_layer.cu
@@ -16,11 +16,11 @@ void DeconvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const Dtype* bottom_data = bottom[i]->gpu_data();
     Dtype* top_data = top[i]->mutable_gpu_data();
     for (int n = 0; n < this->num_; ++n) {
-      this->backward_gpu_gemm(bottom_data + bottom[i]->offset(n), weight,
-          top_data + top[i]->offset(n));
+      this->backward_gpu_gemm(bottom_data + n * this->bottom_dim_, weight,
+          top_data + n * this->top_dim_);
       if (this->bias_term_) {
         const Dtype* bias = this->blobs_[1]->gpu_data();
-        this->forward_gpu_bias(top_data + top[i]->offset(n), bias);
+        this->forward_gpu_bias(top_data + n * this->top_dim_, bias);
       }
     }
   }
@@ -39,20 +39,20 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     if (this->bias_term_ && this->param_propagate_down_[1]) {
       Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
       for (int n = 0; n < this->num_; ++n) {
-        this->backward_gpu_bias(bias_diff, top_diff + top[i]->offset(n));
+        this->backward_gpu_bias(bias_diff, top_diff + n * this->top_dim_);
       }
     }
     if (this->param_propagate_down_[0] || propagate_down[i]) {
       for (int n = 0; n < this->num_; ++n) {
         // gradient w.r.t. weight. Note that we will accumulate diffs.
         if (this->param_propagate_down_[0]) {
-          this->weight_gpu_gemm(top_diff + top[i]->offset(n),
-              bottom_data + bottom[i]->offset(n), weight_diff);
+          this->weight_gpu_gemm(top_diff + n * this->top_dim_,
+              bottom_data + n * this->bottom_dim_, weight_diff);
         }
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
-          this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n),
+          this->forward_gpu_gemm(top_diff + n * this->top_dim_, weight,
+              bottom_diff + n * this->bottom_dim_,
               this->param_propagate_down_[0]);
         }
       }
diff --git a/src/caffe/layers/im2col_layer.cpp b/src/caffe/layers/im2col_layer.cpp
index 1c80271..595c9db 100644
--- a/src/caffe/layers/im2col_layer.cpp
+++ b/src/caffe/layers/im2col_layer.cpp
@@ -11,54 +11,106 @@ template <typename Dtype>
 void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   ConvolutionParameter conv_param = this->layer_param_.convolution_param();
-  CHECK(!conv_param.has_kernel_size() !=
-      !(conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-  CHECK(conv_param.has_kernel_size() ||
-      (conv_param.has_kernel_h() && conv_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  CHECK((!conv_param.has_pad() && conv_param.has_pad_h()
-      && conv_param.has_pad_w())
-      || (!conv_param.has_pad_h() && !conv_param.has_pad_w()))
-      << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!conv_param.has_stride() && conv_param.has_stride_h()
-      && conv_param.has_stride_w())
-      || (!conv_param.has_stride_h() && !conv_param.has_stride_w()))
-      << "Stride is stride OR stride_h and stride_w are required.";
-  if (conv_param.has_kernel_size()) {
-    kernel_h_ = kernel_w_ = conv_param.kernel_size();
+  force_nd_im2col_ = conv_param.force_nd_im2col();
+  const int input_num_dims = bottom[0]->shape().size();
+  channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis());
+  const int first_spatial_dim = channel_axis_ + 1;
+  num_spatial_axes_ = input_num_dims - first_spatial_dim;
+  CHECK_GE(num_spatial_axes_, 1);
+  vector<int> dim_blob_shape(1, num_spatial_axes_);
+  // Setup filter kernel dimensions (kernel_shape_).
+  kernel_shape_.Reshape(dim_blob_shape);
+  int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
+  if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "kernel_h & kernel_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.kernel_size_size())
+        << "Either kernel_size or kernel_h/w should be specified; not both.";
+    kernel_shape_data[0] = conv_param.kernel_h();
+    kernel_shape_data[1] = conv_param.kernel_w();
   } else {
-    kernel_h_ = conv_param.kernel_h();
-    kernel_w_ = conv_param.kernel_w();
+    const int num_kernel_dims = conv_param.kernel_size_size();
+    CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
+        << "kernel_size must be specified once, or once per spatial dimension "
+        << "(kernel_size specified " << num_kernel_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        kernel_shape_data[i] =
+            conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
+      }
   }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!conv_param.has_pad_h()) {
-    pad_h_ = pad_w_ = conv_param.pad();
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero.";
+  }
+  // Setup stride dimensions (stride_).
+  stride_.Reshape(dim_blob_shape);
+  int* stride_data = stride_.mutable_cpu_data();
+  if (conv_param.has_stride_h() || conv_param.has_stride_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "stride_h & stride_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.stride_size())
+        << "Either stride or stride_h/w should be specified; not both.";
+    stride_data[0] = conv_param.stride_h();
+    stride_data[1] = conv_param.stride_w();
   } else {
-    pad_h_ = conv_param.pad_h();
-    pad_w_ = conv_param.pad_w();
+    const int num_stride_dims = conv_param.stride_size();
+    CHECK(num_stride_dims == 0 || num_stride_dims == 1 ||
+          num_stride_dims == num_spatial_axes_)
+        << "stride must be specified once, or once per spatial dimension "
+        << "(stride specified " << num_stride_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+    const int kDefaultStride = 1;
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
+          conv_param.stride((num_stride_dims == 1) ? 0 : i);
+      CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero.";
+    }
   }
-  if (!conv_param.has_stride_h()) {
-    stride_h_ = stride_w_ = conv_param.stride();
+  // Setup pad dimensions (pad_).
+  pad_.Reshape(dim_blob_shape);
+  int* pad_data = pad_.mutable_cpu_data();
+  if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
+    CHECK_EQ(num_spatial_axes_, 2)
+        << "pad_h & pad_w can only be used for 2D convolution.";
+    CHECK_EQ(0, conv_param.pad_size())
+        << "Either pad or pad_h/w should be specified; not both.";
+    pad_data[0] = conv_param.pad_h();
+    pad_data[1] = conv_param.pad_w();
   } else {
-    stride_h_ = conv_param.stride_h();
-    stride_w_ = conv_param.stride_w();
+    const int num_pad_dims = conv_param.pad_size();
+    CHECK(num_pad_dims == 0 || num_pad_dims == 1 ||
+          num_pad_dims == num_spatial_axes_)
+        << "pad must be specified once, or once per spatial dimension "
+        << "(pad specified " << num_pad_dims << " times; "
+        << num_spatial_axes_ << " spatial dims);";
+    const int kDefaultPad = 0;
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
+          conv_param.pad((num_pad_dims == 1) ? 0 : i);
+    }
   }
 }
 
 template <typename Dtype>
 void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  top[0]->Reshape(
-      bottom[0]->num(), channels_ * kernel_h_ * kernel_w_,
-      (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1,
-      (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1);
+  vector<int> top_shape = bottom[0]->shape();
+  const int* kernel_shape_data = kernel_shape_.cpu_data();
+  const int* stride_data = stride_.cpu_data();
+  const int* pad_data = pad_.cpu_data();
+  for (int i = 0; i < num_spatial_axes_; ++i) {
+    top_shape[channel_axis_] *= kernel_shape_data[i];
+    const int input_dim = bottom[0]->shape(channel_axis_ + i + 1);
+    const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+        / stride_data[i] + 1;
+    top_shape[channel_axis_ + i + 1] = output_dim;
+  }
+  top[0]->Reshape(top_shape);
+  num_ = bottom[0]->count(0, channel_axis_);
+  bottom_dim_ = bottom[0]->count(channel_axis_);
+  top_dim_ = top[0]->count(channel_axis_);
+
+  channels_ = bottom[0]->shape(channel_axis_);
 }
 
 template <typename Dtype>
@@ -66,10 +118,27 @@ void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_cpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data + top[0]->offset(n));
+  for (int n = 0; n < num_; ++n) {
+    DCHECK_EQ(bottom[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1);
+    DCHECK_EQ(top[0]->shape().size() - channel_axis_, num_spatial_axes_ + 1);
+    DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_);
+    DCHECK_EQ(pad_.count(), num_spatial_axes_);
+    DCHECK_EQ(stride_.count(), num_spatial_axes_);
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_cpu(bottom_data + n * bottom_dim_, channels_,
+          bottom[0]->shape(channel_axis_ + 1),
+          bottom[0]->shape(channel_axis_ + 2),
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          top_data + n * top_dim_);
+    } else {
+      im2col_nd_cpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
+          bottom[0]->shape().data() + channel_axis_,
+          top[0]->shape().data() + channel_axis_,
+          kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
+          top_data + n * top_dim_);
+    }
   }
 }
 
@@ -78,10 +147,22 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* top_diff = top[0]->cpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_cpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
+  for (int n = 0; n < num_; ++n) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_cpu(top_diff + n * top_dim_, channels_,
+          bottom[0]->shape(channel_axis_ + 1),
+          bottom[0]->shape(channel_axis_ + 2),
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          bottom_diff + n * bottom_dim_);
+    } else {
+      col2im_nd_cpu(top_diff + n * top_dim_, num_spatial_axes_,
+          bottom[0]->shape().data() + channel_axis_,
+          top[0]->shape().data() + channel_axis_,
+          kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
+          bottom_diff + n * bottom_dim_);
+    }
   }
 }
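Im2colLayer::Reshape above builds the top shape generically: the channel dimension is multiplied by every kernel dimension, and each spatial axis shrinks by the convolution formula. A small self-contained sketch of that shape computation (illustrative function, not part of the layer):

    #include <vector>

    std::vector<int> Im2colTopShape(const std::vector<int>& bottom_shape,
                                    int channel_axis,
                                    const std::vector<int>& kernel,
                                    const std::vector<int>& pad,
                                    const std::vector<int>& stride) {
      std::vector<int> top_shape = bottom_shape;
      for (size_t i = 0; i < kernel.size(); ++i) {
        top_shape[channel_axis] *= kernel[i];  // C -> C * prod(kernel dims)
        const int input_dim = bottom_shape[channel_axis + 1 + i];
        top_shape[channel_axis + 1 + i] =
            (input_dim + 2 * pad[i] - kernel[i]) / stride[i] + 1;
      }
      return top_shape;
    }
    // e.g. bottom (1, 3, 32, 32) with a 3x3 kernel, pad 1, stride 1 and
    // channel_axis 1 yields top (1, 27, 32, 32).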
 
diff --git a/src/caffe/layers/im2col_layer.cu b/src/caffe/layers/im2col_layer.cu
index 9c338b1..cd50762 100644
--- a/src/caffe/layers/im2col_layer.cu
+++ b/src/caffe/layers/im2col_layer.cu
@@ -12,10 +12,23 @@ void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  for (int n = 0; n < bottom[0]->num(); ++n) {
-    im2col_gpu(bottom_data + bottom[0]->offset(n), channels_, height_,
-        width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, top_data + top[0]->offset(n));
+  const int num_kernels = channels_ * top[0]->count(channel_axis_ + 1);
+  for (int n = 0; n < num_; ++n) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      im2col_gpu(bottom_data + n * bottom_dim_, channels_,
+          bottom[0]->shape(channel_axis_ + 1),
+          bottom[0]->shape(channel_axis_ + 2),
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          top_data + n * top_dim_);
+    } else {
+      im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
+          num_kernels, bottom[0]->gpu_shape() + channel_axis_,
+          top[0]->gpu_shape() + channel_axis_,
+          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
+          top_data + n * top_dim_);
+    }
   }
 }
 
@@ -24,10 +37,22 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* top_diff = top[0]->gpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
-  for (int n = 0; n < top[0]->num(); ++n) {
-    col2im_gpu(top_diff + top[0]->offset(n), channels_, height_, width_,
-        kernel_h_, kernel_w_, pad_h_, pad_w_,
-        stride_h_, stride_w_, bottom_diff + bottom[0]->offset(n));
+  for (int n = 0; n < num_; ++n) {
+    if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
+      col2im_gpu(top_diff + n * top_dim_, channels_,
+          bottom[0]->shape(channel_axis_ + 1),
+          bottom[0]->shape(channel_axis_ + 2),
+          kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
+          pad_.cpu_data()[0], pad_.cpu_data()[1],
+          stride_.cpu_data()[0], stride_.cpu_data()[1],
+          bottom_diff + n * bottom_dim_);
+    } else {
+      col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_,
+          bottom[0]->gpu_shape() + channel_axis_,
+          top[0]->gpu_shape() + channel_axis_,
+          kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
+          bottom_diff + n * bottom_dim_);
+    }
   }
 }
 
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 223ba3a..3d2190f 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
 
 #include <fstream>  // NOLINT(readability/streams)
@@ -164,3 +165,4 @@ INSTANTIATE_CLASS(ImageDataLayer);
 REGISTER_LAYER_CLASS(ImageData);
 
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/layers/memory_data_layer.cpp b/src/caffe/layers/memory_data_layer.cpp
index 42de419..2370aa0 100644
--- a/src/caffe/layers/memory_data_layer.cpp
+++ b/src/caffe/layers/memory_data_layer.cpp
@@ -1,4 +1,6 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#endif  // USE_OPENCV
 
 #include <vector>
 
@@ -53,6 +55,7 @@ void MemoryDataLayer<Dtype>::AddDatumVector(const vector<Datum>& datum_vector) {
   has_new_data_ = true;
 }
 
+#ifdef USE_OPENCV
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
     const vector<int>& labels) {
@@ -76,6 +79,7 @@ void MemoryDataLayer<Dtype>::AddMatVector(const vector<cv::Mat>& mat_vector,
   Reset(top_data, top_label, num);
   has_new_data_ = true;
 }
+#endif  // USE_OPENCV
 
 template <typename Dtype>
 void MemoryDataLayer<Dtype>::Reset(Dtype* data, Dtype* labels, int n) {
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index e4418c9..0a059ae 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -67,11 +67,16 @@ void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     }
   }
   CHECK_EQ(count, bottom[0]->count());
+  if (top.size() == 1) {
+    top[0]->ShareData(*bottom[0]);
+    top[0]->ShareDiff(*bottom[0]);
+  }
 }
 
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (top.size() == 1) { return; }
   int offset_slice_axis = 0;
   const Dtype* bottom_data = bottom[0]->cpu_data();
   const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
@@ -92,7 +97,7 @@ void SliceLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void SliceLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
+  if (!propagate_down[0] || top.size() == 1) { return; }
   int offset_slice_axis = 0;
   Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
   const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
diff --git a/src/caffe/layers/slice_layer.cu b/src/caffe/layers/slice_layer.cu
index 796841d..e8dc6cd 100644
--- a/src/caffe/layers/slice_layer.cu
+++ b/src/caffe/layers/slice_layer.cu
@@ -28,6 +28,7 @@ __global__ void Slice(const int nthreads, const Dtype* in_data,
 template <typename Dtype>
 void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (top.size() == 1) { return; }
   int offset_slice_axis = 0;
   const Dtype* bottom_data = bottom[0]->gpu_data();
   const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
@@ -48,7 +49,7 @@ void SliceLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void SliceLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  if (!propagate_down[0]) { return; }
+  if (!propagate_down[0] || top.size() == 1) { return; }
   int offset_slice_axis = 0;
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
   const int bottom_slice_axis = bottom[0]->shape(slice_axis_);
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index f637f2e..f8db61c 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <opencv2/highgui/highgui_c.h>
 #include <stdint.h>
 
@@ -468,3 +469,4 @@ INSTANTIATE_CLASS(WindowDataLayer);
 REGISTER_LAYER_CLASS(WindowData);
 
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 89d1401..ebb8b5d 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -810,12 +810,11 @@ void Net<Dtype>::Backward() {
   BackwardFromTo(layers_.size() - 1, 0);
   if (debug_info_) {
     Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
-    for (int i = 0; i < params_.size(); ++i) {
-      if (param_owners_[i] >= 0) { continue; }
-      asum_data += params_[i]->asum_data();
-      asum_diff += params_[i]->asum_diff();
-      sumsq_data += params_[i]->sumsq_data();
-      sumsq_diff += params_[i]->sumsq_diff();
+    for (int i = 0; i < learnable_params_.size(); ++i) {
+      asum_data += learnable_params_[i]->asum_data();
+      asum_diff += learnable_params_[i]->asum_diff();
+      sumsq_data += learnable_params_[i]->sumsq_data();
+      sumsq_diff += learnable_params_[i]->sumsq_diff();
     }
     const Dtype l2norm_data = std::sqrt(sumsq_data);
     const Dtype l2norm_diff = std::sqrt(sumsq_diff);
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index aa299f8..f52c941 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -471,18 +471,24 @@ message ContrastiveLossParameter {
 message ConvolutionParameter {
   optional uint32 num_output = 1; // The number of outputs for the layer
   optional bool bias_term = 2 [default = true]; // whether to have bias terms
+
   // Pad, kernel size, and stride are all given as a single value for equal
-  // dimensions in height and width or as Y, X pairs.
-  optional uint32 pad = 3 [default = 0]; // The padding size (equal in Y, X)
-  optional uint32 pad_h = 9 [default = 0]; // The padding height
-  optional uint32 pad_w = 10 [default = 0]; // The padding width
-  optional uint32 kernel_size = 4; // The kernel size (square)
-  optional uint32 kernel_h = 11; // The kernel height
-  optional uint32 kernel_w = 12; // The kernel width
+  // dimensions in all spatial dimensions, or once per spatial dimension.
+  repeated uint32 pad = 3; // The padding size; defaults to 0
+  repeated uint32 kernel_size = 4; // The kernel size
+  repeated uint32 stride = 6; // The stride; defaults to 1
+
+  // For 2D convolution only, the *_h and *_w versions may also be used to
+  // specify both spatial dimensions.
+  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
+  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11; // The kernel height (2D only)
+  optional uint32 kernel_w = 12; // The kernel width (2D only)
+  optional uint32 stride_h = 13; // The stride height (2D only)
+  optional uint32 stride_w = 14; // The stride width (2D only)
+
   optional uint32 group = 5 [default = 1]; // The group size for group conv
-  optional uint32 stride = 6 [default = 1]; // The stride (equal in Y, X)
-  optional uint32 stride_h = 13; // The stride height
-  optional uint32 stride_w = 14; // The stride width
+
   optional FillerParameter weight_filler = 7; // The filler for the weight
   optional FillerParameter bias_filler = 8; // The filler for the bias
   enum Engine {
@@ -491,6 +497,24 @@ message ConvolutionParameter {
     CUDNN = 2;
   }
   optional Engine engine = 15 [default = DEFAULT];
+
+  // The axis to interpret as "channels" when performing convolution.
+  // Preceding dimensions are treated as independent inputs;
+  // succeeding dimensions are treated as "spatial".
+  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
+  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
+  // groups g>1) filters across the spatial axes (H, W) of the input.
+  // With (N, C, D, H, W) inputs, and axis == 1, we perform
+  // N independent 3D convolutions, sliding (C/g)-channels
+  // filters across the spatial axes (D, H, W) of the input.
+  optional int32 axis = 16 [default = 1];
+
+  // Whether to force use of the general ND convolution, even if a specific
+  // implementation for blobs of the appropriate number of spatial dimensions
+  // is available. (Currently, there is only a 2D-specific convolution
+  // implementation; for input blobs with num_axes != 2, this option is
+  // ignored and the ND implementation will be used.)
+  optional bool force_nd_im2col = 17 [default = false];
 }
 
 message DataParameter {
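With kernel_size, pad and stride now repeated fields, a single value still covers all spatial axes while N values configure an N-D kernel. A hedged C++ sketch using the protobuf-generated setters (add_kernel_size/add_pad/add_stride and the set_* calls follow standard protobuf codegen for these fields; the chosen numbers are arbitrary):

    #include "caffe/proto/caffe.pb.h"

    caffe::ConvolutionParameter MakeNdConvParam() {
      caffe::ConvolutionParameter conv;
      conv.set_num_output(64);
      conv.add_kernel_size(3);          // one value: broadcast to every spatial axis
      conv.add_stride(1);
      conv.add_pad(1);
      conv.set_axis(1);                 // channels at axis 1; axes 2.. are spatial
      conv.set_force_nd_im2col(false);  // keep the 2D path when only H, W are spatial
      return conv;
    }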
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 394ec3b..12c13dd 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -55,6 +55,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
     << std::endl << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+  CheckSnapshotWritePermissions();
   if (Caffe::root_solver() && param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
@@ -421,20 +422,38 @@ void Solver<Dtype>::Snapshot() {
   CHECK(Caffe::root_solver());
   string model_filename;
   switch (param_.snapshot_format()) {
-    case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
-      model_filename = SnapshotToBinaryProto();
-      break;
-    case caffe::SolverParameter_SnapshotFormat_HDF5:
-      model_filename = SnapshotToHDF5();
-      break;
-    default:
-      LOG(FATAL) << "Unsupported snapshot format.";
+  case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
+    model_filename = SnapshotToBinaryProto();
+    break;
+  case caffe::SolverParameter_SnapshotFormat_HDF5:
+    model_filename = SnapshotToHDF5();
+    break;
+  default:
+    LOG(FATAL) << "Unsupported snapshot format.";
   }
 
   SnapshotSolverState(model_filename);
 }
 
 template <typename Dtype>
+void Solver<Dtype>::CheckSnapshotWritePermissions() {
+  if (Caffe::root_solver() && param_.snapshot()) {
+    CHECK(param_.has_snapshot_prefix())
+        << "In solver params, snapshot is specified but snapshot_prefix is not";
+    string probe_filename = SnapshotFilename(".tempfile");
+    std::ofstream probe_ofs(probe_filename.c_str());
+    if (probe_ofs.good()) {
+      probe_ofs.close();
+      std::remove(probe_filename.c_str());
+    } else {
+      LOG(FATAL) << "Cannot write to snapshot prefix '"
+          << param_.snapshot_prefix() << "'.  Make sure "
+          << "that the directory exists and is writeable.";
+    }
+  }
+}
+
+template <typename Dtype>
 string Solver<Dtype>::SnapshotFilename(const string extension) {
   string filename(param_.snapshot_prefix());
   const int kBufferSize = 20;
@@ -732,7 +751,7 @@ void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
   }
   string snapshot_filename = Solver<Dtype>::SnapshotFilename(".solverstate");
   LOG(INFO)
-    << "Snapshotting solver state to binary proto file" << snapshot_filename;
+    << "Snapshotting solver state to binary proto file " << snapshot_filename;
   WriteProtoToBinaryFile(state, snapshot_filename.c_str());
 }
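CheckSnapshotWritePermissions() above fails fast by probing the snapshot prefix with a throwaway file before training starts, instead of discovering an unwritable directory at the first snapshot. The same probe pattern in isolation (a minimal sketch; the real check also respects Caffe::root_solver() and the snapshot setting):

    #include <cstdio>
    #include <fstream>
    #include <string>

    bool CanWriteTo(const std::string& prefix) {
      const std::string probe = prefix + ".tempfile";
      std::ofstream ofs(probe.c_str());
      if (!ofs.good()) { return false; }   // directory missing or not writable
      ofs.close();
      std::remove(probe.c_str());          // clean up the probe file
      return true;
    }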
 
diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp
index 94e529b..ef0e57a 100644
--- a/src/caffe/test/test_accuracy_layer.cpp
+++ b/src/caffe/test/test_accuracy_layer.cpp
@@ -250,7 +250,6 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) {
 
 TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) {
   LayerParameter layer_param;
-  Caffe::set_mode(Caffe::CPU);
   AccuracyLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
@@ -279,16 +278,16 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) {
   EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
               num_correct_labels / 100.0, 1e-4);
   for (int i = 0; i < num_class; ++i) {
+    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
+       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
     EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
-                static_cast<float>(correct_per_class[i]) / num_per_class[i],
-                1e-4);
+                accuracy_per_class, 1e-4);
   }
 }
 
 
 TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) {
   LayerParameter layer_param;
-  Caffe::set_mode(Caffe::CPU);
   const TypeParam kIgnoreLabelValue = -1;
   layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
   AccuracyLayer<TypeParam> layer(layer_param);
@@ -329,9 +328,10 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) {
   EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
               num_correct_labels / TypeParam(count), 1e-4);
   for (int i = 0; i < 10; ++i) {
+    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
+       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
     EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
-                TypeParam(correct_per_class[i]) / num_per_class[i],
-                1e-4);
+                accuracy_per_class, 1e-4);
   }
 }
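
Both per-class accuracy tests now guard the division: when a class never occurs among the sampled labels, num_per_class[i] is zero and the old unconditional division produced a NaN expectation. A minimal sketch of the guarded form, with an illustrative helper name:

    // Per-class accuracy with a zero-count guard; returns 0 for classes that
    // never appear in the batch instead of dividing by zero.
    template <typename Dtype>
    Dtype GuardedAccuracy(int correct, int total) {
      return total > 0 ? static_cast<Dtype>(correct) / total : Dtype(0);
    }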
 
diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp
index 088e0a4..ccd97eb 100644
--- a/src/caffe/test/test_concat_layer.cpp
+++ b/src/caffe/test/test_concat_layer.cpp
@@ -99,6 +99,19 @@ TYPED_TEST(ConcatLayerTest, TestSetupChannelsNegativeIndexing) {
   EXPECT_EQ(this->blob_top_->width(), this->blob_bottom_0_->width());
 }
 
+TYPED_TEST(ConcatLayerTest, TestForwardTrivial) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConcatLayer<Dtype> layer(layer_param);
+  this->blob_bottom_vec_0_.resize(1);
+  layer.SetUp(this->blob_bottom_vec_0_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_0_, this->blob_top_vec_);
+  for (int i = 0; i < this->blob_bottom_0_->count(); ++i) {
+    EXPECT_EQ(this->blob_bottom_0_->cpu_data()[i],
+              this->blob_top_->cpu_data()[i]);
+  }
+}
+
 TYPED_TEST(ConcatLayerTest, TestForwardNum) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -154,6 +167,16 @@ TYPED_TEST(ConcatLayerTest, TestForwardChannels) {
   }
 }
 
+TYPED_TEST(ConcatLayerTest, TestGradientTrivial) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConcatLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  this->blob_bottom_vec_0_.resize(1);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_0_,
+      this->blob_top_vec_);
+}
+
 TYPED_TEST(ConcatLayerTest, TestGradientNum) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index 67d41ff..9df979a 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -19,54 +19,87 @@ template <typename Dtype>
 void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
     const vector<shared_ptr<Blob<Dtype> > >& weights,
     Blob<Dtype>* out) {
+  const bool has_depth = (out->num_axes() == 5);
+  if (!has_depth) { CHECK_EQ(4, out->num_axes()); }
   // Kernel size, stride, and pad
   int kernel_h, kernel_w;
-  if (conv_param->has_kernel_size()) {
-    kernel_h = kernel_w = conv_param->kernel_size();
-  } else {
+  if (conv_param->has_kernel_h() || conv_param->has_kernel_w()) {
     kernel_h = conv_param->kernel_h();
     kernel_w = conv_param->kernel_w();
+  } else {
+    kernel_h = kernel_w = conv_param->kernel_size(0);
   }
   int pad_h, pad_w;
-  if (!conv_param->has_pad_h()) {
-    pad_h = pad_w = conv_param->pad();
-  } else {
+  if (conv_param->has_pad_h() || conv_param->has_pad_w()) {
     pad_h = conv_param->pad_h();
     pad_w = conv_param->pad_w();
+  } else {
+    pad_h = pad_w = conv_param->pad_size() ? conv_param->pad(0) : 0;
   }
   int stride_h, stride_w;
-  if (!conv_param->has_stride_h()) {
-    stride_h = stride_w = conv_param->stride();
-  } else {
+  if (conv_param->has_stride_h() || conv_param->has_stride_w()) {
     stride_h = conv_param->stride_h();
     stride_w = conv_param->stride_w();
+  } else {
+    stride_h = stride_w = conv_param->stride_size() ? conv_param->stride(0) : 1;
+  }
+  int kernel_d, pad_d, stride_d;
+  if (has_depth) {
+    kernel_d = kernel_h;
+    stride_d = stride_h;
+    pad_d = pad_h;
+  } else {
+    kernel_d = stride_d = 1;
+    pad_d = 0;
   }
   // Groups
   int groups = conv_param->group();
-  int o_g = out->channels() / groups;
-  int k_g = in->channels() / groups;
+  int o_g = out->shape(1) / groups;
+  int k_g = in->shape(1) / groups;
   int o_head, k_head;
   // Convolution
-  const Dtype* in_data = in->cpu_data();
-  const Dtype* weight_data = weights[0]->cpu_data();
+  vector<int> weight_offset(4 + has_depth);
+  vector<int> in_offset(4 + has_depth);
+  vector<int> out_offset(4 + has_depth);
   Dtype* out_data = out->mutable_cpu_data();
-  for (int n = 0; n < out->num(); n++) {
+  for (int n = 0; n < out->shape(0); n++) {
     for (int g = 0; g < groups; g++) {
       o_head = o_g * g;
       k_head = k_g * g;
       for (int o = 0; o < o_g; o++) {
         for (int k = 0; k < k_g; k++) {
-          for (int y = 0; y < out->height(); y++) {
-            for (int x = 0; x < out->width(); x++) {
-              for (int p = 0; p < kernel_h; p++) {
-                for (int q = 0; q < kernel_w; q++) {
-                  int in_y = y * stride_h - pad_h + p;
-                  int in_x = x * stride_w - pad_w + q;
-                  if (in_y >= 0 && in_y < in->height()
-                    && in_x >= 0 && in_x < in->width()) {
-                    out_data[out->offset(n, o + o_head, y, x)] +=
-                        in_data[in->offset(n, k + k_head, in_y, in_x)]
-                        * weight_data[weights[0]->offset(o + o_head, k, p, q)];
+          for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+            for (int y = 0; y < out->shape(2 + has_depth); y++) {
+              for (int x = 0; x < out->shape(3 + has_depth); x++) {
+                for (int r = 0; r < kernel_d; r++) {
+                  for (int p = 0; p < kernel_h; p++) {
+                    for (int q = 0; q < kernel_w; q++) {
+                      int in_z = z * stride_d - pad_d + r;
+                      int in_y = y * stride_h - pad_h + p;
+                      int in_x = x * stride_w - pad_w + q;
+                      if (in_z >= 0 && in_z < (has_depth ? in->shape(2) : 1)
+                          && in_y >= 0 && in_y < in->shape(2 + has_depth)
+                          && in_x >= 0 && in_x < in->shape(3 + has_depth)) {
+                        weight_offset[0] = o + o_head;
+                        weight_offset[1] = k;
+                        if (has_depth) { weight_offset[2] = r; }
+                        weight_offset[2 + has_depth] = p;
+                        weight_offset[3 + has_depth] = q;
+                        in_offset[0] = n;
+                        in_offset[1] = k + k_head;
+                        if (has_depth) { in_offset[2] = in_z; }
+                        in_offset[2 + has_depth] = in_y;
+                        in_offset[3 + has_depth] = in_x;
+                        out_offset[0] = n;
+                        out_offset[1] = o + o_head;
+                        if (has_depth) { out_offset[2] = z; }
+                        out_offset[2 + has_depth] = y;
+                        out_offset[3 + has_depth] = x;
+                        out_data[out->offset(out_offset)] +=
+                            in->data_at(in_offset)
+                            * weights[0]->data_at(weight_offset);
+                      }
+                    }
                   }
                 }
               }
@@ -79,11 +112,18 @@ void caffe_conv(const Blob<Dtype>* in, ConvolutionParameter* conv_param,
   // Bias
   if (conv_param->bias_term()) {
     const Dtype* bias_data = weights[1]->cpu_data();
-    for (int n = 0; n < out->num(); n++) {
-      for (int o = 0; o < out->channels(); o++) {
-        for (int y = 0; y < out->height(); y++) {
-          for (int x = 0; x < out->width(); x++) {
-            out_data[out->offset(n, o, y, x)] += bias_data[o];
+    for (int n = 0; n < out->shape(0); n++) {
+      for (int o = 0; o < out->shape(1); o++) {
+        for (int z = 0; z < (has_depth ? out->shape(2) : 1); z++) {
+          for (int y = 0; y < out->shape(2 + has_depth); y++) {
+            for (int x = 0; x < out->shape(3 + has_depth); x++) {
+              out_offset[0] = n;
+              out_offset[1] = o;
+              if (has_depth) { out_offset[2] = z; }
+              out_offset[2 + has_depth] = y;
+              out_offset[3 + has_depth] = x;
+              out_data[out->offset(out_offset)] += bias_data[o];
+            }
           }
         }
       }
@@ -150,8 +190,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSetup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
@@ -188,8 +228,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("constant");
@@ -217,13 +257,98 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
   }
 }
 
+TYPED_TEST(ConvolutionLayerTest, Test0DConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  const int kNumOutput = 3;
+  convolution_param->set_num_output(kNumOutput);
+  convolution_param->set_axis(3);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new ConvolutionLayer<Dtype>(layer_param));
+  vector<int> top_shape = this->blob_bottom_->shape();
+  top_shape[3] = kNumOutput;
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(top_shape, this->blob_top_->shape());
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  vector<int> weight_offset(2);
+  const Blob<Dtype>* weight = layer->blobs()[0].get();
+  const Blob<Dtype>* bias = layer->blobs()[1].get();
+  const int num = this->blob_top_->count(3);
+  const int dim = this->blob_top_->shape(3);
+  const int bottom_dim = this->blob_bottom_->shape(3);
+  for (int n = 0; n < num; ++n) {
+    for (int d = 0; d < dim; ++d) {
+      weight_offset[0] = d;
+      Dtype value = bias->cpu_data()[d];
+      for (int bottom_d = 0; bottom_d < bottom_dim; ++bottom_d) {
+        weight_offset[1] = bottom_d;
+        value += weight->data_at(weight_offset) *
+                 this->blob_bottom_->cpu_data()[n * bottom_dim + bottom_d];
+      }
+      EXPECT_NEAR(value, this->blob_top_->cpu_data()[n * dim + d], 1e-4);
+    }
+  }
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestSimple3DConvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 5;
+  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  shared_ptr<Layer<Dtype> > layer(
+      new ConvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Check against reference convolution.
+  const Dtype* top_data;
+  const Dtype* ref_top_data;
+  caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_));
+  top_data = this->blob_top_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+  caffe_conv(this->blob_bottom_2_, convolution_param, layer->blobs(),
+      this->MakeReferenceTop(this->blob_top_2_));
+  top_data = this->blob_top_2_->cpu_data();
+  ref_top_data = this->ref_blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_top_->count(); ++i) {
+    EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
+  }
+}
+
 TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(1);
-  convolution_param->set_stride(1);
+  convolution_param->add_kernel_size(1);
+  convolution_param->add_stride(1);
   convolution_param->set_num_output(4);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("constant");
@@ -249,8 +374,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(3);
   convolution_param->set_group(3);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
@@ -288,8 +413,8 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(1);
   convolution_param->set_bias_term(false);
   shared_ptr<Layer<Dtype> > layer(
@@ -350,14 +475,11 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) {
   convolution_param->set_bias_term(false);
   layer.reset(new ConvolutionLayer<Dtype>(layer_param));
   layer->blobs().resize(1);
-  layer->blobs()[0].reset(new Blob<Dtype>(1, 3, 1, 3));
+  layer->blobs()[0].reset(new Blob<Dtype>(1, 1, 1, 3));
   Dtype* weights_2 = layer->blobs()[0]->mutable_cpu_data();
-  for (int c = 0; c < 3; ++c) {
-    int i = c * 3;  // 1 x 3 filter
-    weights_2[i +  0] = -1;
-    weights_2[i +  1] =  0;
-    weights_2[i +  2] =  1;
-  }
+  weights_2[0] = -1;
+  weights_2[1] =  0;
+  weights_2[2] =  1;
   layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
   layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
   // Test equivalence of full and separable filters.
@@ -368,6 +490,124 @@ TYPED_TEST(ConvolutionLayerTest, TestSobelConvolution) {
   }
 }
 
+TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kernel_h = 11;
+  const int kernel_w = 13;
+  vector<int> bottom_shape(4);
+  bottom_shape[0] = 15;
+  bottom_shape[1] = 18;
+  bottom_shape[2] = kernel_h * 2;
+  bottom_shape[3] = kernel_w * 2;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->set_num_output(12);
+  convolution_param->set_bias_term(false);
+  convolution_param->set_group(6);
+  convolution_param->set_kernel_h(kernel_h);
+  convolution_param->set_kernel_w(kernel_w);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  Blob<Dtype> weights;
+  Blob<Dtype> top_diff;
+  // Shape and fill weights and top_diff.
+  bool copy_diff;
+  bool reshape;
+  {
+    ConvolutionLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    top_diff.ReshapeLike(*this->blob_top_);
+    filler.Fill(&top_diff);
+    ASSERT_EQ(1, layer.blobs().size());
+    copy_diff = false; reshape = true;
+    weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
+  }
+  vector<bool> propagate_down(1, true);
+  Blob<Dtype> result_2d;
+  Blob<Dtype> backward_result_2d;
+  Blob<Dtype> backward_weight_result_2d;
+  // Test with 2D im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_2d.
+    convolution_param->set_force_nd_im2col(false);
+    ConvolutionLayer<Dtype> layer_2d(layer_param);
+    layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_2d.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_2d.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_2d.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
+  }
+  Blob<Dtype> result_nd;
+  Blob<Dtype> backward_result_nd;
+  Blob<Dtype> backward_weight_result_nd;
+  // Test with ND im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_nd.
+    convolution_param->set_force_nd_im2col(true);
+    ConvolutionLayer<Dtype> layer_nd(layer_param);
+    layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_nd.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_nd.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_nd.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
+  }
+  ASSERT_EQ(result_nd.count(), result_2d.count());
+  for (int i = 0; i < result_2d.count(); ++i)  {
+    EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]);
+  }
+  ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
+  for (int i = 0; i < backward_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+              backward_result_nd.cpu_diff()[i]);
+  }
+  ASSERT_EQ(backward_weight_result_nd.count(),
+            backward_weight_result_2d.count());
+  for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
+              backward_weight_result_nd.cpu_diff()[i]);
+  }
+}
+
 TYPED_TEST(ConvolutionLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -375,8 +615,36 @@ TYPED_TEST(ConvolutionLayerTest, TestGradient) {
       layer_param.mutable_convolution_param();
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  ConvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(ConvolutionLayerTest, TestGradient3D) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 5;
+  bottom_shape[3] = this->blob_bottom_vec_[0]->shape(2);
+  bottom_shape[4] = this->blob_bottom_vec_[0]->shape(3);
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(2);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
@@ -393,8 +661,8 @@ TYPED_TEST(ConvolutionLayerTest, Test1x1Gradient) {
       layer_param.mutable_convolution_param();
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->set_kernel_size(1);
-  convolution_param->set_stride(1);
+  convolution_param->add_kernel_size(1);
+  convolution_param->add_stride(1);
   convolution_param->set_num_output(2);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
@@ -409,8 +677,8 @@ TYPED_TEST(ConvolutionLayerTest, TestGradientGroup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(3);
   convolution_param->set_group(3);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
@@ -472,8 +740,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSetupCuDNN) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
@@ -509,8 +777,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionCuDNN) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("constant");
@@ -542,8 +810,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSimpleConvolutionGroupCuDNN) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(3);
   convolution_param->set_group(3);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
@@ -581,8 +849,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(1);
   convolution_param->set_bias_term(false);
   shared_ptr<Layer<TypeParam> > layer(
@@ -643,14 +911,11 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestSobelConvolutionCuDNN) {
   convolution_param->set_bias_term(false);
   layer.reset(new CuDNNConvolutionLayer<TypeParam>(layer_param));
   layer->blobs().resize(1);
-  layer->blobs()[0].reset(new Blob<TypeParam>(1, 3, 1, 3));
+  layer->blobs()[0].reset(new Blob<TypeParam>(1, 1, 1, 3));
   TypeParam* weights_2 = layer->blobs()[0]->mutable_cpu_data();
-  for (int c = 0; c < 3; ++c) {
-    int i = c * 3;  // 1 x 3 filter
-    weights_2[i +  0] = -1;
-    weights_2[i +  1] =  0;
-    weights_2[i +  2] =  1;
-  }
+  weights_2[0] = -1;
+  weights_2[1] =  0;
+  weights_2[2] =  1;
   layer->SetUp(sep_blob_bottom_vec, sep_blob_top_vec);
   layer->Forward(sep_blob_bottom_vec, sep_blob_top_vec);
   // Test equivalence of full and separable filters.
@@ -667,8 +932,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientCuDNN) {
       layer_param.mutable_convolution_param();
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(2);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
@@ -682,8 +947,8 @@ TYPED_TEST(CuDNNConvolutionLayerTest, TestGradientGroupCuDNN) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(3);
   convolution_param->set_group(3);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
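
Most of the churn in this test file comes from ConvolutionParameter turning kernel_size, stride, and pad into repeated fields so that N-D convolution (and force_nd_im2col) can be expressed; set_kernel_size(3) therefore becomes add_kernel_size(3), with a single repeated entry applying to every spatial axis. A hedged sketch of configuring a layer under the new accessors, assuming the generated header at the usual caffe/proto/caffe.pb.h path:

    #include "caffe/proto/caffe.pb.h"

    // Configure a square 3x3, stride-2 convolution with the repeated-field API.
    // One repeated entry is broadcast to all spatial axes; kernel_h/kernel_w
    // remain available for the rectangular 2-D case, as TestRect shows.
    void ConfigureSquareConv(caffe::LayerParameter* layer_param) {
      caffe::ConvolutionParameter* conv = layer_param->mutable_convolution_param();
      conv->add_kernel_size(3);
      conv->add_stride(2);
      conv->add_pad(0);          // optional; treated as 0 when unset
      conv->set_num_output(4);
    }

The reference caffe_conv at the top of the file reads these with kernel_size(0), stride(0), and pad(0) fall-backs when the explicit _h/_w fields are absent, so both spellings stay covered.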
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py
index 3703b41..2645073 100644
--- a/src/caffe/test/test_data/generate_sample_data.py
+++ b/src/caffe/test/test_data/generate_sample_data.py
@@ -43,16 +43,18 @@ with h5py.File(script_dir + '/sample_data_2_gzip.h5', 'w') as f:
     )
     f.create_dataset(
         'label', data=label,
-        compression='gzip', compression_opts=1
+        compression='gzip', compression_opts=1,
+        dtype='uint8',
     )
     f.create_dataset(
         'label2', data=label2,
-        compression='gzip', compression_opts=1
+        compression='gzip', compression_opts=1,
+        dtype='uint8',
     )
 
 with open(script_dir + '/sample_data_list.txt', 'w') as f:
-    f.write(script_dir + '/sample_data.h5\n')
-    f.write(script_dir + '/sample_data_2_gzip.h5\n')
+    f.write('src/caffe/test/test_data/sample_data.h5\n')
+    f.write('src/caffe/test/test_data/sample_data_2_gzip.h5\n')
 
 # Generate GradientBasedSolver solver_data.h5
 
@@ -76,4 +78,4 @@ with h5py.File(script_dir + '/solver_data.h5', 'w') as f:
     f['targets'] = targets
 
 with open(script_dir + '/solver_data_list.txt', 'w') as f:
-    f.write(script_dir + '/solver_data.h5\n')
+    f.write('src/caffe/test/test_data/solver_data.h5\n')
diff --git a/src/caffe/test/test_data/sample_data_2_gzip.h5 b/src/caffe/test/test_data/sample_data_2_gzip.h5
index a138e03..0cb9ef9 100644
Binary files a/src/caffe/test/test_data/sample_data_2_gzip.h5 and b/src/caffe/test/test_data/sample_data_2_gzip.h5 differ
diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp
index afe2a40..9e03954 100644
--- a/src/caffe/test/test_data_layer.cpp
+++ b/src/caffe/test/test_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <string>
 #include <vector>
 
@@ -348,6 +349,7 @@ class DataLayerTest : public MultiDeviceTest<TypeParam> {
 
 TYPED_TEST_CASE(DataLayerTest, TestDtypesAndDevices);
 
+#ifdef USE_LEVELDB
 TYPED_TEST(DataLayerTest, TestReadLevelDB) {
   const bool unique_pixels = false;  // all pixels the same; images different
   this->Fill(unique_pixels, DataParameter_DB_LEVELDB);
@@ -385,7 +387,9 @@ TYPED_TEST(DataLayerTest, TestReadCropTestLevelDB) {
   this->Fill(unique_pixels, DataParameter_DB_LEVELDB);
   this->TestReadCrop(TEST);
 }
+#endif  // USE_LEVELDB
 
+#ifdef USE_LMDB
 TYPED_TEST(DataLayerTest, TestReadLMDB) {
   const bool unique_pixels = false;  // all pixels the same; images different
   this->Fill(unique_pixels, DataParameter_DB_LMDB);
@@ -424,4 +428,6 @@ TYPED_TEST(DataLayerTest, TestReadCropTestLMDB) {
   this->TestReadCrop(TEST);
 }
 
+#endif  // USE_LMDB
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/test/test_data_transformer.cpp b/src/caffe/test/test_data_transformer.cpp
index 16570e2..8a10137 100644
--- a/src/caffe/test/test_data_transformer.cpp
+++ b/src/caffe/test/test_data_transformer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <string>
 #include <vector>
 
@@ -353,3 +354,4 @@ TYPED_TEST(DataTransformTest, TestMeanFile) {
 }
 
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/test/test_db.cpp b/src/caffe/test/test_db.cpp
index 5b2ac23..1b487b1 100644
--- a/src/caffe/test/test_db.cpp
+++ b/src/caffe/test/test_db.cpp
@@ -1,3 +1,4 @@
+#if defined(USE_LEVELDB) && defined(USE_LMDB) && defined(USE_OPENCV)
 #include <string>
 
 #include "boost/scoped_ptr.hpp"
@@ -132,3 +133,4 @@ TYPED_TEST(DBTest, TestWrite) {
 }
 
 }  // namespace caffe
+#endif  // USE_LEVELDB, USE_LMDB and USE_OPENCV
diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp
index fc63d5e..770e7b2 100644
--- a/src/caffe/test/test_deconvolution_layer.cpp
+++ b/src/caffe/test/test_deconvolution_layer.cpp
@@ -58,8 +58,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestSetup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
@@ -96,8 +96,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestSimpleDeconvolution) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   convolution_param->set_num_output(4);
   convolution_param->mutable_weight_filler()->set_type("constant");
   convolution_param->mutable_weight_filler()->set_value(1);
@@ -144,8 +144,8 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient) {
       layer_param.mutable_convolution_param();
   this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
   this->blob_top_vec_.push_back(this->blob_top_2_);
-  convolution_param->set_kernel_size(2);
-  convolution_param->set_stride(1);
+  convolution_param->add_kernel_size(2);
+  convolution_param->add_stride(1);
   convolution_param->set_num_output(1);
   convolution_param->mutable_weight_filler()->set_type("gaussian");
   convolution_param->mutable_bias_filler()->set_type("gaussian");
@@ -155,4 +155,151 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(DeconvolutionLayerTest, TestNDAgainst2D) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kernel_h = 11;
+  const int kernel_w = 13;
+  vector<int> bottom_shape(4);
+  bottom_shape[0] = 15;
+  bottom_shape[1] = 12;
+  bottom_shape[2] = kernel_h * 2;
+  bottom_shape[3] = kernel_w * 2;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->set_num_output(18);
+  convolution_param->set_bias_term(false);
+  convolution_param->set_group(6);
+  convolution_param->set_kernel_h(kernel_h);
+  convolution_param->set_kernel_w(kernel_w);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  Blob<Dtype> weights;
+  Blob<Dtype> top_diff;
+  // Shape and fill weights and top_diff.
+  bool copy_diff;
+  bool reshape;
+  {
+    DeconvolutionLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    top_diff.ReshapeLike(*this->blob_top_);
+    filler.Fill(&top_diff);
+    ASSERT_EQ(1, layer.blobs().size());
+    copy_diff = false; reshape = true;
+    weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
+  }
+  vector<bool> propagate_down(1, true);
+  Blob<Dtype> result_2d;
+  Blob<Dtype> backward_result_2d;
+  Blob<Dtype> backward_weight_result_2d;
+  // Test with 2D im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_2d.
+    convolution_param->set_force_nd_im2col(false);
+    DeconvolutionLayer<Dtype> layer_2d(layer_param);
+    layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_2d.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_2d.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_2d.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
+  }
+  Blob<Dtype> result_nd;
+  Blob<Dtype> backward_result_nd;
+  Blob<Dtype> backward_weight_result_nd;
+  // Test with ND im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_nd.
+    convolution_param->set_force_nd_im2col(true);
+    DeconvolutionLayer<Dtype> layer_nd(layer_param);
+    layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_nd.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_nd.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_nd.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
+  }
+  ASSERT_EQ(result_nd.count(), result_2d.count());
+  for (int i = 0; i < result_2d.count(); ++i)  {
+    EXPECT_EQ(result_2d.cpu_data()[i], result_nd.cpu_data()[i]);
+  }
+  ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
+  for (int i = 0; i < backward_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+              backward_result_nd.cpu_diff()[i]);
+  }
+  ASSERT_EQ(backward_weight_result_nd.count(),
+            backward_weight_result_2d.count());
+  for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
+              backward_weight_result_nd.cpu_diff()[i]);
+  }
+}
+
+TYPED_TEST(DeconvolutionLayerTest, TestGradient3D) {
+  typedef typename TypeParam::Dtype Dtype;
+  vector<int> bottom_shape(5);
+  bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0);
+  bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1);
+  bottom_shape[2] = 2;
+  bottom_shape[3] = 3;
+  bottom_shape[4] = 2;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(2);
+  convolution_param->add_stride(2);
+  convolution_param->add_pad(1);
+  convolution_param->set_num_output(2);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  DeconvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp
index be0c134..8031f6e 100644
--- a/src/caffe/test/test_eltwise_layer.cpp
+++ b/src/caffe/test/test_eltwise_layer.cpp
@@ -80,7 +80,7 @@ TYPED_TEST(EltwiseLayerTest, TestProd) {
   const Dtype* in_data_b = this->blob_bottom_b_->cpu_data();
   const Dtype* in_data_c = this->blob_bottom_c_->cpu_data();
   for (int i = 0; i < count; ++i) {
-    EXPECT_EQ(data[i], in_data_a[i] * in_data_b[i] * in_data_c[i]);
+    EXPECT_NEAR(data[i], in_data_a[i] * in_data_b[i] * in_data_c[i], 1e-4);
   }
 }
 
@@ -99,7 +99,7 @@ TYPED_TEST(EltwiseLayerTest, TestSum) {
   const Dtype* in_data_b = this->blob_bottom_b_->cpu_data();
   const Dtype* in_data_c = this->blob_bottom_c_->cpu_data();
   for (int i = 0; i < count; ++i) {
-    EXPECT_EQ(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i]);
+    EXPECT_NEAR(data[i], in_data_a[i] + in_data_b[i] + in_data_c[i], 1e-4);
   }
 }
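
Switching the eltwise product and sum checks from EXPECT_EQ to EXPECT_NEAR presumably acknowledges that the layer may accumulate the three operands in a different order than the reference loop; single-precision addition and multiplication are not associative, so bit-exact equality is too strict. A tiny self-contained illustration of the tolerance comparison (values chosen arbitrarily):

    #include <cmath>
    #include <cstdio>

    // Tolerance check in the spirit of EXPECT_NEAR(a, b, 1e-4).
    bool NearlyEqual(float a, float b, float tol) {
      return std::fabs(a - b) <= tol;
    }

    int main() {
      float a = 0.1f, b = 0.2f, c = 0.3f;
      // Different association orders can differ in the last bits,
      // yet still compare equal within the 1e-4 tolerance.
      std::printf("bitwise equal: %d, near: %d\n",
                  (a + b) + c == a + (b + c),
                  NearlyEqual((a + b) + c, a + (b + c), 1e-4f));
      return 0;
    }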
 
diff --git a/src/caffe/test/test_im2col_kernel.cu b/src/caffe/test/test_im2col_kernel.cu
index 0017ac2..f0b75fc 100644
--- a/src/caffe/test/test_im2col_kernel.cu
+++ b/src/caffe/test/test_im2col_kernel.cu
@@ -22,6 +22,12 @@ __global__ void im2col_gpu_kernel(const int n, const Dtype* data_im,
     const int height_col, const int width_col,
     Dtype* data_col);
 
+template <typename Dtype, int num_axes>
+__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col);
+
 extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
 
 template <typename Dtype>
@@ -30,11 +36,18 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
   Im2colKernelTest()
         // big so launches > 1024 threads
       : blob_bottom_(new Blob<Dtype>(5, 500, 10, 10)),
+        blob_kernel_shape_(new Blob<int>()),
+        blob_stride_(new Blob<int>()),
+        blob_pad_(new Blob<int>()),
         blob_top_(new Blob<Dtype>()),
         blob_top_cpu_(new Blob<Dtype>()) {
     FillerParameter filler_param;
     GaussianFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_);
+    vector<int> dim_blob_shape(1, 2);
+    blob_kernel_shape_->Reshape(dim_blob_shape);
+    blob_stride_->Reshape(dim_blob_shape);
+    blob_pad_->Reshape(dim_blob_shape);
 
     height_ = blob_bottom_->height();
     width_ = blob_bottom_->width();
@@ -44,14 +57,26 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
     kernel_size_ = 3;
     height_col_ = (height_ + 2 * pad_ - kernel_size_) / stride_ + 1;
     width_col_ = (width_ + 2 * pad_ - kernel_size_) / stride_ + 1;
+
+    for (int i = 0; i < 2; ++i) {
+      blob_kernel_shape_->mutable_cpu_data()[i] = kernel_size_;
+      blob_stride_->mutable_cpu_data()[i] = stride_;
+      blob_pad_->mutable_cpu_data()[i] = pad_;
+    }
   }
 
   virtual ~Im2colKernelTest() {
-      delete blob_bottom_;
-      delete blob_top_;
-      delete blob_top_cpu_;
+    delete blob_bottom_;
+    delete blob_top_;
+    delete blob_top_cpu_;
+    delete blob_kernel_shape_;
+    delete blob_stride_;
+    delete blob_pad_;
   }
 
+  Blob<int>* const blob_kernel_shape_;
+  Blob<int>* const blob_stride_;
+  Blob<int>* const blob_pad_;
   Blob<Dtype>* const blob_bottom_;
   Blob<Dtype>* const blob_top_;
   Blob<Dtype>* const blob_top_cpu_;
@@ -67,7 +92,7 @@ class Im2colKernelTest : public GPUDeviceTest<Dtype> {
 
 TYPED_TEST_CASE(Im2colKernelTest, TestDtypes);
 
-TYPED_TEST(Im2colKernelTest, TestGPU) {
+TYPED_TEST(Im2colKernelTest, Test2D) {
   // Reshape the blobs to correct size for im2col output
   this->blob_top_->Reshape(this->blob_bottom_->num(),
           this->channels_ * this->kernel_size_ * this->kernel_size_,
@@ -122,4 +147,58 @@ TYPED_TEST(Im2colKernelTest, TestGPU) {
   }
 }
 
+TYPED_TEST(Im2colKernelTest, TestND) {
+  // Reshape the blobs to correct size for im2col output
+  this->blob_top_->Reshape(this->blob_bottom_->num(),
+      this->channels_ * this->kernel_size_ * this->kernel_size_,
+      this->height_col_,
+      this->width_col_);
+
+  this->blob_top_cpu_->ReshapeLike(*this->blob_top_);
+
+  const TypeParam* bottom_data_cpu = this->blob_bottom_->cpu_data();
+  TypeParam* top_data_cpu = this->blob_top_cpu_->mutable_cpu_data();
+
+  // CPU Version
+  for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+    im2col_nd_cpu(bottom_data_cpu + this->blob_bottom_->offset(n), 2,
+        this->blob_bottom_->shape().data() + 1,
+        this->blob_top_cpu_->shape().data() + 1,
+        this->blob_kernel_shape_->cpu_data(),
+        this->blob_pad_->cpu_data(), this->blob_stride_->cpu_data(),
+        top_data_cpu + this->blob_top_cpu_->offset(n));
+  }
+
+  // GPU version
+  int num_kernels = this->channels_ * this->height_col_ * this->width_col_;
+  int default_grid_dim = CAFFE_GET_BLOCKS(num_kernels);
+  const TypeParam* bottom_data_gpu = this->blob_bottom_->gpu_data();
+
+  // Launch with different grid sizes
+  for (int grid_div = 2; grid_div <= 8; grid_div++) {
+    for (int n = 0; n < this->blob_bottom_->num(); ++n) {
+      const int grid_dim = default_grid_dim / grid_div;
+      TypeParam* top_data_gpu = this->blob_top_->mutable_gpu_data();
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      im2col_nd_gpu_kernel<TypeParam, 2><<<grid_dim, CAFFE_CUDA_NUM_THREADS>>>(
+          num_kernels, bottom_data_gpu + this->blob_bottom_->offset(n),
+          this->blob_bottom_->gpu_shape() + 1, this->blob_top_->gpu_shape() + 1,
+          this->blob_kernel_shape_->gpu_data(), this->blob_pad_->gpu_data(),
+          this->blob_stride_->gpu_data(),
+          top_data_gpu + this->blob_top_->offset(n));
+      CUDA_POST_KERNEL_CHECK;
+    }
+
+    // Compare results against CPU version
+    for (int i = 0; i < this->blob_top_->count(); ++i) {
+      TypeParam cpuval = top_data_cpu[i];
+      TypeParam gpuval = this->blob_top_->cpu_data()[i];
+      EXPECT_EQ(cpuval, gpuval);
+      if (cpuval != gpuval) {
+        break;
+      }
+    }
+  }
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_im2col_layer.cpp b/src/caffe/test/test_im2col_layer.cpp
index f50abe1..293aa26 100644
--- a/src/caffe/test/test_im2col_layer.cpp
+++ b/src/caffe/test/test_im2col_layer.cpp
@@ -21,6 +21,7 @@ class Im2colLayerTest : public MultiDeviceTest<TypeParam> {
       : blob_bottom_(new Blob<Dtype>(2, 3, 6, 5)),
         blob_top_(new Blob<Dtype>()) {
     // fill the values
+    Caffe::set_random_seed(1701);
     FillerParameter filler_param;
     GaussianFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_);
@@ -41,8 +42,8 @@ TYPED_TEST(Im2colLayerTest, TestSetup) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   Im2colLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 2);
@@ -56,8 +57,8 @@ TYPED_TEST(Im2colLayerTest, TestForward) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   Im2colLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -73,14 +74,27 @@ TYPED_TEST(Im2colLayerTest, TestGradient) {
   LayerParameter layer_param;
   ConvolutionParameter* convolution_param =
       layer_param.mutable_convolution_param();
-  convolution_param->set_kernel_size(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
   Im2colLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-2);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_);
 }
 
+TYPED_TEST(Im2colLayerTest, TestGradientForceND) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_force_nd_im2col(true);
+  Im2colLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
 
 TYPED_TEST(Im2colLayerTest, TestRect) {
   typedef typename TypeParam::Dtype Dtype;
@@ -89,7 +103,7 @@ TYPED_TEST(Im2colLayerTest, TestRect) {
       layer_param.mutable_convolution_param();
   convolution_param->set_kernel_h(5);
   convolution_param->set_kernel_w(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_stride(2);
   Im2colLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -108,7 +122,7 @@ TYPED_TEST(Im2colLayerTest, TestRectGradient) {
       layer_param.mutable_convolution_param();
   convolution_param->set_kernel_h(5);
   convolution_param->set_kernel_w(3);
-  convolution_param->set_stride(2);
+  convolution_param->add_stride(2);
   Im2colLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-2);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
diff --git a/src/caffe/test/test_image_data_layer.cpp b/src/caffe/test/test_image_data_layer.cpp
index 931a5eb..481fcef 100644
--- a/src/caffe/test/test_image_data_layer.cpp
+++ b/src/caffe/test/test_image_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <map>
 #include <string>
 #include <vector>
@@ -177,3 +178,4 @@ TYPED_TEST(ImageDataLayerTest, TestShuffle) {
 }
 
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/test/test_io.cpp b/src/caffe/test/test_io.cpp
index 4ab9631..c2c919e 100644
--- a/src/caffe/test/test_io.cpp
+++ b/src/caffe/test/test_io.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/highgui/highgui_c.h>
@@ -420,3 +421,4 @@ TEST_F(IOTest, TestDecodeDatumToCVMatContentNative) {
 }
 
 }  // namespace caffe
+#endif  // USE_OPENCV
diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp
index c86fafd..7d5d39d 100644
--- a/src/caffe/test/test_layer_factory.cpp
+++ b/src/caffe/test/test_layer_factory.cpp
@@ -31,12 +31,16 @@ TYPED_TEST(LayerFactoryTest, TestCreateLayer) {
     LayerParameter layer_param;
     // Data layers expect a DB
     if (iter->first == "Data") {
+#ifdef USE_LEVELDB
       string tmp;
       MakeTempDir(&tmp);
       boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
       db->Open(tmp, db::NEW);
       db->Close();
       layer_param.mutable_data_param()->set_source(tmp);
+#else
+      continue;
+#endif  // USE_LEVELDB
     }
     layer_param.set_type(iter->first);
     layer = LayerRegistry<Dtype>::CreateLayer(layer_param);
diff --git a/src/caffe/test/test_memory_data_layer.cpp b/src/caffe/test/test_memory_data_layer.cpp
index a79033f..7269a4d 100644
--- a/src/caffe/test/test_memory_data_layer.cpp
+++ b/src/caffe/test/test_memory_data_layer.cpp
@@ -1,4 +1,6 @@
+#ifdef USE_OPENCV
 #include <opencv2/core/core.hpp>
+#endif  // USE_OPENCV
 
 #include <string>
 #include <vector>
@@ -113,6 +115,7 @@ TYPED_TEST(MemoryDataLayerTest, TestForward) {
   }
 }
 
+#ifdef USE_OPENCV
 TYPED_TEST(MemoryDataLayerTest, AddDatumVectorDefaultTransform) {
   typedef typename TypeParam::Dtype Dtype;
 
@@ -292,5 +295,5 @@ TYPED_TEST(MemoryDataLayerTest, TestSetBatchSize) {
     }
   }
 }
-
+#endif  // USE_OPENCV
 }  // namespace caffe
diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp
index 12998d8..ab4afba 100644
--- a/src/caffe/test/test_net.cpp
+++ b/src/caffe/test/test_net.cpp
@@ -2262,15 +2262,17 @@ TEST_F(FilterNetTest, TestFilterInOutByExcludeMultiRule) {
 TYPED_TEST(NetTest, TestReshape) {
   typedef typename TypeParam::Dtype Dtype;
   // We set up bottom blobs of two different sizes, switch between
-  // them, and check that forward and backward both run and the results
-  // are the same.
+  // them, check that forward and backward both run and the results
+  // are the same, and check that the output shapes change.
   Caffe::set_random_seed(this->seed_);
   Caffe::set_mode(Caffe::CPU);
   FillerParameter filler_param;
   filler_param.set_std(1);
   GaussianFiller<Dtype> filler(filler_param);
-  Blob<Dtype> blob1(4, 3, 9, 11);
-  Blob<Dtype> blob2(2, 3, 12, 10);
+  // Check smaller shape first as larger first could hide realloc failures.
+  Blob<Dtype> blob1(2, 3, 12, 10);
+  Blob<Dtype> blob2(4, 3, 9, 11);
+  ASSERT_LT(blob1.count(), blob2.count());
   filler.Fill(&blob1);
   filler.Fill(&blob2);
 
@@ -2304,7 +2306,7 @@ TYPED_TEST(NetTest, TestReshape) {
   this->net_->ForwardPrefilled();
   this->net_->Backward();
   for (int i = 0; i < output1.count(); ++i) {
-    CHECK_EQ(*(output1.cpu_data() + i), *(output_blob->cpu_data() + i));
+    EXPECT_FLOAT_EQ(*(output1.cpu_data() + i), *(output_blob->cpu_data() + i));
   }
 
   input_blob->Reshape(blob2.num(), blob2.channels(), blob2.height(),
@@ -2313,8 +2315,20 @@ TYPED_TEST(NetTest, TestReshape) {
   this->net_->ForwardPrefilled();
   this->net_->Backward();
   for (int i = 0; i < output2.count(); ++i) {
-    CHECK_EQ(*(output2.cpu_data() + i), *(output_blob->cpu_data() + i));
+    EXPECT_FLOAT_EQ(*(output2.cpu_data() + i), *(output_blob->cpu_data() + i));
   }
+
+  EXPECT_EQ(output1.num(), blob1.num());
+  EXPECT_EQ(output2.num(), blob2.num());
+  bool same_spatial_shape = true;
+  const int kFirstSpatialAxis = 2;
+  for (int i = kFirstSpatialAxis; i < output1.num_axes(); ++i) {
+    if (output1.shape(i) != output2.shape(i)) {
+      same_spatial_shape = false;
+      break;
+    }
+  }
+  EXPECT_FALSE(same_spatial_shape);
 }
 
 TYPED_TEST(NetTest, TestSkipPropagateDown) {
diff --git a/src/caffe/test/test_slice_layer.cpp b/src/caffe/test/test_slice_layer.cpp
index ccd0364..2d2d0fd 100644
--- a/src/caffe/test/test_slice_layer.cpp
+++ b/src/caffe/test/test_slice_layer.cpp
@@ -88,6 +88,21 @@ TYPED_TEST(SliceLayerTest, TestSetupChannels) {
   EXPECT_EQ(this->blob_bottom_->width(), this->blob_top_0_->width());
 }
 
+TYPED_TEST(SliceLayerTest, TestTrivialSlice) {
+  // Test the trivial (single output) "slice" operation --
+  // should be the identity.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SliceLayer<Dtype> layer(layer_param);
+  this->blob_top_vec_0_.resize(1);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_0_);
+  ASSERT_EQ(this->blob_bottom_->shape(), this->blob_top_0_->shape());
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_EQ(this->blob_bottom_->cpu_data()[i],
+              this->blob_top_0_->cpu_data()[i]);
+  }
+}
+
 TYPED_TEST(SliceLayerTest, TestSliceAcrossNum) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -161,6 +176,18 @@ TYPED_TEST(SliceLayerTest, TestSliceAcrossChannels) {
   }
 }
 
+TYPED_TEST(SliceLayerTest, TestGradientTrivial) {
+  // Test the trivial (single output) "slice" operation --
+  // should be the identity.
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SliceLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  this->blob_top_vec_0_.resize(1);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_0_);
+}
+
 TYPED_TEST(SliceLayerTest, TestGradientAcrossNum) {
   typedef typename TypeParam::Dtype Dtype;
   // Gradient checks are slow; reduce blob size.
diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp
index 0067202..ee05b15 100644
--- a/src/caffe/test/test_upgrade_proto.cpp
+++ b/src/caffe/test/test_upgrade_proto.cpp
@@ -2892,6 +2892,7 @@ TEST_F(NetUpgradeTest, TestImageNet) {
   this->RunV1UpgradeTest(expected_v1_proto, expected_v2_proto);
 }  // NOLINT(readability/fn_size)
 
+#ifdef USE_OPENCV
 TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
   LayerParameter layer_param;
   shared_ptr<Layer<float> > layer;
@@ -2906,16 +2907,25 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
     layer_param.set_type(v2_layer_type);
     // Data layers expect a DB
     if (v2_layer_type == "Data") {
+      #ifdef USE_LEVELDB
       string tmp;
       MakeTempDir(&tmp);
       boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
       db->Open(tmp, db::NEW);
       db->Close();
       layer_param.mutable_data_param()->set_source(tmp);
+      #else
+      continue;
+      #endif  // USE_LEVELDB
     }
+    #ifndef USE_OPENCV
+    if (v2_layer_type == "ImageData" || v2_layer_type == "WindowData") {
+      continue;
+    }
+    #endif  // !USE_OPENCV
     layer = LayerRegistry<float>::CreateLayer(layer_param);
     EXPECT_EQ(v2_layer_type, layer->type());
   }
 }
-
+#endif  // USE_OPENCV
 }  // NOLINT(readability/fn_size)  // namespace caffe
diff --git a/src/caffe/util/db.cpp b/src/caffe/util/db.cpp
index f55420e..ccda054 100644
--- a/src/caffe/util/db.cpp
+++ b/src/caffe/util/db.cpp
@@ -8,23 +8,31 @@ namespace caffe { namespace db {
 
 DB* GetDB(DataParameter::DB backend) {
   switch (backend) {
+#ifdef USE_LEVELDB
   case DataParameter_DB_LEVELDB:
     return new LevelDB();
+#endif  // USE_LEVELDB
+#ifdef USE_LMDB
   case DataParameter_DB_LMDB:
     return new LMDB();
+#endif  // USE_LMDB
   default:
     LOG(FATAL) << "Unknown database backend";
   }
 }
 
 DB* GetDB(const string& backend) {
+#ifdef USE_LEVELDB
   if (backend == "leveldb") {
     return new LevelDB();
-  } else if (backend == "lmdb") {
+  }
+#endif  // USE_LEVELDB
+#ifdef USE_LMDB
+  if (backend == "lmdb") {
     return new LMDB();
-  } else {
-    LOG(FATAL) << "Unknown database backend";
   }
+#endif  // USE_LMDB
+  LOG(FATAL) << "Unknown database backend";
 }
 
 }  // namespace db
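
With the backends now compiled in only when USE_LEVELDB / USE_LMDB are defined, the factory above fails fast for any backend that was excluded at build time. A minimal usage sketch, assuming an LMDB-enabled build; the database path is hypothetical:

    #include <boost/scoped_ptr.hpp>
    #include <string>

    #include "caffe/util/db.hpp"

    int main() {
      // Select a backend by name; with this patch the call aborts with
      // LOG(FATAL) unless the matching USE_* macro was defined at build time.
      boost::scoped_ptr<caffe::db::DB> db(caffe::db::GetDB("lmdb"));
      db->Open("/tmp/example_lmdb", caffe::db::READ);  // hypothetical path
      boost::scoped_ptr<caffe::db::Cursor> cursor(db->NewCursor());
      for (cursor->SeekToFirst(); cursor->valid(); cursor->Next()) {
        // cursor->key() / cursor->value() hold serialized Datum records.
      }
      db->Close();
      return 0;
    }
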
diff --git a/src/caffe/util/db_leveldb.cpp b/src/caffe/util/db_leveldb.cpp
index 06c4662..f5c4d8a 100644
--- a/src/caffe/util/db_leveldb.cpp
+++ b/src/caffe/util/db_leveldb.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_LEVELDB
 #include "caffe/util/db_leveldb.hpp"
 
 #include <string>
@@ -19,3 +20,4 @@ void LevelDB::Open(const string& source, Mode mode) {
 
 }  // namespace db
 }  // namespace caffe
+#endif  // USE_LEVELDB
diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp
index a054b79..78dd880 100644
--- a/src/caffe/util/db_lmdb.cpp
+++ b/src/caffe/util/db_lmdb.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_LMDB
 #include "caffe/util/db_lmdb.hpp"
 
 #include <sys/stat.h>
@@ -49,3 +50,4 @@ void LMDBTransaction::Put(const string& key, const string& value) {
 
 }  // namespace db
 }  // namespace caffe
+#endif  // USE_LMDB
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index d0d05f7..7730e76 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -27,7 +27,34 @@ void hdf5_load_nd_dataset_helper(
   status = H5LTget_dataset_info(
       file_id, dataset_name_, dims.data(), &class_, NULL);
   CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
-  CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
+  switch (class_) {
+  case H5T_FLOAT:
+    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT";
+    break;
+  case H5T_INTEGER:
+    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER";
+    break;
+  case H5T_TIME:
+    LOG(FATAL) << "Unsupported datatype class: H5T_TIME";
+  case H5T_STRING:
+    LOG(FATAL) << "Unsupported datatype class: H5T_STRING";
+  case H5T_BITFIELD:
+    LOG(FATAL) << "Unsupported datatype class: H5T_BITFIELD";
+  case H5T_OPAQUE:
+    LOG(FATAL) << "Unsupported datatype class: H5T_OPAQUE";
+  case H5T_COMPOUND:
+    LOG(FATAL) << "Unsupported datatype class: H5T_COMPOUND";
+  case H5T_REFERENCE:
+    LOG(FATAL) << "Unsupported datatype class: H5T_REFERENCE";
+  case H5T_ENUM:
+    LOG(FATAL) << "Unsupported datatype class: H5T_ENUM";
+  case H5T_VLEN:
+    LOG(FATAL) << "Unsupported datatype class: H5T_VLEN";
+  case H5T_ARRAY:
+    LOG(FATAL) << "Unsupported datatype class: H5T_ARRAY";
+  default:
+    LOG(FATAL) << "Datatype class unknown";
+  }
 
   vector<int> blob_dims(dims.size());
   for (int i = 0; i < dims.size(); ++i) {
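
The class_ value tested above comes from HDF5 Lite's dataset query. A standalone sketch of the same check, assuming a file example.h5 with a dataset named data (both names hypothetical):

    #include <vector>

    #include "hdf5.h"
    #include "hdf5_hl.h"

    int main() {
      hid_t file_id = H5Fopen("example.h5", H5F_ACC_RDONLY, H5P_DEFAULT);
      int ndims = 0;
      H5LTget_dataset_ndims(file_id, "data", &ndims);
      std::vector<hsize_t> dims(ndims);
      H5T_class_t class_;
      H5LTget_dataset_info(file_id, "data", dims.data(), &class_, NULL);
      // Only float and integer datasets are accepted by the loader above;
      // everything else triggers LOG(FATAL).
      bool supported = (class_ == H5T_FLOAT || class_ == H5T_INTEGER);
      H5Fclose(file_id);
      return supported ? 0 : 1;
    }
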
diff --git a/src/caffe/util/im2col.cpp b/src/caffe/util/im2col.cpp
index c48f31f..b0a7be5 100644
--- a/src/caffe/util/im2col.cpp
+++ b/src/caffe/util/im2col.cpp
@@ -1,6 +1,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
+#include <vector>
 
 #include "caffe/util/im2col.hpp"
 #include "caffe/util/math_functions.hpp"
@@ -45,6 +46,98 @@ template void im2col_cpu<double>(const double* data_im, const int channels,
     const int stride_w, double* data_col);
 
 template <typename Dtype>
+inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
+    const int num_spatial_axes, const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_output) {
+  if (!im2col) {
+    int im_size = im_shape[0];
+    for (int i = 0; i < num_spatial_axes; ++i) {
+      im_size *= im_shape[1 + i];
+    }
+    caffe_set(im_size, Dtype(0), data_output);
+  }
+  int kernel_size = 1;
+  for (int i = 0; i < num_spatial_axes; ++i) {
+    kernel_size *= kernel_shape[i];
+  }
+  const int channels_col = col_shape[0];
+  vector<int> d_offset(num_spatial_axes, 0);
+  vector<int> d_iter(num_spatial_axes, 0);
+  for (int c = 0; c < channels_col; ++c) {
+    // Loop over spatial axes in reverse order to compute a per-axis offset.
+    int offset = c;
+    for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
+      if (d_i < num_spatial_axes - 1) {
+        offset /= kernel_shape[d_i + 1];
+      }
+      d_offset[d_i] = offset % kernel_shape[d_i];
+    }
+    for (bool incremented = true; incremented; ) {
+      // Loop over spatial axes in forward order to compute the indices in the
+      // image and column, and whether the index lies in the padding.
+      int index_col = c;
+      int index_im = c / kernel_size;
+      bool is_padding = false;
+      for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
+        const int d = d_iter[d_i];
+        const int d_pad = d * stride[d_i] - pad[d_i] + d_offset[d_i];
+        is_padding |= d_pad < 0 || d_pad >= im_shape[d_i + 1];
+        index_col *= col_shape[d_i + 1];
+        index_col += d;
+        index_im *= im_shape[d_i + 1];
+        index_im += d_pad;
+      }
+      if (im2col) {
+        if (is_padding) {
+          data_output[index_col] = 0;
+        } else {
+          data_output[index_col] = data_input[index_im];
+        }
+      } else if (!is_padding) {  // col2im
+        data_output[index_im] += data_input[index_col];
+      }
+      // Loop over spatial axes in reverse order to choose an index,
+      // like counting.
+      incremented = false;
+      for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
+        const int d_max = col_shape[d_i + 1];
+        DCHECK_LT(d_iter[d_i], d_max);
+        if (d_iter[d_i] == d_max - 1) {
+          d_iter[d_i] = 0;
+        } else {  // d_iter[d_i] < d_max - 1
+          ++d_iter[d_i];
+          incremented = true;
+          break;
+        }
+      }
+    }  // while(incremented) {
+  }  // for (int c = 0; c < channels_col; ++c) {
+}
+
+template <typename Dtype>
+void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col) {
+  const bool kIm2Col = true;
+  im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,
+                  kernel_shape, pad, stride, data_col);
+}
+
+// Explicit instantiation
+template void im2col_nd_cpu<float>(const float* data_im,
+    const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    float* data_col);
+template void im2col_nd_cpu<double>(const double* data_im,
+    const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    double* data_col);
+
+template <typename Dtype>
 void col2im_cpu(const Dtype* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w,
@@ -80,4 +173,27 @@ template void col2im_cpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 
+template <typename Dtype>
+void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_im) {
+  const bool kIm2Col = false;
+  im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,
+                     kernel_shape, pad, stride, data_im);
+}
+
+// Explicit instantiation
+template void col2im_nd_cpu<float>(const float* data_col,
+    const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    float* data_im);
+template void col2im_nd_cpu<double>(const double* data_col,
+    const int num_spatial_axes,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    double* data_im);
+
+
 }  // namespace caffe
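
The N-D entry points above take raw shape arrays: im_shape[0] is the channel count followed by the spatial dimensions, col_shape[0] is channels times the kernel volume, and each spatial col dimension follows the usual output-size formula. A minimal 2-D sketch under those assumptions:

    #include <vector>

    #include "caffe/util/im2col.hpp"

    int main() {
      const int num_spatial_axes = 2;
      const int im_shape[]  = {3, 8, 8};   // channels, height, width
      const int kernel[]    = {3, 3};
      const int pad[]       = {1, 1};
      const int stride[]    = {1, 1};
      // col_shape[0] = channels * kernel_h * kernel_w; each spatial dim is
      // (input + 2 * pad - kernel) / stride + 1 = (8 + 2 - 3) / 1 + 1 = 8.
      const int col_shape[] = {3 * 3 * 3, 8, 8};
      std::vector<float> im(3 * 8 * 8, 1.0f);
      std::vector<float> col(col_shape[0] * col_shape[1] * col_shape[2]);
      caffe::im2col_nd_cpu(im.data(), num_spatial_axes, im_shape, col_shape,
                           kernel, pad, stride, col.data());
      // col now holds every 3x3 patch of the padded 3x8x8 input, one
      // (channel, kernel offset) pair per row.
      return 0;
    }
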
diff --git a/src/caffe/util/im2col.cu b/src/caffe/util/im2col.cu
index c90f93e..5a478ba 100644
--- a/src/caffe/util/im2col.cu
+++ b/src/caffe/util/im2col.cu
@@ -59,7 +59,6 @@ void im2col_gpu(const Dtype* data_im, const int channels,
   CUDA_POST_KERNEL_CHECK;
 }
 
-
 // Explicit instantiation
 template void im2col_gpu<float>(const float* data_im, const int channels,
     const int height, const int width, const int kernel_h, const int kernel_w,
@@ -70,6 +69,156 @@ template void im2col_gpu<double>(const double* data_im, const int channels,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w,
     double* data_col);
 
+template <typename Dtype, int num_axes>
+__global__ void im2col_nd_gpu_kernel(const int n, const Dtype* data_im,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col) {
+  int d_temp[num_axes];  // NOLINT(runtime/arrays)
+  int d_iter[num_axes];  // NOLINT(runtime/arrays)
+  int i;
+  CUDA_KERNEL_LOOP(index, n) {
+    // Initialize channel_in, computed in the loop below, with intermediate
+    // computations used to compute the spatial indices.
+    int channel_in = index;
+    int channel_out = 1;
+    for (i = num_axes - 1; i >= 0; --i) {
+      d_temp[i] = channel_in % col_shape[i + 1];
+      channel_in /= col_shape[i + 1];
+      channel_out *= kernel_shape[i];
+    }
+    channel_out *= channel_in;
+    int data_col_inc = 1;
+    for (i = 0; i < num_axes; ++i) {
+      channel_out *= col_shape[i + 1];
+      channel_out += d_temp[i];
+      d_temp[i] = d_temp[i] * stride[i] - pad[i];
+      channel_in *= im_shape[i + 1];
+      channel_in += d_temp[i];
+      data_col_inc *= col_shape[i + 1];
+      d_iter[i] = 0;
+    }
+    Dtype* data_col_ptr = data_col + channel_out;
+    const Dtype* data_im_ptr = data_im + channel_in;
+    bool incremented;
+    do {
+      bool in_range = true;
+      for (i = 0; i < num_axes; ++i) {
+        const int d_iter_im = d_iter[i] + d_temp[i];
+        in_range &= d_iter_im >= 0 && d_iter_im < im_shape[i + 1];
+        if (!in_range) { break; }
+      }
+      if (in_range) {
+        int data_im_offset = d_iter[0];
+        for (i = 1; i < num_axes; ++i) {
+          data_im_offset *= im_shape[i + 1];
+          data_im_offset += d_iter[i];
+        }
+        *data_col_ptr = data_im_ptr[data_im_offset];
+      } else {
+        *data_col_ptr = 0;
+      }
+      data_col_ptr += data_col_inc;
+      incremented = false;
+      for (i = num_axes - 1; i >= 0; --i) {
+        const int d_max = kernel_shape[i];
+        if (d_iter[i] == d_max - 1) {
+          d_iter[i] = 0;
+        } else {  // d_iter[i] < d_max - 1
+          ++d_iter[i];
+          incremented = true;
+          break;
+        }
+      }  // for (int i = num_axes - 1; i >= 0; --i)
+    } while (incremented);  // do
+  }  // CUDA_KERNEL_LOOP(index, n)
+}
+
+template <typename Dtype>
+void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
+    const int num_kernels, const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_col) {
+  switch (num_spatial_axes) {
+  case 1:
+    im2col_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 2:
+    im2col_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 3:
+    im2col_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 4:
+    im2col_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 5:
+    im2col_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 6:
+    im2col_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 7:
+    im2col_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 8:
+    im2col_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 9:
+    im2col_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  case 10:
+    im2col_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>>(
+        num_kernels, data_im, im_shape, col_shape,
+        kernel_shape, pad, stride, data_col);
+    break;
+  default:
+    LOG(FATAL) << "im2col_nd_gpu does not support computation with "
+               << num_spatial_axes << " spatial axes";
+  }
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void im2col_nd_gpu<float>(const float* data_im,
+    const int num_spatial_axes, const int num_kernels,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    float* data_col);
+template void im2col_nd_gpu<double>(const double* data_im,
+    const int num_spatial_axes, const int num_kernels,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    double* data_col);
+
 template <typename Dtype>
 __global__ void col2im_gpu_kernel(const int n, const Dtype* data_col,
     const int height, const int width, const int channels,
@@ -141,4 +290,159 @@ template void col2im_gpu<double>(const double* data_col, const int channels,
     const int pad_h, const int pad_w, const int stride_h,
     const int stride_w, double* data_im);
 
+template <typename Dtype, int num_axes>
+__global__ void col2im_nd_gpu_kernel(const int n, const Dtype* data_col,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_im) {
+  int d_im[num_axes];  // NOLINT(runtime/arrays)
+  int d_col_iter[num_axes];  // NOLINT(runtime/arrays)
+  int d_col_start[num_axes];  // NOLINT(runtime/arrays)
+  int d_col_end[num_axes];  // NOLINT(runtime/arrays)
+  CUDA_KERNEL_LOOP(index, n) {
+    // Initialize channel_im, computed in the loop below, with intermediate
+    // computations used to compute the spatial indices.
+    int channel_im = index;
+    // Calculate d_im (image dimensions).
+    for (int i = num_axes - 1; i >= 0; --i) {
+      d_im[i] = channel_im % im_shape[i + 1] + pad[i];
+      channel_im /= im_shape[i + 1];
+    }
+    // Calculate col start/end indices.
+    bool done = false;
+    for (int i = 0; i < num_axes; ++i) {
+      d_col_start[i] = d_col_iter[i] =
+          (d_im[i] < kernel_shape[i]) ?
+          0 : (d_im[i] - kernel_shape[i]) / stride[i] + 1;
+      d_col_end[i] = min(d_im[i] / stride[i] + 1, col_shape[i + 1]);
+      if (d_col_start[i] >= d_col_end[i]) {
+        // Skip computation if the dimension is 0 at any spatial axis --
+        // final val will be 0.
+        data_im[index] = 0;
+        done = true;
+        break;  // for (int i = 0; i < num_axes; ++i)
+      }
+    }
+    if (done) {
+      continue;  // CUDA_KERNEL_LOOP(index, n)
+    }
+    // Loop over the col to compute the output val.
+    Dtype val = 0;
+    bool incremented = true;
+    do {
+      // Compute the final offset.
+      int final_offset = 0;
+      int kernel_shape_prod = 1;
+      for (int i = num_axes - 1; i >= 0; --i) {
+        final_offset +=
+            (d_im[i] - d_col_iter[i] * stride[i]) * kernel_shape_prod;
+        kernel_shape_prod *= kernel_shape[i];
+      }
+      final_offset += kernel_shape_prod * channel_im;
+      for (int i = 0; i < num_axes; ++i) {
+        final_offset *= col_shape[i + 1];
+        final_offset += d_col_iter[i];
+      }
+      val += data_col[final_offset];
+      incremented = false;
+      for (int i = num_axes - 1; i >= 0; --i) {
+        const int d_max = d_col_end[i];
+        if (d_col_iter[i] == d_max - 1) {
+          d_col_iter[i] = d_col_start[i];
+        } else {  // d_col_iter[i] < d_max - 1
+          ++d_col_iter[i];
+          incremented = true;
+          break;  // for (int i = num_axes - 1; i >= 0; --i)
+        }
+      }  // for (int i = num_axes - 1; i >= 0; --i)
+    }  while (incremented);
+    data_im[index] = val;
+  }  // CUDA_KERNEL_LOOP(index, n)
+}
+
+template <typename Dtype>
+void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
+    const int im_size, const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    Dtype* data_im) {
+  switch (num_spatial_axes) {
+  case 1:
+    col2im_nd_gpu_kernel<Dtype, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 2:
+    col2im_nd_gpu_kernel<Dtype, 2>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 3:
+    col2im_nd_gpu_kernel<Dtype, 3>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 4:
+    col2im_nd_gpu_kernel<Dtype, 4>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 5:
+    col2im_nd_gpu_kernel<Dtype, 5>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 6:
+    col2im_nd_gpu_kernel<Dtype, 6>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 7:
+    col2im_nd_gpu_kernel<Dtype, 7>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 8:
+    col2im_nd_gpu_kernel<Dtype, 8>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 9:
+    col2im_nd_gpu_kernel<Dtype, 9>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  case 10:
+    col2im_nd_gpu_kernel<Dtype, 10>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(im_size), CAFFE_CUDA_NUM_THREADS>>>(
+          im_size, data_col, im_shape, col_shape,
+          kernel_shape, pad, stride, data_im);
+    break;
+  default:
+    LOG(FATAL) << "col2im_nd_gpu does not support computation with "
+               << num_spatial_axes << " spatial axes";
+  }
+  CUDA_POST_KERNEL_CHECK;
+}
+
+// Explicit instantiation
+template void col2im_nd_gpu<float>(const float* data_col,
+    const int num_spatial_axes, const int im_size,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    float* data_im);
+template void col2im_nd_gpu<double>(const double* data_col,
+    const int num_spatial_axes, const int im_size,
+    const int* im_shape, const int* col_shape,
+    const int* kernel_shape, const int* pad, const int* stride,
+    double* data_im);
+
 }  // namespace caffe
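
The long switch statements above exist because the kernels keep per-axis counters in fixed-size local arrays, so the axis count has to be a template parameter rather than a runtime argument. A stripped-down host-side sketch of the same dispatch pattern (names hypothetical):

    #include <iostream>

    // Compile-time axis count lets the body use plain local arrays instead
    // of dynamic allocation, mirroring im2col_nd_gpu_kernel above.
    template <int num_axes>
    void process_nd(const int* shape) {
      int counters[num_axes];
      for (int i = 0; i < num_axes; ++i) { counters[i] = shape[i]; }
      std::cout << "processed " << num_axes << " axes\n";
    }

    void process_dispatch(const int num_spatial_axes, const int* shape) {
      switch (num_spatial_axes) {
      case 1: process_nd<1>(shape); break;
      case 2: process_nd<2>(shape); break;
      case 3: process_nd<3>(shape); break;
      default:
        std::cerr << "unsupported number of spatial axes: "
                  << num_spatial_axes << "\n";
      }
    }
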
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 6f03314..f2b1dd9 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -3,9 +3,11 @@
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/text_format.h>
 #include <opencv2/core/core.hpp>
+#ifdef USE_OPENCV
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/highgui/highgui_c.h>
 #include <opencv2/imgproc/imgproc.hpp>
+#endif  // USE_OPENCV
 #include <stdint.h>
 
 #include <algorithm>
@@ -67,6 +69,7 @@ void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
   CHECK(proto.SerializeToOstream(&output));
 }
 
+#ifdef USE_OPENCV
 cv::Mat ReadImageToCVMat(const string& filename,
     const int height, const int width, const bool is_color) {
   cv::Mat cv_img;
@@ -98,6 +101,7 @@ cv::Mat ReadImageToCVMat(const string& filename,
 cv::Mat ReadImageToCVMat(const string& filename) {
   return ReadImageToCVMat(filename, 0, 0, true);
 }
+
 // Do the file extension and encoding match?
 static bool matchExt(const std::string & fn,
                      std::string en) {
@@ -111,6 +115,7 @@ static bool matchExt(const std::string & fn,
     return true;
   return false;
 }
+
 bool ReadImageToDatum(const string& filename, const int label,
     const int height, const int width, const bool is_color,
     const std::string & encoding, Datum* datum) {
@@ -135,6 +140,7 @@ bool ReadImageToDatum(const string& filename, const int label,
     return false;
   }
 }
+#endif  // USE_OPENCV
 
 bool ReadFileToDatum(const string& filename, const int label,
     Datum* datum) {
@@ -156,6 +162,7 @@ bool ReadFileToDatum(const string& filename, const int label,
   }
 }
 
+#ifdef USE_OPENCV
 cv::Mat DecodeDatumToCVMatNative(const Datum& datum) {
   cv::Mat cv_img;
   CHECK(datum.encoded()) << "Datum not encoded";
@@ -227,6 +234,5 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
   }
   datum->set_data(buffer);
 }
-
-
+#endif  // USE_OPENCV
 }  // namespace caffe
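
The image readers above are now compiled only when USE_OPENCV is defined, so callers guard their use the same way. A minimal sketch; the image path and target size are hypothetical:

    #include "caffe/util/io.hpp"

    int main() {
    #ifdef USE_OPENCV
      // Load and resize to 227x227 in color; an empty Mat signals failure.
      cv::Mat img = caffe::ReadImageToCVMat("cat.jpg", 227, 227, true);
      return img.empty() ? 1 : 0;
    #else
      return 0;  // built without OpenCV support
    #endif
    }
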
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 92e5cf5..ac379e5 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -193,7 +193,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
     if (v0_layer_param.has_pad()) {
       if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_pad(v0_layer_param.pad());
+        layer_param->mutable_convolution_param()->add_pad(v0_layer_param.pad());
       } else if (type == "pool") {
         layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
       } else {
@@ -203,7 +203,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
     if (v0_layer_param.has_kernelsize()) {
       if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_kernel_size(
+        layer_param->mutable_convolution_param()->add_kernel_size(
             v0_layer_param.kernelsize());
       } else if (type == "pool") {
         layer_param->mutable_pooling_param()->set_kernel_size(
@@ -224,7 +224,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     }
     if (v0_layer_param.has_stride()) {
       if (type == "conv") {
-        layer_param->mutable_convolution_param()->set_stride(
+        layer_param->mutable_convolution_param()->add_stride(
             v0_layer_param.stride());
       } else if (type == "pool") {
         layer_param->mutable_pooling_param()->set_stride(
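
The set_* to add_* switches above follow from kernel_size, pad, and stride becoming repeated fields on ConvolutionParameter (allowing per-axis values for N-D convolution), so the V0 upgrade path now appends a value instead of setting a scalar. A minimal sketch of the new field usage:

    #include "caffe/proto/caffe.pb.h"

    void fill_conv_param(caffe::ConvolutionParameter* conv) {
      // Repeated fields: a single appended value is shared across all
      // spatial axes, or one value can be appended per axis.
      conv->add_kernel_size(3);
      conv->add_pad(1);
      conv->add_stride(1);
    }
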
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index ff63860..e3f684b 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -174,6 +174,7 @@ int train() {
   vector<int> gpus;
   get_gpus(&gpus);
   if (gpus.size() == 0) {
+    LOG(INFO) << "Use CPU.";
     Caffe::set_mode(Caffe::CPU);
   } else {
     ostringstream s;
diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp
index b1fc7ca..2035d51 100644
--- a/tools/compute_image_mean.cpp
+++ b/tools/compute_image_mean.cpp
@@ -24,6 +24,7 @@ DEFINE_string(backend, "lmdb",
 int main(int argc, char** argv) {
   ::google::InitGoogleLogging(argv[0]);
 
+#ifdef USE_OPENCV
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
@@ -115,5 +116,8 @@ int main(int argc, char** argv) {
     }
     LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim;
   }
+#else
+  LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV.";
+#endif  // USE_OPENCV
   return 0;
 }
diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp
index aad1f1f..e51a263 100644
--- a/tools/convert_imageset.cpp
+++ b/tools/convert_imageset.cpp
@@ -43,6 +43,7 @@ DEFINE_string(encode_type, "",
     "Optional: What type should we encode the image as ('png','jpg',...).");
 
 int main(int argc, char** argv) {
+#ifdef USE_OPENCV
   ::google::InitGoogleLogging(argv[0]);
   // Print output to stderr (while still logging)
   FLAGS_alsologtostderr = 1;
@@ -150,5 +151,8 @@ int main(int argc, char** argv) {
     txn->Commit();
     LOG(INFO) << "Processed " << count << " files.";
   }
+#else
+  LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV.";
+#endif  // USE_OPENCV
   return 0;
 }
diff --git a/tools/extract_features.cpp b/tools/extract_features.cpp
index 365dd49..084c9bf 100644
--- a/tools/extract_features.cpp
+++ b/tools/extract_features.cpp
@@ -42,7 +42,7 @@ int feature_extraction_pipeline(int argc, char** argv) {
     "  save_feature_dataset_name1[,name2,...]  num_mini_batches  db_type"
     "  [CPU/GPU] [DEVICE_ID=0]\n"
     "Note: you can extract multiple features in one pass by specifying"
-    " multiple feature blob names and dataset names seperated by ','."
+    " multiple feature blob names and dataset names separated by ','."
     " The names cannot contain white space characters and the number of blobs"
     " and datasets must be equal.";
     return 1;
