[caffe-contrib] 01/14: New upstream version 1.0.0~rc4

Zhou Mo cdluminate-guest at moszumanska.debian.org
Mon Jan 23 00:54:47 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe-contrib.

commit 185e2d5fac0a682d973a92c2a2b36e02bed07c66
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Sun Jan 22 01:53:14 2017 +0000

    New upstream version 1.0.0~rc4
---
 .gitignore                                      |   1 +
 CMakeLists.txt                                  |   1 +
 LICENSE                                         |   4 +-
 Makefile                                        |   6 +
 Makefile.config.example                         |   6 +-
 cmake/ConfigGen.cmake                           |   2 +-
 cmake/Cuda.cmake                                |   8 +-
 cmake/Dependencies.cmake                        |  15 +-
 cmake/External/glog.cmake                       |   1 +
 cmake/Modules/FindNCCL.cmake                    |  26 ++
 cmake/Summary.cmake                             |   1 +
 cmake/Targets.cmake                             |   6 +-
 docs/install_apt.md                             |   4 +-
 docs/install_apt_debian.md                      | 155 +++++++
 docs/installation.md                            |   1 +
 docs/tutorial/layers.md                         | 562 ++++--------------------
 docs/tutorial/layers/absval.md                  |  22 +
 docs/tutorial/layers/accuracy.md                |  21 +
 docs/tutorial/layers/argmax.md                  |  19 +
 docs/tutorial/layers/batchnorm.md               |  20 +
 docs/tutorial/layers/batchreindex.md            |  16 +
 docs/tutorial/layers/bias.md                    |  19 +
 docs/tutorial/layers/bnll.md                    |  25 ++
 docs/tutorial/layers/concat.md                  |  40 ++
 docs/tutorial/layers/contrastiveloss.md         |  20 +
 docs/tutorial/layers/convolution.md             |  63 +++
 docs/tutorial/layers/crop.md                    |  20 +
 docs/tutorial/layers/data.md                    |  29 ++
 docs/tutorial/layers/deconvolution.md           |  22 +
 docs/tutorial/layers/dropout.md                 |  20 +
 docs/tutorial/layers/dummydata.md               |  20 +
 docs/tutorial/layers/eltwise.md                 |  20 +
 docs/tutorial/layers/elu.md                     |  25 ++
 docs/tutorial/layers/embed.md                   |  20 +
 docs/tutorial/layers/euclideanloss.md           |  16 +
 docs/tutorial/layers/exp.md                     |  24 +
 docs/tutorial/layers/filter.md                  |  15 +
 docs/tutorial/layers/flatten.md                 |  21 +
 docs/tutorial/layers/hdf5data.md                |  20 +
 docs/tutorial/layers/hdf5output.md              |  25 ++
 docs/tutorial/layers/hingeloss.md               |  19 +
 docs/tutorial/layers/im2col.md                  |  16 +
 docs/tutorial/layers/imagedata.md               |  27 ++
 docs/tutorial/layers/infogainloss.md            |  24 +
 docs/tutorial/layers/innerproduct.md            |  59 +++
 docs/tutorial/layers/input.md                   |  19 +
 docs/tutorial/layers/log.md                     |  20 +
 docs/tutorial/layers/lrn.md                     |  28 ++
 docs/tutorial/layers/lstm.md                    |  21 +
 docs/tutorial/layers/memorydata.md              |  25 ++
 docs/tutorial/layers/multinomiallogisticloss.md |  19 +
 docs/tutorial/layers/mvn.md                     |  20 +
 docs/tutorial/layers/parameter.md               |  21 +
 docs/tutorial/layers/pooling.md                 |  47 ++
 docs/tutorial/layers/power.md                   |  46 ++
 docs/tutorial/layers/prelu.md                   |  20 +
 docs/tutorial/layers/python.md                  |  27 ++
 docs/tutorial/layers/recurrent.md               |  20 +
 docs/tutorial/layers/reduction.md               |  20 +
 docs/tutorial/layers/relu.md                    |  32 ++
 docs/tutorial/layers/reshape.md                 |  51 +++
 docs/tutorial/layers/rnn.md                     |  19 +
 docs/tutorial/layers/scale.md                   |  20 +
 docs/tutorial/layers/sigmoid.md                 |  20 +
 docs/tutorial/layers/sigmoidcrossentropyloss.md |  13 +
 docs/tutorial/layers/silence.md                 |  23 +
 docs/tutorial/layers/slice.md                   |  42 ++
 docs/tutorial/layers/softmax.md                 |  24 +
 docs/tutorial/layers/softmaxwithloss.md         |  33 ++
 docs/tutorial/layers/split.md                   |  17 +
 docs/tutorial/layers/spp.md                     |  20 +
 docs/tutorial/layers/tanh.md                    |  18 +
 docs/tutorial/layers/threshold.md               |  18 +
 docs/tutorial/layers/tile.md                    |  20 +
 docs/tutorial/layers/windowdata.md              |  19 +
 examples/02-fine-tuning.ipynb                   |   4 +-
 examples/CMakeLists.txt                         |   2 +-
 include/caffe/blob.hpp                          |   1 +
 include/caffe/common.hpp                        |  14 +-
 include/caffe/data_reader.hpp                   |  82 ----
 include/caffe/internal_thread.hpp               |   4 +-
 include/caffe/layer.hpp                         |  43 +-
 include/caffe/layers/base_data_layer.hpp        |   6 +-
 include/caffe/layers/data_layer.hpp             |   7 +-
 include/caffe/layers/hdf5_data_layer.hpp        |   6 +-
 include/caffe/layers/python_layer.hpp           |   4 +-
 include/caffe/net.hpp                           |  40 +-
 include/caffe/parallel.hpp                      |  96 ++--
 include/caffe/solver.hpp                        |  40 +-
 include/caffe/syncedmem.hpp                     |  26 +-
 include/caffe/util/db_leveldb.hpp               |   5 +-
 include/caffe/util/math_functions.hpp           |   5 +
 include/caffe/util/nccl.hpp                     |  37 ++
 models/bvlc_googlenet/train_val.prototxt        |   2 +-
 python/caffe/__init__.py                        |   4 +-
 python/caffe/_caffe.cpp                         | 113 ++++-
 python/caffe/pycaffe.py                         |   2 +-
 python/caffe/test/test_net.py                   |  18 +-
 python/train.py                                 | 100 +++++
 scripts/build_docs.sh                           |   3 +
 scripts/download_model_binary.py                |   5 +-
 scripts/split_caffe_proto.py                    |  35 ++
 src/caffe/blob.cpp                              |  18 +
 src/caffe/common.cpp                            |   5 +-
 src/caffe/data_reader.cpp                       | 119 -----
 src/caffe/data_transformer.cpp                  |   2 +-
 src/caffe/internal_thread.cpp                   |  10 +-
 src/caffe/layer.cpp                             |  20 -
 src/caffe/layers/base_data_layer.cpp            |  44 +-
 src/caffe/layers/base_data_layer.cu             |  21 +-
 src/caffe/layers/crop_layer.cpp                 |  40 +-
 src/caffe/layers/crop_layer.cu                  |  22 +-
 src/caffe/layers/data_layer.cpp                 |  82 ++--
 src/caffe/layers/hdf5_data_layer.cpp            |  55 ++-
 src/caffe/layers/hdf5_data_layer.cu             |  22 +-
 src/caffe/layers/image_data_layer.cpp           |  13 +-
 src/caffe/layers/window_data_layer.cpp          |   8 +-
 src/caffe/net.cpp                               |  47 +-
 src/caffe/parallel.cpp                          | 514 ++++++++++------------
 src/caffe/proto/caffe.proto                     |   9 +-
 src/caffe/solver.cpp                            |  44 +-
 src/caffe/solvers/adagrad_solver.cpp            |   1 -
 src/caffe/solvers/nesterov_solver.cpp           |   1 -
 src/caffe/solvers/sgd_solver.cpp                |   4 +-
 src/caffe/syncedmem.cpp                         |  59 ++-
 src/caffe/test/test_data_layer.cpp              |  36 ++
 src/caffe/test/test_gradient_based_solver.cpp   |  34 +-
 src/caffe/test/test_hdf5data_layer.cpp          |  30 ++
 src/caffe/test/test_inner_product_layer.cpp     |  10 +-
 src/caffe/util/blocking_queue.cpp               |   5 -
 src/caffe/util/db_lmdb.cpp                      |   2 +-
 src/caffe/util/hdf5.cpp                         |   4 +-
 src/caffe/util/math_functions.cu                |  20 +
 src/caffe/util/upgrade_proto.cpp                |   8 +-
 tools/caffe.cpp                                 |  11 +-
 tools/extra/extract_seconds.py                  |   8 +
 tools/extra/parse_log.py                        |   7 +
 137 files changed, 2856 insertions(+), 1431 deletions(-)

diff --git a/.gitignore b/.gitignore
index 281ef32..eff292b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,6 +84,7 @@ cmake_build
 
 # Generated documentation
 docs/_site
+docs/_includes
 docs/gathered
 _site
 doxygen
diff --git a/CMakeLists.txt b/CMakeLists.txt
index da7142c..3af394f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ include(cmake/ConfigGen.cmake)
 # ---[ Options
 caffe_option(CPU_ONLY  "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
 caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
+caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
 set(python_version "2" CACHE STRING "Specify which Python version to use")
diff --git a/LICENSE b/LICENSE
index d69d16f..0c99adc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,11 +1,11 @@
 COPYRIGHT
 
 All contributions by the University of California:
-Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 All rights reserved.
 
 All other contributions:
-Copyright (c) 2014, 2015, the respective contributors
+Copyright (c) 2014-2017, the respective contributors
 All rights reserved.
 
 Caffe uses a shared copyright model: each contributor holds copyright over
diff --git a/Makefile b/Makefile
index ccc4d8b..65d08f7 100644
--- a/Makefile
+++ b/Makefile
@@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1)
 	COMMON_FLAGS += -DUSE_CUDNN
 endif
 
+# NCCL acceleration configuration
+ifeq ($(USE_NCCL), 1)
+	LIBRARIES += nccl
+	COMMON_FLAGS += -DUSE_NCCL
+endif
+
 # configure IO libraries
 ifeq ($(USE_OPENCV), 1)
 	COMMON_FLAGS += -DUSE_OPENCV
diff --git a/Makefile.config.example b/Makefile.config.example
index 07bed63..b590bd1 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -68,7 +68,7 @@ PYTHON_INCLUDE := /usr/include/python2.7 \
 # ANACONDA_HOME := $(HOME)/anaconda
 # PYTHON_INCLUDE := $(ANACONDA_HOME)/include \
 		# $(ANACONDA_HOME)/include/python2.7 \
-		# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include \
+		# $(ANACONDA_HOME)/lib/python2.7/site-packages/numpy/core/include
 
 # Uncomment to use Python 3 (default is Python 2)
 # PYTHON_LIBRARIES := boost_python3 python3.5m
@@ -94,6 +94,10 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
 # INCLUDE_DIRS += $(shell brew --prefix)/include
 # LIBRARY_DIRS += $(shell brew --prefix)/lib
 
+# NCCL acceleration switch (uncomment to build with NCCL)
+# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0)
+# USE_NCCL := 1
+
 # Uncomment to use `pkg-config` to specify OpenCV library paths.
 # (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
 # USE_PKG_CONFIG := 1
diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
index 0563711..fd9dd2d 100644
--- a/cmake/ConfigGen.cmake
+++ b/cmake/ConfigGen.cmake
@@ -109,7 +109,7 @@ function(caffe_generate_export_configs)
 
   # ---[ Configure and install version file ]---
 
-  # TODO: Lines below are commented because Caffe does't declare its version in headers.
+  # TODO: Lines below are commented because Caffe doesn't declare its version in headers.
   # When the declarations are added, modify `caffe_extract_caffe_version()` macro and uncomment
 
   # configure_file(cmake/Templates/CaffeConfigVersion.cmake.in "${PROJECT_BINARY_DIR}/CaffeConfigVersion.cmake" @ONLY)
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index eeeb732..0fbf301 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -4,7 +4,7 @@ endif()
 
 # Known NVIDIA GPU achitectures Caffe can be compiled for.
 # This list will be used for CUDA_ARCH_NAME = All option
-set(Caffe_known_gpu_archs "20 21(20) 30 35 50")
+set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61")
 
 ################################################################################################
 # A function for automatic detection of GPUs installed  (if autodetection is enabled)
@@ -56,7 +56,7 @@ endfunction()
 #   caffe_select_nvcc_arch_flags(out_variable)
 function(caffe_select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(__archs_names "Fermi" "Kepler" "Maxwell" "All" "Manual")
+  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "All" "Manual")
   set(__archs_name_default "All")
   if(NOT CMAKE_CROSSCOMPILING)
     list(APPEND __archs_names "Auto")
@@ -89,6 +89,8 @@ function(caffe_select_nvcc_arch_flags out_variable)
     set(__cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(__cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(__cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(__cuda_arch_bin ${Caffe_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -282,7 +284,7 @@ mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
 if(APPLE)
   caffe_detect_darwin_version(OSX_VERSION)
 
-  # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits
+  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
   if(OSX_VERSION VERSION_GREATER 10.8)
     # enabled by default if and only if CUDA version is less than 7.0
     caffe_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index ae9ce8e..ba28a12 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -67,6 +67,13 @@ if(NOT HAVE_CUDA)
   add_definitions(-DCPU_ONLY)
 endif()
 
+if(USE_NCCL)
+  find_package(NCCL REQUIRED)
+  include_directories(SYSTEM ${NCCL_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
+  add_definitions(-DUSE_NCCL)
+endif()
+
 # ---[ OpenCV
 if(USE_OPENCV)
   find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
@@ -119,18 +126,18 @@ if(BUILD_python)
     find_package(NumPy 1.7.1)
     # Find the matching boost python implementation
     set(version ${PYTHONLIBS_VERSION_STRING})
-    
+
     STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
     find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
     set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
-    
+
     while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND)
       STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} )
-      
+
       STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
       find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
       set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
-      
+
       STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} )
       if("${has_more_version}" STREQUAL "")
         break()
diff --git a/cmake/External/glog.cmake b/cmake/External/glog.cmake
index a44672f..f9d0549 100644
--- a/cmake/External/glog.cmake
+++ b/cmake/External/glog.cmake
@@ -37,6 +37,7 @@ if (NOT __GLOG_INCLUDED)
       GIT_TAG "v0.3.4"
       UPDATE_COMMAND ""
       INSTALL_DIR ${gflags_INSTALL}
+      PATCH_COMMAND autoreconf -i ${glog_PREFIX}/src/glog
       CONFIGURE_COMMAND env "CFLAGS=${GLOG_C_FLAGS}" "CXXFLAGS=${GLOG_CXX_FLAGS}" ${glog_PREFIX}/src/glog/configure --prefix=${glog_INSTALL} --enable-shared=no --enable-static=yes --with-gflags=${GFLAGS_LIBRARY_DIRS}/..
       LOG_DOWNLOAD 1
       LOG_CONFIGURE 1
diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake
new file mode 100644
index 0000000..c884593
--- /dev/null
+++ b/cmake/Modules/FindNCCL.cmake
@@ -0,0 +1,26 @@
+set(NCCL_INC_PATHS
+    /usr/include
+    /usr/local/include
+    $ENV{NCCL_DIR}/include
+    )
+
+set(NCCL_LIB_PATHS
+    /lib
+    /lib64
+    /usr/lib
+    /usr/lib64
+    /usr/local/lib
+    /usr/local/lib64
+    $ENV{NCCL_DIR}/lib
+    )
+
+find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
+find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+
+if (NCCL_FOUND)
+  message(STATUS "Found NCCL    (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+endif ()
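A minimal sketch of how the NCCL hooks introduced above are meant to be switched on, for
either build system; the `/opt/nccl` install prefix is only an assumed example, not part
of these changes:

    # Make build: uncomment USE_NCCL in Makefile.config (copied from Makefile.config.example), then rebuild
    sed -i 's/^# USE_NCCL := 1/USE_NCCL := 1/' Makefile.config
    make all -j4

    # CMake build: USE_NCCL is a regular option, and FindNCCL.cmake also honours the
    # NCCL_DIR environment variable when NCCL is installed outside the default paths
    mkdir -p build && cd build
    NCCL_DIR=/opt/nccl cmake -DUSE_NCCL=ON ..
    make -j4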
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index ba025cf..ed8c252 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -117,6 +117,7 @@ function(caffe_print_configuration_summary)
   caffe_status("  USE_OPENCV        :   ${USE_OPENCV}")
   caffe_status("  USE_LEVELDB       :   ${USE_LEVELDB}")
   caffe_status("  USE_LMDB          :   ${USE_LMDB}")
+  caffe_status("  USE_NCCL          :   ${USE_NCCL}")
   caffe_status("  ALLOW_LMDB_NOLOCK :   ${ALLOW_LMDB_NOLOCK}")
   caffe_status("")
   caffe_status("Dependencies:")
diff --git a/cmake/Targets.cmake b/cmake/Targets.cmake
index 2cb1158..090f86c 100644
--- a/cmake/Targets.cmake
+++ b/cmake/Targets.cmake
@@ -88,7 +88,7 @@ function(caffe_pickup_caffe_sources root)
   file(GLOB_RECURSE proto_files ${root}/src/caffe/*.proto)
   list(APPEND srcs ${proto_files})
 
-  # convet to absolute paths
+  # convert to absolute paths
   caffe_convert_absolute_paths(srcs)
   caffe_convert_absolute_paths(cuda)
   caffe_convert_absolute_paths(test_srcs)
@@ -102,7 +102,7 @@ function(caffe_pickup_caffe_sources root)
 endfunction()
 
 ################################################################################################
-# Short command for setting defeault target properties
+# Short command for setting default target properties
 # Usage:
 #   caffe_default_properties(<target>)
 function(caffe_default_properties target)
@@ -111,7 +111,7 @@ function(caffe_default_properties target)
     ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
     LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
     RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
-  # make sure we build all external depepdencies first
+  # make sure we build all external dependencies first
   if (DEFINED external_project_dependencies)
     add_dependencies(${target} ${external_project_dependencies})
   endif()
diff --git a/docs/install_apt.md b/docs/install_apt.md
index e95b022..bc1566b 100644
--- a/docs/install_apt.md
+++ b/docs/install_apt.md
@@ -33,8 +33,8 @@ Everything is packaged in 14.04.
 These dependencies need manual installation in 12.04.
 
     # glog
-    wget https://google-glog.googlecode.com/files/glog-0.3.3.tar.gz
-    tar zxvf glog-0.3.3.tar.gz
+    wget https://github.com/google/glog/archive/v0.3.3.tar.gz
+    tar zxvf v0.3.3.tar.gz
     cd glog-0.3.3
     ./configure
     make && make install
diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
new file mode 100644
index 0000000..0d39e3a
--- /dev/null
+++ b/docs/install_apt_debian.md
@@ -0,0 +1,155 @@
+---
+title: "Installation: Debian"
+---
+
+# Debian Installation
+
+Caffe packages are available for several Debian versions, as shown in the
+following chart:
+
+```
+Your Distro     |  CPU_ONLY  |  CUDA  |     Alias
+----------------+------------+--------+-------------------
+Debian/stable   |     ✘      |   ✘    | Debian Jessie
+Debian/testing  |     ✔      |   ☐    | Debian Stretch/Sid
+Debian/unstable |     ✔      |   ✔    | Debian Sid
+```
+
+* `✘ ` You should take a look at the [Ubuntu installation instructions](install_apt.html).
+
+* `✔ ` You can install Caffe with a single command by following this guide.
+
+* `☐ ` The same as `✔ `. However, it will no longer work once Debian/Stretch becomes the stable branch.
+
+Last update: 2017-01-05
+
+## Binary installation with APT
+
+Apart from the installation methods based on source, Debian/unstable
+and Debian/testing users can install pre-compiled Caffe packages via the official archive.
+
+Make sure that there is something like the following in your `/etc/apt/sources.list`:
+```
+deb http://MIRROR/debian CODENAME main contrib non-free
+```
+where `MIRROR` is your favorite Debian mirror, and `CODENAME ∈ {testing,stretch,sid}`.
+
+Then update the APT cache and install Caffe directly. Note that the CPU version
+and the CUDA version cannot be installed at the same time.
+```
+# apt update
+# apt install [ caffe-cpu | caffe-cuda ]
+# caffe                                              # command line interface working
+# python3 -c 'import caffe; print(caffe.__path__)'   # python3 interface working
+```
+It should work out of the box.
+
+#### Customizing caffe packages
+
+Some users may need to customize the Caffe package. A full treatment of package
+customization is beyond the scope of this guide; what follows is only a brief
+outline of producing customized `.deb` packages.
+
+Make sure that there is something like this in your `/etc/apt/sources.list`:
+```
+deb http://ftp2.cn.debian.org/debian sid main contrib non-free
+deb-src http://ftp2.cn.debian.org/debian sid main contrib non-free
+```
+
+Then build the Caffe deb files with the following commands:
+```
+$ sudo apt update
+$ sudo apt install build-essential debhelper devscripts    # standard package building tools
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ]            # the most elegant way to pull caffe build dependencies
+$ apt source [ caffe-cpu | caffe-cuda ]               # download the source tarball and extract
+$ cd caffe-XXXX
+[ ... optional, customize caffe code/build ... ]
+$ dch -llocal "Modified XXX in order to XXX"          # write your one-line changelog
+$ debuild -B -j4                                      # build caffe with 4 parallel jobs (similar to make -j4)
+[ ... building ...]
+$ debc                                                # optional, if you want to check the package contents
+$ sudo debi                                           # optional, install the generated packages
+```
+The resulting deb packages can be found under the parent directory of the source tree.
+
+Note that the `dch ...` command above bumps the package version number
+and adds an entry to the package changelog. If you would like to write
+more than one changelog entry, use subsequent `dch` commands (see `man 1 dch`)
+instead of manually modifying `debian/changelog`, unless you know how to keep its format correct.
+The changelog will be installed at e.g. `/usr/share/doc/caffe-cpu/changelog.Debian.gz`.
+
+## Source installation
+
+Source installation under Debian/unstable is similar to that of Ubuntu, but
+here is a more elegant way to pull caffe build dependencies:
+```
+$ sudo apt build-dep [ caffe-cpu | caffe-cuda ]
+```
+Note that this requires a `deb-src` entry in your `/etc/apt/sources.list`.
+
+#### Compiler Combinations
+
+Some users may find their favorite compiler doesn't work well with CUDA.
+```
+CXX compiler |  CUDA 7.5  |  CUDA 8.0  |
+-------------+------------+------------+-
+GCC-7        |     ?      |     ?      |
+GCC-6        |     ✘      |     ✘      |
+GCC-5        |     ✔ [1]  |     ✔      |
+CLANG-4.0    |     ?      |     ?      |
+CLANG-3.9    |     ✘      |     ✘      |
+CLANG-3.8    |     ?      |     ✔      |
+```
+
+`[1]` CUDA 7.5's `host_config.h` must be patched before it works with GCC-5.
+
+Also, avoid the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's.
+Linking GCC-4.X object files against GCC-5 libraries may fail.
+(See https://wiki.debian.org/GCC5 )
+
+## Notes
+
+* Consider re-compiling OpenBLAS locally with optimization flags for the sake of
+performance (see the rebuild sketch below). This is highly recommended for any kind
+of production use, including academic research.
+
+* If you are installing `caffe-cuda`, APT will automatically pull in some of the
+CUDA packages and the NVIDIA driver packages. Please be careful if you have
+manually installed or modified the NVIDIA driver, the CUDA toolkit, or any other
+related components, because in that case APT may fail.
+
+* Additionally, a manpage (`man caffe`) and a bash completion script
+(`caffe <TAB><TAB>`, `caffe train <TAB><TAB>`) are provided.
+Neither file has been merged into Caffe's master branch yet.
+
+* The Python interface is the Python 3 version: `python3-caffe-{cpu,cuda}`.
+There is no plan to support Python 2.
+
+* If you encounter any problem related to the packaging (e.g. a failure to install `caffe-*`),
+please report a bug to Debian via Debian's bug tracking system. See https://www.debian.org/Bugs/ .
+Patches and suggestions are also welcome.
+
+## FAQ
+
+* Where is caffe-cudnn?
+
+The cuDNN library does not currently appear to be redistributable. If you really want
+caffe-cudnn deb packages, the workaround is to install cuDNN yourself,
+modify the packaging scripts, and then build your customized package.
+
+* I installed the CPU version. How can I switch to the CUDA version?
+
+Run `sudo apt install caffe-cuda`; APT's dependency resolver is smart enough to handle this.
+
+* Where are the examples, models, and other documentation?
+
+```
+sudo apt install caffe-doc
+dpkg -L caffe-doc
+```
+
+* Where can I find the Debian package status?
+
+https://tracker.debian.org/pkg/caffe  (for the CPU_ONLY version)
+
+https://tracker.debian.org/pkg/caffe-contrib  (for the CUDA version)
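Regarding the OpenBLAS rebuild suggested in the notes above, the same
`apt build-dep` / `apt source` / `debuild` workflow shown for Caffe applies. A rough
sketch follows; the `openblas` source package name and the flag-editing step are
assumptions rather than part of the upstream guide:

    $ sudo apt build-dep openblas            # pull OpenBLAS build dependencies
    $ apt source openblas && cd openblas-*   # download and extract the packaged source
    [ ... edit the package build configuration to add your optimization flags ... ]
    $ dch -llocal "Rebuilt with local optimization flags"
    $ debuild -B -j4                         # build the customized packages
    $ sudo debi                              # install the rebuilt packages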
diff --git a/docs/installation.md b/docs/installation.md
index 3254be3..6b2cd3b 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -12,6 +12,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi
 
 - [Docker setup](https://github.com/BVLC/caffe/tree/master/docker) *out-of-the-box brewing*
 - [Ubuntu installation](install_apt.html) *the standard platform*
+- [Debian installation](install_apt_debian.html) *install caffe with a single command*
 - [OS X installation](install_osx.html)
 - [RHEL / CentOS / Fedora installation](install_yum.html)
 - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont*
diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md
index 7362aac..a903d5a 100644
--- a/docs/tutorial/layers.md
+++ b/docs/tutorial/layers.md
@@ -1,186 +1,77 @@
 ---
 title: Layer Catalogue
 ---
+
 # Layers
 
 To create a Caffe model you need to define the model architecture in a protocol buffer definition file (prototxt).
 
 Caffe layers and their parameters are defined in the protocol buffer definitions for the project in [caffe.proto](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto).
 
-### Vision Layers
-
-* Header: `./include/caffe/vision_layers.hpp`
-
-Vision layers usually take *images* as input and produce other *images* as output.
-A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image.
-But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$.
-This 2D geometry naturally lends itself to certain decisions about how to process the input.
-In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output.
-In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$.
-
-
-#### Convolution
-
-* Layer type: `Convolution`
-* CPU implementation: `./src/caffe/layers/convolution_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/convolution_layer.cu`
-* Parameters (`ConvolutionParameter convolution_param`)
-    - Required
-        - `num_output` (`c_o`): the number of filters
-        - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter
-    - Strongly Recommended
-        - `weight_filler` [default `type: 'constant' value: 0`]
-    - Optional
-        - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs
-        - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input
-        - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input
-        - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th output group channels will be only connected to the $$i$$th input group channels.
-* Input
-    - `n * c_i * h_i * w_i`
-* Output
-    - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise.
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
-
-      layer {
-        name: "conv1"
-        type: "Convolution"
-        bottom: "data"
-        top: "conv1"
-        # learning rate and decay multipliers for the filters
-        param { lr_mult: 1 decay_mult: 1 }
-        # learning rate and decay multipliers for the biases
-        param { lr_mult: 2 decay_mult: 0 }
-        convolution_param {
-          num_output: 96     # learn 96 filters
-          kernel_size: 11    # each filter is 11x11
-          stride: 4          # step 4 pixels between each filter application
-          weight_filler {
-            type: "gaussian" # initialize the filters from a Gaussian
-            std: 0.01        # distribution with stdev 0.01 (default mean: 0)
-          }
-          bias_filler {
-            type: "constant" # initialize the biases to zero (0)
-            value: 0
-          }
-        }
-      }
-
-The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image.
-
-#### Pooling
-
-* Layer type: `Pooling`
-* CPU implementation: `./src/caffe/layers/pooling_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/pooling_layer.cu`
-* Parameters (`PoolingParameter pooling_param`)
-    - Required
-        - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter
-    - Optional
-        - `pool` [default MAX]: the pooling method. Currently MAX, AVE, or STOCHASTIC
-        - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input
-        - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input
-* Input
-    - `n * c * h_i * w_i`
-* Output
-    - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution.
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
-
-      layer {
-        name: "pool1"
-        type: "Pooling"
-        bottom: "conv1"
-        top: "pool1"
-        pooling_param {
-          pool: MAX
-          kernel_size: 3 # pool over a 3x3 region
-          stride: 2      # step two pixels (in the bottom blob) between pooling regions
-        }
-      }
-
-#### Local Response Normalization (LRN)
-
-* Layer type: `LRN`
-* CPU Implementation: `./src/caffe/layers/lrn_layer.cpp`
-* CUDA GPU Implementation: `./src/caffe/layers/lrn_layer.cu`
-* Parameters (`LRNParameter lrn_param`)
-    - Optional
-        - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN)
-        - `alpha` [default 1]: the scaling parameter (see below)
-        - `beta` [default 5]: the exponent (see below)
-        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`)
+## Data Layers
 
-The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where [...]
-
-#### im2col
-
-`Im2col` is a helper for doing the image-to-column transformation that you most likely do not need to know about. This is used in Caffe's original convolution to do matrix multiplication by laying out all patches into a matrix.
-
-### Loss Layers
+Data enters Caffe through data layers: they lie at the bottom of nets. Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats.
 
-Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient w.r.t. to the loss is computed by the backward pass.
+Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s in some of the layers.
+The [bias](layers/bias.html), [scale](layers/scale.html), and [crop](layers/crop.html) layers can help with transforming the inputs when `TransformationParameter` isn't available.
 
-#### Softmax
+Layers:
 
-* Layer type: `SoftmaxWithLoss`
+* [Image Data](layers/imagedata.html) - read raw images.
+* [Database](layers/data.html) - read data from LEVELDB or LMDB.
+* [HDF5 Input](layers/hdf5data.html) - read HDF5 data, allows data of arbitrary dimensions.
+* [HDF5 Output](layers/hdf5output.html) - write data as HDF5.
+* [Input](layers/input.html) - typically used for networks that are being deployed.
+* [Window Data](layers/windowdata.html) - read window data file.
+* [Memory Data](layers/memorydata.html) - read data directly from memory.
+* [Dummy Data](layers/dummydata.html) - for static data and debugging.
 
-The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient.
+Note that the [Python](layers/python.html) layer can be useful for creating custom data layers.
 
-#### Sum-of-Squares / Euclidean
+## Vision Layers
 
-* Layer type: `EuclideanLoss`
+Vision layers usually take *images* as input and produce other *images* as output, although they can take data of other types and dimensions.
+A typical "image" in the real-world may have one color channel ($$c = 1$$), as in a grayscale image, or three color channels ($$c = 3$$) as in an RGB (red, green, blue) image.
+But in this context, the distinguishing characteristic of an image is its spatial structure: usually an image has some non-trivial height $$h > 1$$ and width $$w > 1$$.
+This 2D geometry naturally lends itself to certain decisions about how to process the input.
+In particular, most of the vision layers work by applying a particular operation to some region of the input to produce a corresponding region of the output.
+In contrast, other layers (with few exceptions) ignore the spatial structure of the input, effectively treating it as "one big vector" with dimension $$chw$$.
 
-The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$.
+Layers:
 
-#### Hinge / Margin
+* [Convolution Layer](layers/convolution.html) - convolves the input image with a set of learnable filters, each producing one feature map in the output image.
+* [Pooling Layer](layers/pooling.html) - max, average, or stochastic pooling.
+* [Spatial Pyramid Pooling (SPP)](layers/spp.html)
+* [Crop](layers/crop.html) - perform cropping transformation.
+* [Deconvolution Layer](layers/deconvolution.html) - transposed convolution.
 
-* Layer type: `HingeLoss`
-* CPU implementation: `./src/caffe/layers/hinge_loss_layer.cpp`
-* CUDA GPU implementation: none yet
-* Parameters (`HingeLossParameter hinge_loss_param`)
-    - Optional
-        - `norm` [default L1]: the norm used. Currently L1, L2
-* Inputs
-    - `n * c * h * w` Predictions
-    - `n * 1 * 1 * 1` Labels
-* Output
-    - `1 * 1 * 1 * 1` Computed Loss
-* Samples
+* [Im2Col](layers/im2col.html) - relic helper layer that is not used much anymore.
 
-      # L1 Norm
-      layer {
-        name: "loss"
-        type: "HingeLoss"
-        bottom: "pred"
-        bottom: "label"
-      }
+## Recurrent Layers
 
-      # L2 Norm
-      layer {
-        name: "loss"
-        type: "HingeLoss"
-        bottom: "pred"
-        bottom: "label"
-        top: "loss"
-        hinge_loss_param {
-          norm: L2
-        }
-      }
+Layers:
 
-The hinge loss layer computes a one-vs-all hinge or squared hinge loss.
+* [Recurrent](layers/recurrent.html)
+* [RNN](layers/rnn.html)
+* [Long-Short Term Memory (LSTM)](layers/lstm.html)
 
-#### Sigmoid Cross-Entropy
+## Common Layers
 
-`SigmoidCrossEntropyLoss`
+Layers:
 
-#### Infogain
+* [Inner Product](layers/innerproduct.html) - fully connected layer.
+* [Dropout](layers/dropout.html)
+* [Embed](layers/embed.html) - for learning embeddings of one-hot encoded vectors (takes an index as input).
 
-`InfogainLoss`
+## Normalization Layers
 
-#### Accuracy and Top-k
+* [Local Response Normalization (LRN)](layers/lrn.html) - performs a kind of "lateral inhibition" by normalizing over local input regions.
+* [Mean Variance Normalization (MVN)](layers/mvn.html) - performs contrast normalization / instance normalization.
+* [Batch Normalization](layers/batchnorm.html) - performs normalization over mini-batches.
 
-`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step.
+The [bias](layers/bias.html) and [scale](layers/scale.html) layers can be helpful in combination with normalization.
 
-### Activation / Neuron Layers
+## Activation / Neuron Layers
 
 In general, activation / Neuron layers are element-wise operators, taking one bottom blob and producing one top blob of the same size. In the layers below, we will ignore the input and out sizes as they are identical:
 
@@ -189,337 +80,56 @@ In general, activation / Neuron layers are element-wise operators, taking one bo
 * Output
     - n * c * h * w
 
-#### ReLU / Rectified-Linear and Leaky-ReLU
-
-* Layer type: `ReLU`
-* CPU implementation: `./src/caffe/layers/relu_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/relu_layer.cu`
-* Parameters (`ReLUParameter relu_param`)
-    - Optional
-        - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0.
-* Sample (as seen in `./models/bvlc_reference_caffenet/train_val.prototxt`)
-
-      layer {
-        name: "relu1"
-        type: "ReLU"
-        bottom: "conv1"
-        top: "conv1"
-      }
-
-Given an input value x, The `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to preserve memory consumption.
-
-#### Sigmoid
-
-* Layer type: `Sigmoid`
-* CPU implementation: `./src/caffe/layers/sigmoid_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/sigmoid_layer.cu`
-* Sample (as seen in `./examples/mnist/mnist_autoencoder.prototxt`)
-
-      layer {
-        name: "encode1neuron"
-        bottom: "encode1"
-        top: "encode1neuron"
-        type: "Sigmoid"
-      }
-
-The `Sigmoid` layer computes the output as sigmoid(x) for each input element x.
-
-#### TanH / Hyperbolic Tangent
-
-* Layer type: `TanH`
-* CPU implementation: `./src/caffe/layers/tanh_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/tanh_layer.cu`
-* Sample
-
-      layer {
-        name: "layer"
-        bottom: "in"
-        top: "out"
-        type: "TanH"
-      }
-
-The `TanH` layer computes the output as tanh(x) for each input element x.
-
-#### Absolute Value
-
-* Layer type: `AbsVal`
-* CPU implementation: `./src/caffe/layers/absval_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/absval_layer.cu`
-* Sample
-
-      layer {
-        name: "layer"
-        bottom: "in"
-        top: "out"
-        type: "AbsVal"
-      }
-
-The `AbsVal` layer computes the output as abs(x) for each input element x.
-
-#### Power
-
-* Layer type: `Power`
-* CPU implementation: `./src/caffe/layers/power_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/power_layer.cu`
-* Parameters (`PowerParameter power_param`)
-    - Optional
-        - `power` [default 1]
-        - `scale` [default 1]
-        - `shift` [default 0]
-* Sample
-
-      layer {
-        name: "layer"
-        bottom: "in"
-        top: "out"
-        type: "Power"
-        power_param {
-          power: 1
-          scale: 1
-          shift: 0
-        }
-      }
-
-The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x.
-
-#### BNLL
-
-* Layer type: `BNLL`
-* CPU implementation: `./src/caffe/layers/bnll_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/bnll_layer.cu`
-* Sample
-
-      layer {
-        name: "layer"
-        bottom: "in"
-        top: "out"
-        type: BNLL
-      }
-
-The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x.
-
-
-### Data Layers
-
-Data enters Caffe through data layers: they lie at the bottom of nets. Data can come from efficient databases (LevelDB or LMDB), directly from memory, or, when efficiency is not critical, from files on disk in HDF5 or common image formats.
-
-Common input preprocessing (mean subtraction, scaling, random cropping, and mirroring) is available by specifying `TransformationParameter`s.
-
-#### Database
+Layers:
 
-* Layer type: `Data`
-* Parameters
-    - Required
-        - `source`: the name of the directory containing the database
-        - `batch_size`: the number of inputs to process at one time
-    - Optional
-        - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous sgd
-        - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB`
+* [ReLU / Rectified-Linear and Leaky-ReLU](layers/relu.html) - ReLU and Leaky-ReLU rectification.
+* [PReLU](layers/prelu.html) - parametric ReLU.
+* [ELU](layers/elu.html) - exponential linear rectification.
+* [Sigmoid](layers/sigmoid.html)
+* [TanH](layers/tanh.html)
+* [Absolute Value](layers/absval.html)
+* [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power.
+* [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x).
+* [Log](layers/log.html) - f(x) = log(x).
+* [BNLL](layers/bnll.html) - f(x) = log(1 + exp(x)).
+* [Threshold](layers/threshold.html) - performs a step function at a user-defined threshold.
+* [Bias](layers/bias.html) - adds a bias to a blob that can either be learned or fixed.
+* [Scale](layers/scale.html) - scales a blob by an amount that can either be learned or fixed.
 
+## Utility Layers
 
+Layers:
 
-#### In-Memory
+* [Flatten](layers/flatten.html)
+* [Reshape](layers/reshape.html)
+* [Batch Reindex](layers/batchreindex.html)
 
-* Layer type: `MemoryData`
-* Parameters
-    - Required
-        - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory
+* [Split](layers/split.html)
+* [Concat](layers/concat.html)
+* [Slicing](layers/slice.html)
+* [Eltwise](layers/eltwise.html) - element-wise operations such as product or sum between two blobs.
+* [Filter / Mask](layers/filter.html) - mask or select output using last blob.
+* [Parameter](layers/parameter.html) - enable parameters to be shared between layers.
+* [Reduction](layers/reduction.html) - reduce input blob to scalar blob using operations such as sum or mean.
+* [Silence](layers/silence.html) - prevent top-level blobs from being printed during training.
 
-The memory data layer reads data directly from memory, without copying it. In order to use it, one must call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) in order to specify a source of contiguous data (as 4D row major array), which is read one batch-sized chunk at a time.
+* [ArgMax](layers/argmax.html)
+* [Softmax](layers/softmax.html)
 
-#### HDF5 Input
+* [Python](layers/python.html) - allows custom Python layers.
 
-* Layer type: `HDF5Data`
-* Parameters
-    - Required
-        - `source`: the name of the file to read from
-        - `batch_size`
+## Loss Layers
 
-#### HDF5 Output
-
-* Layer type: `HDF5Output`
-* Parameters
-    - Required
-        - `file_name`: name of file to write to
-
-The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk.
-
-#### Images
-
-* Layer type: `ImageData`
-* Parameters
-    - Required
-        - `source`: name of a text file, with each line giving an image filename and label
-        - `batch_size`: number of images to batch together
-    - Optional
-        - `rand_skip`
-        - `shuffle` [default false]
-        - `new_height`, `new_width`: if provided, resize all images to this size
-
-#### Windows
-
-`WindowData`
-
-#### Dummy
-
-`DummyData` is for development and debugging. See `DummyDataParameter`.
-
-### Common Layers
-
-#### Inner Product
-
-* Layer type: `InnerProduct`
-* CPU implementation: `./src/caffe/layers/inner_product_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/inner_product_layer.cu`
-* Parameters (`InnerProductParameter inner_product_param`)
-    - Required
-        - `num_output` (`c_o`): the number of filters
-    - Strongly recommended
-        - `weight_filler` [default `type: 'constant' value: 0`]
-    - Optional
-        - `bias_filler` [default `type: 'constant' value: 0`]
-        - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs
-* Input
-    - `n * c_i * h_i * w_i`
-* Output
-    - `n * c_o * 1 * 1`
-* Sample
-
-      layer {
-        name: "fc8"
-        type: "InnerProduct"
-        # learning rate and decay multipliers for the weights
-        param { lr_mult: 1 decay_mult: 1 }
-        # learning rate and decay multipliers for the biases
-        param { lr_mult: 2 decay_mult: 0 }
-        inner_product_param {
-          num_output: 1000
-          weight_filler {
-            type: "gaussian"
-            std: 0.01
-          }
-          bias_filler {
-            type: "constant"
-            value: 0
-          }
-        }
-        bottom: "fc7"
-        top: "fc8"
-      }
-
-The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1).
-
-#### Splitting
-
-The `Split` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers.
-
-#### Flattening
-
-The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)`
-
-#### Reshape
-
-* Layer type: `Reshape`
-* Implementation: `./src/caffe/layers/reshape_layer.cpp`
-* Parameters (`ReshapeParameter reshape_param`)
-    - Optional: (also see detailed description below)
-        - `shape`
-
-* Input
-    - a single blob with arbitrary dimensions
-* Output
-    - the same blob, with modified dimensions, as specified by `reshape_param`
-
-* Sample
-
-        layer {
-          name: "reshape"
-          type: "Reshape"
-          bottom: "input"
-          top: "output"
-          reshape_param {
-            shape {
-              dim: 0  # copy the dimension from below
-              dim: 2
-              dim: 3
-              dim: -1 # infer it from the other dimensions
-            }
-          }
-        }
-
-The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process.
-
-Output dimensions are specified by the `ReshapeParam` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values:
-
-* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension.
-* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s or `[]` for *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation.
-
-As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer.
-
-#### Concatenation
-
-* Layer type: `Concat`
-* CPU implementation: `./src/caffe/layers/concat_layer.cpp`
-* CUDA GPU implementation: `./src/caffe/layers/concat_layer.cu`
-* Parameters (`ConcatParameter concat_param`)
-    - Optional
-        - `axis` [default 1]: 0 for concatenation along num and 1 for channels.
-* Input
-    - `n_i * c_i * h * w` for each input blob i from 1 to K.
-* Output
-    - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same.
-    - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same.
-* Sample
-
-      layer {
-        name: "concat"
-        bottom: "in1"
-        bottom: "in2"
-        top: "out"
-        type: "Concat"
-        concat_param {
-          axis: 1
-        }
-      }
-
-The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob.
-
-#### Slicing
-
-The `Slice` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices.
-
-* Sample
-
-      layer {
-        name: "slicer_label"
-        type: "Slice"
-        bottom: "label"
-        ## Example of label with a shape N x 3 x 1 x 1
-        top: "label1"
-        top: "label2"
-        top: "label3"
-        slice_param {
-          axis: 1
-          slice_point: 1
-          slice_point: 2
-        }
-      }
-
-`axis` indicates the target axis; `slice_point` indicates indexes in the selected dimension (the number of indices must be equal to the number of top blobs minus one).
-
-
-#### Elementwise Operations
-
-`Eltwise`
-
-#### Argmax
-
-`ArgMax`
-
-#### Softmax
+Loss drives learning by comparing an output to a target and assigning cost to minimize. The loss itself is computed by the forward pass and the gradient w.r.t. to the loss is computed by the backward pass.
 
-`Softmax`
+Layers:
 
-#### Mean-Variance Normalization
+* [Multinomial Logistic Loss](layers/multinomiallogisticloss.html)
+* [Infogain Loss](layers/infogainloss.html) - a generalization of MultinomialLogisticLossLayer.
+* [Softmax with Loss](layers/softmaxwithloss.html) - computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient.
+* [Sum-of-Squares / Euclidean](layers/euclideanloss.html) - computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$.
+* [Hinge / Margin](layers/hingeloss.html) - computes a one-vs-all hinge (L1) or squared hinge (L2) loss.
+* [Sigmoid Cross-Entropy Loss](layers/sigmoidcrossentropyloss.html) - computes the cross-entropy (logistic) loss, often used for predicting targets interpreted as probabilities.
+* [Accuracy / Top-k layer](layers/accuracy.html) - scores the output as the accuracy with respect to the target -- it is not actually a loss and has no backward step.
+* [Contrastive Loss](layers/contrastiveloss.html)
 
-`MVN`
diff --git a/docs/tutorial/layers/absval.md b/docs/tutorial/layers/absval.md
new file mode 100644
index 0000000..220c411
--- /dev/null
+++ b/docs/tutorial/layers/absval.md
@@ -0,0 +1,22 @@
+---
+title: Absolute Value Layer
+---
+
+# Absolute Value Layer
+
+* Layer type: `AbsVal`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AbsValLayer.html)
+* Header: [`./include/caffe/layers/absval_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/absval_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/absval_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/absval_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/absval_layer.cu)
+
+* Sample
+
+      layer {
+        name: "layer"
+        bottom: "in"
+        top: "out"
+        type: "AbsVal"
+      }
+
+The `AbsVal` layer computes the output as abs(x) for each input element x.
diff --git a/docs/tutorial/layers/accuracy.md b/docs/tutorial/layers/accuracy.md
new file mode 100644
index 0000000..ecf8409
--- /dev/null
+++ b/docs/tutorial/layers/accuracy.md
@@ -0,0 +1,21 @@
+---
+title: Accuracy and Top-k
+---
+
+# Accuracy and Top-k
+
+`Accuracy` scores the output as the accuracy of the output with respect to the target -- it is not actually a loss and has no backward step.
+
+* Layer type: `Accuracy`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1AccuracyLayer.html)
+* Header: [`./include/caffe/layers/accuracy_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/accuracy_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/accuracy_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/accuracy_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/accuracy_layer.cu)
+
+## Parameters
+* Parameters (`AccuracyParameter accuracy_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/AccuracyParameter.txt %}
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/tutorial/layers/argmax.md b/docs/tutorial/layers/argmax.md
new file mode 100644
index 0000000..f5f173a
--- /dev/null
+++ b/docs/tutorial/layers/argmax.md
@@ -0,0 +1,19 @@
+---
+title: ArgMax Layer
+---
+
+# ArgMax Layer
+
+* Layer type: `ArgMax`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ArgMaxLayer.html)
+* Header: [`./include/caffe/layers/argmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/argmax_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/argmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/argmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/argmax_layer.cu)
+
+## Parameters
+* Parameters (`ArgMaxParameter argmax_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ArgMaxParameter.txt %}
+{% endhighlight %}
\ No newline at end of file
diff --git a/docs/tutorial/layers/batchnorm.md b/docs/tutorial/layers/batchnorm.md
new file mode 100644
index 0000000..a5be5ce
--- /dev/null
+++ b/docs/tutorial/layers/batchnorm.md
@@ -0,0 +1,20 @@
+---
+title: Batch Norm Layer
+---
+
+# Batch Norm Layer
+
+* Layer type: `BatchNorm`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchNormLayer.html)
+* Header: [`./include/caffe/layers/batch_norm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_norm_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/batch_norm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/batch_norm_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_norm_layer.cu)
+
+## Parameters
+
+* Parameters (`BatchNormParameter batch_norm_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/BatchNormParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/batchreindex.md b/docs/tutorial/layers/batchreindex.md
new file mode 100644
index 0000000..21b36c3
--- /dev/null
+++ b/docs/tutorial/layers/batchreindex.md
@@ -0,0 +1,16 @@
+---
+title: Batch Reindex Layer
+---
+
+# Batch Reindex Layer
+
+* Layer type: `BatchReindex`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BatchReindexLayer.html)
+* Header: [`./include/caffe/layers/batch_reindex_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/batch_reindex_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/batch_reindex_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/batch_reindex_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/batch_reindex_layer.cu)
+
+
+## Parameters
+
+No parameters.
diff --git a/docs/tutorial/layers/bias.md b/docs/tutorial/layers/bias.md
new file mode 100644
index 0000000..d3a00c2
--- /dev/null
+++ b/docs/tutorial/layers/bias.md
@@ -0,0 +1,19 @@
+---
+title: Bias Layer
+---
+
+# Bias Layer
+
+* Layer type: `Bias`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BiasLayer.html)
+* Header: [`./include/caffe/layers/bias_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bias_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/bias_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/bias_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bias_layer.cu)
+
+## Parameters
+* Parameters (`BiasParameter bias_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/BiasParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/bnll.md b/docs/tutorial/layers/bnll.md
new file mode 100644
index 0000000..2b68b79
--- /dev/null
+++ b/docs/tutorial/layers/bnll.md
@@ -0,0 +1,25 @@
+---
+title: BNLL Layer
+---
+
+# BNLL Layer
+
+* Layer type: `BNLL`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1BNLLLayer.html)
+* Header: [`./include/caffe/layers/bnll_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/bnll_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/bnll_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/bnll_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/bnll_layer.cu)
+
+The `BNLL` (binomial normal log likelihood) layer computes the output as log(1 + exp(x)) for each input element x.
+
+## Parameters
+No parameters.
+
+## Sample
+
+      layer {
+        name: "layer"
+        bottom: "in"
+        top: "out"
+        type: "BNLL"
+      }
diff --git a/docs/tutorial/layers/concat.md b/docs/tutorial/layers/concat.md
new file mode 100644
index 0000000..c7b2539
--- /dev/null
+++ b/docs/tutorial/layers/concat.md
@@ -0,0 +1,40 @@
+---
+title: Concat Layer
+---
+
+# Concat Layer
+
+* Layer type: `Concat`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConcatLayer.html)
+* Header: [`./include/caffe/layers/concat_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/concat_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/concat_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/concat_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/concat_layer.cu)
+* Input
+    - `n_i * c_i * h * w` for each input blob i from 1 to K.
+* Output
+    - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same.
+    - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same.
+* Sample
+
+      layer {
+        name: "concat"
+        bottom: "in1"
+        bottom: "in2"
+        top: "out"
+        type: "Concat"
+        concat_param {
+          axis: 1
+        }
+      }
+
+The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob.
+
+## Parameters
+* Parameters (`ConcatParameter concat_param`)
+    - Optional
+        - `axis` [default 1]: 0 for concatenation along num and 1 for channels.
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ConcatParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/contrastiveloss.md b/docs/tutorial/layers/contrastiveloss.md
new file mode 100644
index 0000000..bb1859d
--- /dev/null
+++ b/docs/tutorial/layers/contrastiveloss.md
@@ -0,0 +1,20 @@
+---
+title: Contrastive Loss Layer
+---
+
+# Contrastive Loss Layer
+
+* Layer type: `ContrastiveLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ContrastiveLossLayer.html)
+* Header: [`./include/caffe/layers/contrastive_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/contrastive_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/contrastive_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/contrastive_loss_layer.cu)
+
+## Parameters
+
+* Parameters (`ContrastiveLossParameter contrastive_loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ContrastiveLossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/convolution.md b/docs/tutorial/layers/convolution.md
new file mode 100644
index 0000000..cc9f4fd
--- /dev/null
+++ b/docs/tutorial/layers/convolution.md
@@ -0,0 +1,63 @@
+---
+title: Convolution Layer
+---
+
+# Convolution Layer
+
+* Layer type: `Convolution`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ConvolutionLayer.html)
+* Header: [`./include/caffe/layers/conv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/conv_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/conv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/conv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
+* Input
+    - `n * c_i * h_i * w_i`
+* Output
+    - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise.
+
+The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image.
+
+## Sample
+
+Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt)):
+
+      layer {
+        name: "conv1"
+        type: "Convolution"
+        bottom: "data"
+        top: "conv1"
+        # learning rate and decay multipliers for the filters
+        param { lr_mult: 1 decay_mult: 1 }
+        # learning rate and decay multipliers for the biases
+        param { lr_mult: 2 decay_mult: 0 }
+        convolution_param {
+          num_output: 96     # learn 96 filters
+          kernel_size: 11    # each filter is 11x11
+          stride: 4          # step 4 pixels between each filter application
+          weight_filler {
+            type: "gaussian" # initialize the filters from a Gaussian
+            std: 0.01        # distribution with stdev 0.01 (default mean: 0)
+          }
+          bias_filler {
+            type: "constant" # initialize the biases to zero (0)
+            value: 0
+          }
+        }
+      }
+
+## Parameters
+* Parameters (`ConvolutionParameter convolution_param`)
+    - Required
+        - `num_output` (`c_o`): the number of filters
+        - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter
+    - Strongly Recommended
+        - `weight_filler` [default `type: 'constant' value: 0`]
+    - Optional
+        - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs
+        - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input
+        - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input
+        - `group` (g) [default 1]: If g > 1, we restrict the connectivity of each filter to a subset of the input. Specifically, the input and output channels are separated into g groups, and the $$i$$th group of output channels is connected only to the $$i$$th group of input channels.
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ConvolutionParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/crop.md b/docs/tutorial/layers/crop.md
new file mode 100644
index 0000000..28f9124
--- /dev/null
+++ b/docs/tutorial/layers/crop.md
@@ -0,0 +1,20 @@
+---
+title: Crop Layer
+---
+
+# Crop Layer
+
+* Layer type: `Crop`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1CropLayer.html)
+* Header: [`./include/caffe/layers/crop_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/crop_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/crop_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/crop_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/crop_layer.cu)
+
+## Parameters
+
+* Parameters (`CropParameter crop_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/CropParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/data.md b/docs/tutorial/layers/data.md
new file mode 100644
index 0000000..58e0dca
--- /dev/null
+++ b/docs/tutorial/layers/data.md
@@ -0,0 +1,29 @@
+---
+title: Database Layer
+---
+
+# Database Layer
+
+* Layer type: `Data`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DataLayer.html)
+* Header: [`./include/caffe/layers/data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/data_layer.cpp)
+
+
+## Parameters
+
+* Parameters (`DataParameter data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/DataParameter.txt %}
+{% endhighlight %}
+
+* Parameters
+    - Required
+        - `source`: the name of the directory containing the database
+        - `batch_size`: the number of inputs to process at one time
+    - Optional
+        - `rand_skip`: skip up to this number of inputs at the beginning; useful for asynchronous SGD
+        - `backend` [default `LEVELDB`]: choose whether to use a `LEVELDB` or `LMDB` database (see the sample below)
+
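+## Sample
+
+A minimal sketch of a training-phase `Data` layer reading from an LMDB database; the blob names and database path are illustrative, not taken from a shipped model:
+
+      layer {
+        name: "data"
+        type: "Data"
+        top: "data"
+        top: "label"
+        include { phase: TRAIN }
+        data_param {
+          source: "path/to/train_lmdb"   # directory containing the database
+          batch_size: 64                 # number of inputs processed at a time
+          backend: LMDB                  # LEVELDB or LMDB
+        }
+      }
+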
diff --git a/docs/tutorial/layers/deconvolution.md b/docs/tutorial/layers/deconvolution.md
new file mode 100644
index 0000000..2eff967
--- /dev/null
+++ b/docs/tutorial/layers/deconvolution.md
@@ -0,0 +1,22 @@
+---
+title: Deconvolution Layer
+---
+
+# Deconvolution Layer
+
+* Layer type: `Deconvolution`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DeconvolutionLayer.html)
+* Header: [`./include/caffe/layers/deconv_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/deconv_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/deconv_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/deconv_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/deconv_layer.cu)
+
+## Parameters
+
+Uses the same parameters as the Convolution layer.
+
+* Parameters (`ConvolutionParameter convolution_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ConvolutionParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/dropout.md b/docs/tutorial/layers/dropout.md
new file mode 100644
index 0000000..d8c6f95
--- /dev/null
+++ b/docs/tutorial/layers/dropout.md
@@ -0,0 +1,20 @@
+---
+title: Dropout Layer
+---
+
+# Dropout Layer
+
+* Layer type: `Dropout`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DropoutLayer.html)
+* Header: [`./include/caffe/layers/dropout_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dropout_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/dropout_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/dropout_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dropout_layer.cu)
+
+## Parameters
+
+* Parameters (`DropoutParameter dropout_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/DropoutParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/dummydata.md b/docs/tutorial/layers/dummydata.md
new file mode 100644
index 0000000..d069f9c
--- /dev/null
+++ b/docs/tutorial/layers/dummydata.md
@@ -0,0 +1,20 @@
+---
+title: Dummy Data Layer
+---
+
+# Dummy Data Layer
+
+* Layer type: `DummyData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1DummyDataLayer.html)
+* Header: [`./include/caffe/layers/dummy_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/dummy_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/dummy_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/dummy_data_layer.cpp)
+
+
+## Parameters
+
+* Parameters (`DummyDataParameter dummy_data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/DummyDataParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/eltwise.md b/docs/tutorial/layers/eltwise.md
new file mode 100644
index 0000000..70fe791
--- /dev/null
+++ b/docs/tutorial/layers/eltwise.md
@@ -0,0 +1,20 @@
+---
+title: Eltwise Layer
+---
+
+# Eltwise Layer
+
+* Layer type: `Eltwise`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EltwiseLayer.html)
+* Header: [`./include/caffe/layers/eltwise_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/eltwise_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/eltwise_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/eltwise_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/eltwise_layer.cu)
+
+## Parameters
+
+* Parameters (`EltwiseParameter eltwise_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/EltwiseParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/elu.md b/docs/tutorial/layers/elu.md
new file mode 100644
index 0000000..11db0f0
--- /dev/null
+++ b/docs/tutorial/layers/elu.md
@@ -0,0 +1,25 @@
+---
+title: ELU Layer
+---
+
+# ELU Layer
+
+* Layer type: `ELU`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ELULayer.html)
+* Header: [`./include/caffe/layers/elu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/elu_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/elu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/elu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/elu_layer.cu)
+
+## References
+
+* Clevert, Djork-Arne, Thomas Unterthiner, and Sepp Hochreiter.
+  "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)" [arXiv:1511.07289](https://arxiv.org/abs/1511.07289). (2015).
+
+## Parameters
+
+* Parameters (`ELUParameter elu_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ELUParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/embed.md b/docs/tutorial/layers/embed.md
new file mode 100644
index 0000000..271636d
--- /dev/null
+++ b/docs/tutorial/layers/embed.md
@@ -0,0 +1,20 @@
+---
+title: Embed Layer
+---
+
+# Embed Layer
+
+* Layer type: `Embed`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EmbedLayer.html)
+* Header: [`./include/caffe/layers/embed_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/embed_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/embed_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/embed_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/embed_layer.cu)
+
+## Parameters
+
+* Parameters (`EmbedParameter embed_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/EmbedParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/euclideanloss.md b/docs/tutorial/layers/euclideanloss.md
new file mode 100644
index 0000000..c1b7208
--- /dev/null
+++ b/docs/tutorial/layers/euclideanloss.md
@@ -0,0 +1,16 @@
+---
+title: Euclidean Loss Layer
+---
+# Sum-of-Squares / Euclidean Loss Layer
+
+* Layer type: `EuclideanLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1EuclideanLossLayer.html)
+* Header: [`./include/caffe/layers/euclidean_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/euclidean_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/euclidean_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/euclidean_loss_layer.cu)
+
+The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$.
+
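+## Sample
+
+A minimal sketch of how the layer is typically wired; the blob names (`pred`, `target`) are illustrative:
+
+      layer {
+        name: "loss"
+        type: "EuclideanLoss"
+        bottom: "pred"     # predictions
+        bottom: "target"   # ground-truth values
+        top: "loss"
+      }
+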
+## Parameters
+
+Does not take any parameters.
diff --git a/docs/tutorial/layers/exp.md b/docs/tutorial/layers/exp.md
new file mode 100644
index 0000000..ef2500e
--- /dev/null
+++ b/docs/tutorial/layers/exp.md
@@ -0,0 +1,24 @@
+---
+title: Exponential Layer
+---
+
+# Exponential Layer
+
+* Layer type: `Exp`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ExpLayer.html)
+* Header: [`./include/caffe/layers/exp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/exp_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/exp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/exp_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/exp_layer.cu)
+
+## Parameters
+
+* Parameters (`ExpParameter exp_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ExpParameter.txt %}
+{% endhighlight %}
+
+## See also
+
+* [Power layer](power.html)
diff --git a/docs/tutorial/layers/filter.md b/docs/tutorial/layers/filter.md
new file mode 100644
index 0000000..aeda9ee
--- /dev/null
+++ b/docs/tutorial/layers/filter.md
@@ -0,0 +1,15 @@
+---
+title: Filter Layer
+---
+
+# Filter Layer
+
+* Layer type: `Filter`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FilterLayer.html)
+* Header: [`./include/caffe/layers/filter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/filter_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/filter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/filter_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/filter_layer.cu)
+
+## Parameters
+
+Does not take any parameters.
diff --git a/docs/tutorial/layers/flatten.md b/docs/tutorial/layers/flatten.md
new file mode 100644
index 0000000..ecf0826
--- /dev/null
+++ b/docs/tutorial/layers/flatten.md
@@ -0,0 +1,21 @@
+---
+title: Flatten Layer
+---
+
+# Flatten Layer
+
+* Layer type: `Flatten`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1FlattenLayer.html)
+* Header: [`./include/caffe/layers/flatten_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/flatten_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/flatten_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/flatten_layer.cpp)
+
+The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)`.
+
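+## Sample
+
+A minimal sketch; the blob names are illustrative:
+
+      layer {
+        name: "flatten"
+        type: "Flatten"
+        bottom: "conv"
+        top: "flat"
+      }
+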
+## Parameters
+
+* Parameters (`FlattenParameter flatten_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/FlattenParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/hdf5data.md b/docs/tutorial/layers/hdf5data.md
new file mode 100644
index 0000000..d6b7ea2
--- /dev/null
+++ b/docs/tutorial/layers/hdf5data.md
@@ -0,0 +1,20 @@
+---
+title: HDF5 Data Layer
+---
+
+# HDF5 Data Layer
+
+* Layer type: `HDF5Data`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5DataLayer.html)
+* Header: [`./include/caffe/layers/hdf5_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/hdf5_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/hdf5_data_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_data_layer.cu)
+
+## Parameters
+
+* Parameters (`HDF5DataParameter hdf5_data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/HDF5DataParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/hdf5output.md b/docs/tutorial/layers/hdf5output.md
new file mode 100644
index 0000000..cfbe4dd
--- /dev/null
+++ b/docs/tutorial/layers/hdf5output.md
@@ -0,0 +1,25 @@
+---
+title: HDF5 Output Layer
+---
+
+# HDF5 Output Layer
+
+* Layer type: `HDF5Output`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HDF5OutputLayer.html)
+* Header: [`./include/caffe/layers/hdf5_output_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hdf5_output_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/hdf5_output_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/hdf5_output_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hdf5_output_layer.cu)
+
+The HDF5 output layer performs the opposite function of the other layers in this section: it writes its input blobs to disk.
+
+## Parameters
+
+* Parameters (`HDF5OutputParameter hdf5_output_param`)
+    - Required
+        - `file_name`: name of file to write to
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/HDF5OutputParameter.txt %}
+{% endhighlight %}
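+
+## Sample
+
+A minimal sketch; the blob names and output file name are illustrative:
+
+      layer {
+        name: "output"
+        type: "HDF5Output"
+        bottom: "data"
+        bottom: "label"
+        hdf5_output_param {
+          file_name: "output.h5"   # HDF5 file the input blobs are written to
+        }
+      }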
diff --git a/docs/tutorial/layers/hingeloss.md b/docs/tutorial/layers/hingeloss.md
new file mode 100644
index 0000000..ef4fd95
--- /dev/null
+++ b/docs/tutorial/layers/hingeloss.md
@@ -0,0 +1,19 @@
+---
+title: Hinge Loss Layer
+---
+
+# Hinge (L1, L2) Loss Layer
+
+* Layer type: `HingeLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1HingeLossLayer.html)
+* Header: [`./include/caffe/layers/hinge_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/hinge_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/hinge_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/hinge_loss_layer.cpp)
+
+## Parameters
+
+* Parameters (`HingeLossParameter hinge_loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/HingeLossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/im2col.md b/docs/tutorial/layers/im2col.md
new file mode 100644
index 0000000..0badc1c
--- /dev/null
+++ b/docs/tutorial/layers/im2col.md
@@ -0,0 +1,16 @@
+---
+title: Im2col Layer
+---
+
+# im2col
+
+* Layer type: `Im2col`
+* Header: [`./include/caffe/layers/im2col_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/im2col_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/im2col_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/im2col_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/im2col_layer.cu)
+
+`Im2col` is a helper for doing the image-to-column transformation that you most
+likely do not need to know about. This is used in Caffe's original convolution
+to do matrix multiplication by laying out all patches into a matrix.
+
+
diff --git a/docs/tutorial/layers/imagedata.md b/docs/tutorial/layers/imagedata.md
new file mode 100644
index 0000000..82c8a60
--- /dev/null
+++ b/docs/tutorial/layers/imagedata.md
@@ -0,0 +1,27 @@
+---
+title: ImageData Layer
+---
+
+# ImageData Layer
+
+* Layer type: `ImageData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ImageDataLayer.html)
+* Header: [`./include/caffe/layers/image_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/image_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/image_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/image_data_layer.cpp)
+
+## Parameters
+
+* Parameters (`ImageDataParameter image_data_parameter`)
+    - Required
+        - `source`: name of a text file, with each line giving an image filename and label
+        - `batch_size`: number of images to batch together
+    - Optional
+        - `rand_skip`
+        - `shuffle` [default false]
+        - `new_height`, `new_width`: if provided, resize all images to this size
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ImageDataParameter.txt %}
+{% endhighlight %}
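+
+## Sample
+
+A minimal sketch; the list file name and blob names are illustrative:
+
+      layer {
+        name: "data"
+        type: "ImageData"
+        top: "data"
+        top: "label"
+        image_data_param {
+          source: "train_list.txt"  # each line: <image path> <label>
+          batch_size: 32
+          new_height: 256           # resize all images to 256 x 256
+          new_width: 256
+          shuffle: true
+        }
+      }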
diff --git a/docs/tutorial/layers/infogainloss.md b/docs/tutorial/layers/infogainloss.md
new file mode 100644
index 0000000..86140b6
--- /dev/null
+++ b/docs/tutorial/layers/infogainloss.md
@@ -0,0 +1,24 @@
+---
+title: Infogain Loss Layer
+---
+
+# Infogain Loss Layer
+
+* Layer type: `InfogainLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InfogainLossLayer.html)
+* Header: [`./include/caffe/layers/infogain_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/infogain_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/infogain_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/infogain_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/infogain_loss_layer.cu)
+
+A generalization of [MultinomialLogisticLossLayer](multinomiallogisticloss.html) that takes an "information gain" (infogain) matrix specifying the "value" of all label pairs.
+
+Equivalent to the [MultinomialLogisticLossLayer](multinomiallogisticloss.html) if the infogain matrix is the identity.
+
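+## Sample
+
+A minimal sketch; the blob names and the file holding the infogain matrix H are illustrative:
+
+      layer {
+        name: "loss"
+        type: "InfogainLoss"
+        bottom: "prob"    # predicted probabilities
+        bottom: "label"   # ground-truth labels
+        top: "loss"
+        infogain_loss_param {
+          source: "infogain.binaryproto"  # serialized infogain matrix H
+        }
+      }
+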
+## Parameters
+
+* Parameters (`InfogainLossParameter infogain_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/InfogainLossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/innerproduct.md b/docs/tutorial/layers/innerproduct.md
new file mode 100644
index 0000000..98b9bea
--- /dev/null
+++ b/docs/tutorial/layers/innerproduct.md
@@ -0,0 +1,59 @@
+---
+title: Inner Product / Fully Connected Layer
+---
+
+# Inner Product / Fully Connected Layer
+
+* Layer type: `InnerProduct`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InnerProductLayer.html)
+* Header: [`./include/caffe/layers/inner_product_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/inner_product_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/inner_product_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/inner_product_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/inner_product_layer.cu)
+
+* Input
+    - `n * c_i * h_i * w_i`
+* Output
+    - `n * c_o * 1 * 1`
+* Sample
+
+      layer {
+        name: "fc8"
+        type: "InnerProduct"
+        # learning rate and decay multipliers for the weights
+        param { lr_mult: 1 decay_mult: 1 }
+        # learning rate and decay multipliers for the biases
+        param { lr_mult: 2 decay_mult: 0 }
+        inner_product_param {
+          num_output: 1000
+          weight_filler {
+            type: "gaussian"
+            std: 0.01
+          }
+          bias_filler {
+            type: "constant"
+            value: 0
+          }
+        }
+        bottom: "fc7"
+        top: "fc8"
+      }
+
+The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1).
+
+
+## Parameters
+
+* Parameters (`InnerProductParameter inner_product_param`)
+    - Required
+        - `num_output` (`c_o`): the number of outputs of the layer
+    - Strongly recommended
+        - `weight_filler` [default `type: 'constant' value: 0`]
+    - Optional
+        - `bias_filler` [default `type: 'constant' value: 0`]
+        - `bias_term` [default `true`]: specifies whether to learn and apply a set of additive biases to the filter outputs
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/InnerProductParameter.txt %}
+{% endhighlight %}
+ 
diff --git a/docs/tutorial/layers/input.md b/docs/tutorial/layers/input.md
new file mode 100644
index 0000000..b74c35d
--- /dev/null
+++ b/docs/tutorial/layers/input.md
@@ -0,0 +1,19 @@
+---
+title: Input Layer
+---
+
+# Input Layer
+
+* Layer type: `Input`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1InputLayer.html)
+* Header: [`./include/caffe/layers/input_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/input_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/input_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/input_layer.cpp)
+
+## Parameters
+
+* Parameters (`InputParameter input_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/InputParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/log.md b/docs/tutorial/layers/log.md
new file mode 100644
index 0000000..df52037
--- /dev/null
+++ b/docs/tutorial/layers/log.md
@@ -0,0 +1,20 @@
+---
+title: Log Layer
+---
+
+# Log Layer
+
+* Layer type: `Log`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LogLayer.html)
+* Header: [`./include/caffe/layers/log_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/log_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/log_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/log_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/log_layer.cu)
+
+## Parameters
+
+* Parameters (`LogParameter log_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LogParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md
new file mode 100644
index 0000000..387311c
--- /dev/null
+++ b/docs/tutorial/layers/lrn.md
@@ -0,0 +1,28 @@
+---
+title: Local Response Normalization (LRN)
+---
+
+# Local Response Normalization (LRN)
+
+* Layer type: `LRN`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LRNLayer.html)
+* Header: [`./include/caffe/layers/lrn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lrn_layer.hpp)
+* CPU Implementation: [`./src/caffe/layers/lrn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cpp)
+* CUDA GPU Implementation: [`./src/caffe/layers/lrn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lrn_layer.cu)
+* Parameters (`LRNParameter lrn_param`)
+    - Optional
+        - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN)
+        - `alpha` [default 1]: the scaling parameter (see below)
+        - `beta` [default 0.75]: the exponent (see below)
+        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`)
+
+The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary).
+
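+## Sample
+
+A typical configuration; the blob names are illustrative and the values mirror those commonly used in the CaffeNet reference model:
+
+      layer {
+        name: "norm1"
+        type: "LRN"
+        bottom: "pool1"
+        top: "norm1"
+        lrn_param {
+          local_size: 5
+          alpha: 0.0001
+          beta: 0.75
+        }
+      }
+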
+## Parameters
+
+* Parameters (`LRNParameter lrn_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LRNParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/lstm.md b/docs/tutorial/layers/lstm.md
new file mode 100644
index 0000000..8e4095e
--- /dev/null
+++ b/docs/tutorial/layers/lstm.md
@@ -0,0 +1,21 @@
+---
+title: LSTM Layer
+---
+
+# LSTM Layer
+
+* Layer type: `LSTM`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1LSTMLayer.html)
+* Header: [`./include/caffe/layers/lstm_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/lstm_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/lstm_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_layer.cpp)
+* CPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cpp)
+* CUDA GPU implementation (helper): [`./src/caffe/layers/lstm_unit_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/lstm_unit_layer.cu)
+
+## Parameters
+
+* Parameters (`RecurrentParameter recurrent_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/RecurrentParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/memorydata.md b/docs/tutorial/layers/memorydata.md
new file mode 100644
index 0000000..754e62a
--- /dev/null
+++ b/docs/tutorial/layers/memorydata.md
@@ -0,0 +1,25 @@
+---
+title: Memory Data Layer
+---
+
+# Memory Data Layer
+
+* Layer type: `MemoryData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MemoryDataLayer.html)
+* Header: [`./include/caffe/layers/memory_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/memory_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/memory_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/memory_data_layer.cpp)
+
+The memory data layer reads data directly from memory, without copying it. To use it, call `MemoryDataLayer::Reset` (from C++) or `Net.set_input_arrays` (from Python) to specify a source of contiguous data (as a 4D row-major array), which is read one batch-sized chunk at a time (see the sample below).
+
+## Parameters
+
+* Parameters (`MemoryDataParameter memory_data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/MemoryDataParameter.txt %}
+{% endhighlight %}
+
+* Parameters
+    - Required
+        - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory
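+
+## Sample
+
+A minimal sketch; the blob names and dimensions are illustrative:
+
+      layer {
+        name: "data"
+        type: "MemoryData"
+        top: "data"
+        top: "label"
+        memory_data_param {
+          batch_size: 32
+          channels: 3
+          height: 227
+          width: 227
+        }
+      }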
diff --git a/docs/tutorial/layers/multinomiallogisticloss.md b/docs/tutorial/layers/multinomiallogisticloss.md
new file mode 100644
index 0000000..a28ab91
--- /dev/null
+++ b/docs/tutorial/layers/multinomiallogisticloss.md
@@ -0,0 +1,19 @@
+---
+title: Multinomial Logistic Loss Layer
+---
+
+# Multinomial Logistic Loss Layer
+
+* Layer type: `MultinomialLogisticLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MultinomialLogisticLossLayer.html)
+* Header: [`./include/caffe/layers/multinomial_logistic_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/multinomial_logistic_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/multinomial_logistic_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/multinomial_logistic_loss_layer.cpp)
+
+## Parameters
+
+* Parameters (`LossParameter loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LossParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/mvn.md b/docs/tutorial/layers/mvn.md
new file mode 100644
index 0000000..08e4488
--- /dev/null
+++ b/docs/tutorial/layers/mvn.md
@@ -0,0 +1,20 @@
+---
+title: Mean-Variance Normalization (MVN) Layer
+---
+
+# Mean-Variance Normalization (MVN) Layer
+
+* Layer type: `MVN`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1MVNLayer.html)
+* Header: [`./include/caffe/layers/mvn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/mvn_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/mvn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/mvn_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cu)
+
+## Parameters
+
+* Parameters (`MVNParameter mvn_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/MVNParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/parameter.md b/docs/tutorial/layers/parameter.md
new file mode 100644
index 0000000..b7e85ec
--- /dev/null
+++ b/docs/tutorial/layers/parameter.md
@@ -0,0 +1,21 @@
+---
+title: Parameter Layer
+---
+
+# Parameter Layer
+
+* Layer type: `Parameter`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ParameterLayer.html)
+* Header: [`./include/caffe/layers/parameter_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/parameter_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/parameter_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/parameter_layer.cpp)
+
+See [https://github.com/BVLC/caffe/pull/2079](https://github.com/BVLC/caffe/pull/2079).
+
+## Parameters
+
+* Parameters (`ParameterParameter parameter_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ParameterParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/pooling.md b/docs/tutorial/layers/pooling.md
new file mode 100644
index 0000000..12669ee
--- /dev/null
+++ b/docs/tutorial/layers/pooling.md
@@ -0,0 +1,47 @@
+---
+title: Pooling Layer
+---
+# Pooling
+
+* Layer type: `Pooling`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PoolingLayer.html)
+* Header: [`./include/caffe/layers/pooling_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/pooling_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/pooling_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/pooling_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu)
+
+* Input
+    - `n * c * h_i * w_i`
+* Output
+    - `n * c * h_o * w_o`, where `h_o` and `w_o` are computed in the same way as for convolution.
+
+## Parameters
+
+* Parameters (`PoolingParameter pooling_param`)
+    - Required
+        - `kernel_size` (or `kernel_h` and `kernel_w`): specifies height and width of each filter
+    - Optional
+        - `pool` [default MAX]: the pooling method. Currently MAX, AVE, or STOCHASTIC
+        - `pad` (or `pad_h` and `pad_w`) [default 0]: specifies the number of pixels to (implicitly) add to each side of the input
+        - `stride` (or `stride_h` and `stride_w`) [default 1]: specifies the intervals at which to apply the filters to the input
+
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/PoolingParameter.txt %}
+{% endhighlight %}
+
+## Sample
+* Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt))
+
+      layer {
+        name: "pool1"
+        type: "Pooling"
+        bottom: "conv1"
+        top: "pool1"
+        pooling_param {
+          pool: MAX
+          kernel_size: 3 # pool over a 3x3 region
+          stride: 2      # step two pixels (in the bottom blob) between pooling regions
+        }
+      }
diff --git a/docs/tutorial/layers/power.md b/docs/tutorial/layers/power.md
new file mode 100644
index 0000000..d661752
--- /dev/null
+++ b/docs/tutorial/layers/power.md
@@ -0,0 +1,46 @@
+---
+title: Power Layer
+---
+
+# Power Layer
+
+* Layer type: `Power`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PowerLayer.html)
+* Header: [`./include/caffe/layers/power_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/power_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/power_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/power_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/power_layer.cu)
+
+The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x.
+
+## Parameters
+* Parameters (`PowerParameter power_param`)
+    - Optional
+        - `power` [default 1]
+        - `scale` [default 1]
+        - `shift` [default 0]
+
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/PowerParameter.txt %}
+{% endhighlight %}
+ 
+ 
+ 
+## Sample
+
+      layer {
+        name: "layer"
+        bottom: "in"
+        top: "out"
+        type: "Power"
+        power_param {
+          power: 1
+          scale: 1
+          shift: 0
+        }
+      }
+
+## See also
+
+* [Exponential layer](exp.html)
diff --git a/docs/tutorial/layers/prelu.md b/docs/tutorial/layers/prelu.md
new file mode 100644
index 0000000..e7b7b44
--- /dev/null
+++ b/docs/tutorial/layers/prelu.md
@@ -0,0 +1,20 @@
+---
+title: PReLU Layer
+---
+
+# PReLU Layer
+
+* Layer type: `PReLU`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PReLULayer.html)
+* Header: [`./include/caffe/layers/prelu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/prelu_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/prelu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/prelu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/prelu_layer.cu)
+
+## Parameters
+
+* Parameters (`PReLUParameter prelu_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/PReLUParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/python.md b/docs/tutorial/layers/python.md
new file mode 100644
index 0000000..2e30b3a
--- /dev/null
+++ b/docs/tutorial/layers/python.md
@@ -0,0 +1,27 @@
+---
+title: Python Layer
+---
+
+# Python Layer
+
+* Layer type: `Python`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1PythonLayer.html)
+* Header: [`./include/caffe/layers/python_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/python_layer.hpp)
+
+The Python layer allows users to add customized layers without modifying the Caffe core code.
+
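+## Sample
+
+A minimal sketch of a prototxt definition that loads a Python layer; the module and class names follow the linked `pyloss.py` example, and the blob names are illustrative:
+
+      layer {
+        name: "loss"
+        type: "Python"
+        bottom: "pred"
+        bottom: "label"
+        top: "loss"
+        python_param {
+          module: "pyloss"             # the Python module (pyloss.py must be on PYTHONPATH)
+          layer: "EuclideanLossLayer"  # the layer class defined in that module
+        }
+        loss_weight: 1                 # treat the top blob as a loss
+      }
+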
+## Parameters
+
+* Parameters (`PythonParameter python_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/PythonParameter.txt %}
+{% endhighlight %}
+
+## Examples and tutorials
+
+* Simple Euclidean loss example
+    - [Python code](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/layers/pyloss.py)
+    - [Prototxt](https://github.com/BVLC/caffe/blob/master/examples/pycaffe/linreg.prototxt)
+* [Tutorial for writing Python layers with DIGITS](https://github.com/NVIDIA/DIGITS/tree/master/examples/python-layer)
diff --git a/docs/tutorial/layers/recurrent.md b/docs/tutorial/layers/recurrent.md
new file mode 100644
index 0000000..a882b72
--- /dev/null
+++ b/docs/tutorial/layers/recurrent.md
@@ -0,0 +1,20 @@
+---
+title: Recurrent Layer
+---
+
+# Recurrent Layer
+
+* Layer type: `Recurrent`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RecurrentLayer.html)
+* Header: [`./include/caffe/layers/recurrent_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/recurrent_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/recurrent_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/recurrent_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/recurrent_layer.cu)
+
+## Parameters
+
+* Parameters (`RecurrentParameter recurrent_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/RecurrentParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/reduction.md b/docs/tutorial/layers/reduction.md
new file mode 100644
index 0000000..db55414
--- /dev/null
+++ b/docs/tutorial/layers/reduction.md
@@ -0,0 +1,20 @@
+---
+title: Reduction Layer
+---
+
+# Reduction Layer
+
+* Layer type: `Reduction`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReductionLayer.html)
+* Header: [`./include/caffe/layers/reduction_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reduction_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/reduction_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/reduction_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reduction_layer.cu)
+
+## Parameters
+
+* Parameters (`ReductionParameter reduction_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ReductionParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/relu.md b/docs/tutorial/layers/relu.md
new file mode 100644
index 0000000..01aab0a
--- /dev/null
+++ b/docs/tutorial/layers/relu.md
@@ -0,0 +1,32 @@
+---
+title: ReLU / Rectified-Linear and Leaky-ReLU Layer
+---
+
+# ReLU / Rectified-Linear and Leaky-ReLU Layer
+
+* Layer type: `ReLU`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReLULayer.html)
+* Header: [`./include/caffe/layers/relu_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/relu_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/relu_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/relu_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/relu_layer.cu)
+* Sample (as seen in [`./models/bvlc_reference_caffenet/train_val.prototxt`](https://github.com/BVLC/caffe/blob/master/models/bvlc_reference_caffenet/train_val.prototxt))
+
+      layer {
+        name: "relu1"
+        type: "ReLU"
+        bottom: "conv1"
+        top: "conv1"
+      }
+
+Given an input value x, the `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function max(x, 0). It also supports in-place computation, meaning that the bottom and top blob may be the same, which reduces memory consumption.
+
+## Parameters
+
+* Parameters (`ReLUParameter relu_param`)
+    - Optional
+        - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0.
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ReLUParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/reshape.md b/docs/tutorial/layers/reshape.md
new file mode 100644
index 0000000..92d23f2
--- /dev/null
+++ b/docs/tutorial/layers/reshape.md
@@ -0,0 +1,51 @@
+---
+title: Reshape Layer
+---
+
+# Reshape Layer
+* Layer type: `Reshape`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ReshapeLayer.html)
+* Header: [`./include/caffe/layers/reshape_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/reshape_layer.hpp)
+* Implementation: [`./src/caffe/layers/reshape_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/reshape_layer.cpp)
+
+* Input
+    - a single blob with arbitrary dimensions
+* Output
+    - the same blob, with modified dimensions, as specified by `reshape_param`
+
+* Sample
+
+        layer {
+          name: "reshape"
+          type: "Reshape"
+          bottom: "input"
+          top: "output"
+          reshape_param {
+            shape {
+              dim: 0  # copy the dimension from below
+              dim: 2
+              dim: 3
+              dim: -1 # infer it from the other dimensions
+            }
+          }
+        }
+
+The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process.
+
+Output dimensions are specified by the `ReshapeParameter` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values:
+
+* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension.
+* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s reshape or `[]` in *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation.
+
+As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer.
+ 
+## Parameters
+
+* Parameters (`ReshapeParameter reshape_param`)
+    - Optional: (also see detailed description below)
+        - `shape`
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ReshapeParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/rnn.md b/docs/tutorial/layers/rnn.md
new file mode 100644
index 0000000..b6fcf47
--- /dev/null
+++ b/docs/tutorial/layers/rnn.md
@@ -0,0 +1,19 @@
+---
+title: RNN Layer
+---
+
+# RNN Layer
+
+* Layer type: `RNN`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1RNNLayer.html)
+* Header: [`./include/caffe/layers/rnn_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/rnn_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/rnn_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/rnn_layer.cpp)
+
+## Parameters
+
+* Parameters (`RecurrentParameter recurrent_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/RecurrentParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/scale.md b/docs/tutorial/layers/scale.md
new file mode 100644
index 0000000..0e27549
--- /dev/null
+++ b/docs/tutorial/layers/scale.md
@@ -0,0 +1,20 @@
+---
+title: Scale Layer
+---
+
+# Scale Layer
+
+* Layer type: `Scale`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ScaleLayer.html)
+* Header: [`./include/caffe/layers/scale_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/scale_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/scale_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/scale_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/scale_layer.cu)
+
+## Parameters
+
+* Parameters (`ScaleParameter scale_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ScaleParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md
new file mode 100644
index 0000000..5053183
--- /dev/null
+++ b/docs/tutorial/layers/sigmoid.md
@@ -0,0 +1,20 @@
+---
+title: Sigmoid Layer
+---
+
+# Sigmoid Layer
+
+* Layer type: `Sigmoid`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidLayer.html)
+* Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu)
+
+## Parameters
+
+* Parameters (`SigmoidParameter sigmoid_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SigmoidParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/sigmoidcrossentropyloss.md b/docs/tutorial/layers/sigmoidcrossentropyloss.md
new file mode 100644
index 0000000..a6e42ca
--- /dev/null
+++ b/docs/tutorial/layers/sigmoidcrossentropyloss.md
@@ -0,0 +1,13 @@
+---
+title: Sigmoid Cross-Entropy Loss Layer
+---
+
+# Sigmoid Cross-Entropy Loss Layer
+
+* Layer type: `SigmoidCrossEntropyLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SigmoidCrossEntropyLossLayer.html)
+* Header: [`./include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu)
+
+To-do.
diff --git a/docs/tutorial/layers/silence.md b/docs/tutorial/layers/silence.md
new file mode 100644
index 0000000..2c37a9c
--- /dev/null
+++ b/docs/tutorial/layers/silence.md
@@ -0,0 +1,23 @@
+---
+title: Silence Layer
+---
+
+# Silence Layer
+
+* Layer type: `Silence`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SilenceLayer.html)
+* Header: [`./include/caffe/layers/silence_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/silence_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/silence_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/silence_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/silence_layer.cu)
+
+Silences a blob, so that it is not printed: the layer simply consumes its bottom blobs and produces no top output, which keeps otherwise-unused blobs from being reported as outputs of the net.
+
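+A minimal sketch (assuming a `label` blob that no other layer consumes):
+
+      layer {
+        name: "silence_label"
+        type: "Silence"
+        bottom: "label"
+      }
+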
+## Parameters
+
+Does not take any parameters.
+
diff --git a/docs/tutorial/layers/slice.md b/docs/tutorial/layers/slice.md
new file mode 100644
index 0000000..a492f1e
--- /dev/null
+++ b/docs/tutorial/layers/slice.md
@@ -0,0 +1,42 @@
+---
+title: Slice Layer
+---
+
+# Slice Layer
+
+* Layer type: `Slice`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SliceLayer.html)
+* Header: [`./include/caffe/layers/slice_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/slice_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/slice_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/slice_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/slice_layer.cu)
+
+The `Slice` layer is a utility layer that slices an input blob into multiple output blobs along a given dimension (the `axis` parameter), at the given slice indices.
+
+* Sample
+
+      layer {
+        name: "slicer_label"
+        type: "Slice"
+        bottom: "label"
+        ## Example of label with shape N x 3 x 1 x 1
+        top: "label1"
+        top: "label2"
+        top: "label3"
+        slice_param {
+          axis: 1
+          slice_point: 1
+          slice_point: 2
+        }
+      }
+
+`axis` indicates the target axis; `slice_point` indicates indices in the selected dimension (the number of indices must be equal to the number of top blobs minus one). In the sample above, the N x 3 x 1 x 1 `label` blob is cut at channel indices 1 and 2, so `label1`, `label2`, and `label3` each come out with shape N x 1 x 1 x 1.
+
+## Parameters
+
+* Parameters (`SliceParameter slice_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SliceParameter.txt %}
+{% endhighlight %}
+
diff --git a/docs/tutorial/layers/softmax.md b/docs/tutorial/layers/softmax.md
new file mode 100644
index 0000000..e5d5342
--- /dev/null
+++ b/docs/tutorial/layers/softmax.md
@@ -0,0 +1,24 @@
+---
+title: Softmax Layer
+---
+
+# Softmax Layer
+
+* Layer type: `Softmax`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxLayer.html)
+* Header: [`./include/caffe/layers/softmax_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/softmax_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/softmax_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_layer.cu)
+
+## Parameters
+
+* Parameters (`SoftmaxParameter softmax_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SoftmaxParameter.txt %}
+{% endhighlight %}
+
+## See also
+
+* [Softmax loss layer](softmaxwithloss.html)
diff --git a/docs/tutorial/layers/softmaxwithloss.md b/docs/tutorial/layers/softmaxwithloss.md
new file mode 100644
index 0000000..d9a6774
--- /dev/null
+++ b/docs/tutorial/layers/softmaxwithloss.md
@@ -0,0 +1,33 @@
+---
+title: Softmax with Loss Layer
+---
+
+# Softmax with Loss Layer
+
+* Layer type: `SoftmaxWithLoss`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SoftmaxWithLossLayer.html)
+* Header: [`./include/caffe/layers/softmax_loss_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/softmax_loss_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/softmax_loss_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/softmax_loss_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/softmax_loss_layer.cu)
+
+The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient.
+
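+A typical classification sketch (the score and label blob names are hypothetical), taking raw class scores and integer labels as bottoms:
+
+      layer {
+        name: "loss"
+        type: "SoftmaxWithLoss"
+        bottom: "fc8"
+        bottom: "label"
+        top: "loss"
+      }
+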
+## Parameters
+
+* Parameters (`SoftmaxParameter softmax_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SoftmaxParameter.txt %}
+{% endhighlight %}
+
+* Parameters (`LossParameter loss_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/LossParameter.txt %}
+{% endhighlight %}
+
+## See also
+
+* [Softmax layer](softmax.html)
diff --git a/docs/tutorial/layers/split.md b/docs/tutorial/layers/split.md
new file mode 100644
index 0000000..4fb71d1
--- /dev/null
+++ b/docs/tutorial/layers/split.md
@@ -0,0 +1,17 @@
+---
+title: Split Layer
+---
+
+# Split Layer
+
+* Layer type: `Split`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SplitLayer.html)
+* Header: [`./include/caffe/layers/split_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/split_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/split_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/split_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/split_layer.cu)
+
+The `Split` layer is a utility layer that splits an input blob into multiple output blobs. This is used when a blob is fed into multiple subsequent layers.
+
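+A minimal sketch (blob names are hypothetical) duplicating one blob for two consumers:
+
+      layer {
+        name: "data_split"
+        type: "Split"
+        bottom: "data"
+        top: "data_copy1"
+        top: "data_copy2"
+      }
+
+In practice Caffe inserts `Split` layers automatically during net construction whenever a top blob is consumed by more than one layer, so this layer rarely needs to be written by hand.
+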
+## Parameters
+
+Does not take any parameters.
diff --git a/docs/tutorial/layers/spp.md b/docs/tutorial/layers/spp.md
new file mode 100644
index 0000000..26e5862
--- /dev/null
+++ b/docs/tutorial/layers/spp.md
@@ -0,0 +1,20 @@
+---
+title: Spatial Pyramid Pooling Layer
+---
+
+# Spatial Pyramid Pooling Layer
+
+* Layer type: `SPP`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1SPPLayer.html)
+* Header: [`./include/caffe/layers/spp_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/spp_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/spp_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/spp_layer.cpp)
+
+
+## Parameters
+
+* Parameters (`SPPParameter spp_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/SPPParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/tanh.md b/docs/tutorial/layers/tanh.md
new file mode 100644
index 0000000..3606345
--- /dev/null
+++ b/docs/tutorial/layers/tanh.md
@@ -0,0 +1,18 @@
+---
+title: TanH Layer
+---
+
+# TanH Layer
+
+* Layer type: `TanH`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1TanHLayer.html)
+* Header: [`./include/caffe/layers/tanh_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tanh_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/tanh_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/tanh_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tanh_layer.cu)
+
+## Parameters
+
+* Parameters (`TanHParameter tanh_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/TanHParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/threshold.md b/docs/tutorial/layers/threshold.md
new file mode 100644
index 0000000..819e9e6
--- /dev/null
+++ b/docs/tutorial/layers/threshold.md
@@ -0,0 +1,18 @@
+---
+title: Threshold Layer
+---
+
+# Threshold Layer
+
+* Layer type: `Threshold`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ThresholdLayer.html)
+* Header: [`./include/caffe/layers/threshold_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/threshold_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/threshold_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/threshold_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/threshold_layer.cu)
+
+## Parameters
+
+* Parameters (`ThresholdParameter threshold_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/ThresholdParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/tile.md b/docs/tutorial/layers/tile.md
new file mode 100644
index 0000000..ea03aaa
--- /dev/null
+++ b/docs/tutorial/layers/tile.md
@@ -0,0 +1,20 @@
+---
+title: Tile Layer
+---
+
+# Tile Layer
+
+* Layer type: `Tile`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1TileLayer.html)
+* Header: [`./include/caffe/layers/tile_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/tile_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/tile_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/tile_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/tile_layer.cu)
+
+## Parameters
+
+* Parameters (`TileParameter tile_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/TileParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/windowdata.md b/docs/tutorial/layers/windowdata.md
new file mode 100644
index 0000000..0cb4a8d
--- /dev/null
+++ b/docs/tutorial/layers/windowdata.md
@@ -0,0 +1,19 @@
+---
+title: WindowData Layer
+---
+
+# WindowData Layer
+
+* Layer type: `WindowData`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1WindowDataLayer.html)
+* Header: [`./include/caffe/layers/window_data_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/window_data_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/window_data_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/window_data_layer.cpp)
+
+## Parameters
+
+* Parameters (`WindowDataParameter window_data_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+{% highlight Protobuf %}
+{% include proto/WindowDataParameter.txt %}
+{% endhighlight %}
diff --git a/examples/02-fine-tuning.ipynb b/examples/02-fine-tuning.ipynb
index f44eaf9..422259d 100644
--- a/examples/02-fine-tuning.ipynb
+++ b/examples/02-fine-tuning.ipynb
@@ -70,7 +70,7 @@
     "\n",
     "- `get_ilsvrc_aux.sh` to download the ImageNet data mean, labels, etc.\n",
     "- `download_model_binary.py` to download the pretrained reference model\n",
-    "- `finetune_flickr_style/assemble_data.py` downloadsd the style training and testing data\n",
+    "- `finetune_flickr_style/assemble_data.py` downloads the style training and testing data\n",
     "\n",
     "We'll download just a small subset of the full dataset for this exercise: just 2000 of the 80K images, from 5 of the 20 style categories.  (To download the full dataset, set `full_dataset = True` in the cell below.)"
    ]
@@ -146,7 +146,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'\n",
+    "weights = os.path.join(caffe_root, 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')\n",
     "assert os.path.exists(weights)"
    ]
   },
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 663d736..a59e0df 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,7 +23,7 @@ foreach(source_file ${examples_srcs})
 
   if(UNIX OR APPLE)
     # Funny command to make tutorials work
-    # TODO: remove in future as soon as naming is standartaized everywhere
+    # TODO: remove in future as soon as naming is standardized everywhere
     set(__outname ${PROJECT_BINARY_DIR}/examples/${folder}/${name}${Caffe_POSTFIX})
     add_custom_command(TARGET ${name} POST_BUILD
                        COMMAND ln -sf "${__outname}" "${__outname}.bin")
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index af360ac..2f59471 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -220,6 +220,7 @@ class Blob {
   void set_cpu_data(Dtype* data);
   const int* gpu_shape() const;
   const Dtype* gpu_data() const;
+  void set_gpu_data(Dtype* data);
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
   Dtype* mutable_cpu_data();
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 3c6a076..4904d1d 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -158,11 +158,14 @@ class Caffe {
   // Search from start_id to the highest possible device ordinal,
   // return the ordinal of the first available device.
   static int FindDevice(const int start_id = 0);
-  // Parallel training info
+  // Parallel training
   inline static int solver_count() { return Get().solver_count_; }
   inline static void set_solver_count(int val) { Get().solver_count_ = val; }
-  inline static bool root_solver() { return Get().root_solver_; }
-  inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
+  inline static int solver_rank() { return Get().solver_rank_; }
+  inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
+  inline static bool multiprocess() { return Get().multiprocess_; }
+  inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; }
+  inline static bool root_solver() { return Get().solver_rank_ == 0; }
 
  protected:
 #ifndef CPU_ONLY
@@ -172,8 +175,11 @@ class Caffe {
   shared_ptr<RNG> random_generator_;
 
   Brew mode_;
+
+  // Parallel training
   int solver_count_;
-  bool root_solver_;
+  int solver_rank_;
+  bool multiprocess_;
 
  private:
   // The private constructor to avoid duplicate instantiation.
diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp
deleted file mode 100644
index 8ed5542..0000000
--- a/include/caffe/data_reader.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef CAFFE_DATA_READER_HPP_
-#define CAFFE_DATA_READER_HPP_
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/internal_thread.hpp"
-#include "caffe/util/blocking_queue.hpp"
-#include "caffe/util/db.hpp"
-
-namespace caffe {
-
-/**
- * @brief Reads data from a source to queues available to data layers.
- * A single reading thread is created per source, even if multiple solvers
- * are running in parallel, e.g. for multi-GPU training. This makes sure
- * databases are read sequentially, and that each solver accesses a different
- * subset of the database. Data is distributed to solvers in a round-robin
- * way to keep parallel training deterministic.
- */
-class DataReader {
- public:
-  explicit DataReader(const LayerParameter& param);
-  ~DataReader();
-
-  inline BlockingQueue<Datum*>& free() const {
-    return queue_pair_->free_;
-  }
-  inline BlockingQueue<Datum*>& full() const {
-    return queue_pair_->full_;
-  }
-
- protected:
-  // Queue pairs are shared between a body and its readers
-  class QueuePair {
-   public:
-    explicit QueuePair(int size);
-    ~QueuePair();
-
-    BlockingQueue<Datum*> free_;
-    BlockingQueue<Datum*> full_;
-
-  DISABLE_COPY_AND_ASSIGN(QueuePair);
-  };
-
-  // A single body is created per source
-  class Body : public InternalThread {
-   public:
-    explicit Body(const LayerParameter& param);
-    virtual ~Body();
-
-   protected:
-    void InternalThreadEntry();
-    void read_one(db::Cursor* cursor, QueuePair* qp);
-
-    const LayerParameter param_;
-    BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;
-
-    friend class DataReader;
-
-  DISABLE_COPY_AND_ASSIGN(Body);
-  };
-
-  // A source is uniquely identified by its layer name + path, in case
-  // the same database is read from two different locations in the net.
-  static inline string source_key(const LayerParameter& param) {
-    return param.name() + ":" + param.data_param().source();
-  }
-
-  const shared_ptr<QueuePair> queue_pair_;
-  shared_ptr<Body> body_;
-
-  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
-
-DISABLE_COPY_AND_ASSIGN(DataReader);
-};
-
-}  // namespace caffe
-
-#endif  // CAFFE_DATA_READER_HPP_
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 6a8c5a0..0ba6766 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -42,8 +42,8 @@ class InternalThread {
   bool must_stop();
 
  private:
-  void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
-      bool root_solver);
+  void entry(int device, Caffe::Brew mode, int rand_seed,
+      int solver_count, int solver_rank, bool multiprocess);
 
   shared_ptr<boost::thread> thread_;
 };
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 10f353f..30dbfd5 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -38,7 +38,7 @@ class Layer {
    * layer.
    */
   explicit Layer(const LayerParameter& param)
-    : layer_param_(param), is_shared_(false) {
+    : layer_param_(param) {
       // Set phase and copy blobs (if there are any).
       phase_ = param.phase();
       if (layer_param_.blobs_size() > 0) {
@@ -66,7 +66,6 @@ class Layer {
    */
   void SetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-    InitMutex();
     CheckBlobCounts(bottom, top);
     LayerSetUp(bottom, top);
     Reshape(bottom, top);
@@ -93,30 +92,6 @@ class Layer {
       const vector<Blob<Dtype>*>& top) {}
 
   /**
-   * @brief Whether a layer should be shared by multiple nets during data
-   *        parallelism. By default, all layers except for data layers should
-   *        not be shared. data layers should be shared to ensure each worker
-   *        solver access data sequentially during data parallelism.
-   */
-  virtual inline bool ShareInParallel() const { return false; }
-
-  /** @brief Return whether this layer is actually shared by other nets.
-   *         If ShareInParallel() is true and using more than one GPU and the
-   *         net has TRAIN phase, then this function is expected return true.
-   */
-  inline bool IsShared() const { return is_shared_; }
-
-  /** @brief Set whether this layer is actually shared by other nets
-   *         If ShareInParallel() is true and using more than one GPU and the
-   *         net has TRAIN phase, then is_shared should be set true.
-   */
-  inline void SetShared(bool is_shared) {
-    CHECK(ShareInParallel() || !is_shared)
-        << type() << "Layer does not support sharing.";
-    is_shared_ = is_shared;
-  }
-
-  /**
    * @brief Adjust the shapes of top blobs and internal buffers to accommodate
    *        the shapes of the bottom blobs.
    *
@@ -428,19 +403,6 @@ class Layer {
   }
 
  private:
-  /** Whether this layer is actually shared by other nets*/
-  bool is_shared_;
-
-  /** The mutex for sequential forward if this layer is shared */
-  shared_ptr<boost::mutex> forward_mutex_;
-
-  /** Initialize forward_mutex_ */
-  void InitMutex();
-  /** Lock forward_mutex_ if this layer is shared */
-  void Lock();
-  /** Unlock forward_mutex_ if this layer is shared */
-  void Unlock();
-
   DISABLE_COPY_AND_ASSIGN(Layer);
 };  // class Layer
 
@@ -450,8 +412,6 @@ class Layer {
 template <typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  // Lock during forward to ensure sequential forward
-  Lock();
   Dtype loss = 0;
   Reshape(bottom, top);
   switch (Caffe::mode()) {
@@ -482,7 +442,6 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
   default:
     LOG(FATAL) << "Unknown caffe mode.";
   }
-  Unlock();
   return loss;
 }
 
diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp
index 2c49b73..21d3ada 100644
--- a/include/caffe/layers/base_data_layer.hpp
+++ b/include/caffe/layers/base_data_layer.hpp
@@ -67,16 +67,14 @@ class BasePrefetchingDataLayer :
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
 
-  // Prefetches batches (asynchronously if to GPU memory)
-  static const int PREFETCH_COUNT = 3;
-
  protected:
   virtual void InternalThreadEntry();
   virtual void load_batch(Batch<Dtype>* batch) = 0;
 
-  Batch<Dtype> prefetch_[PREFETCH_COUNT];
+  vector<shared_ptr<Batch<Dtype> > > prefetch_;
   BlockingQueue<Batch<Dtype>*> prefetch_free_;
   BlockingQueue<Batch<Dtype>*> prefetch_full_;
+  Batch<Dtype>* prefetch_current_;
 
   Blob<Dtype> transformed_data_;
 };
diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp
index 6c36179..dec5818 100644
--- a/include/caffe/layers/data_layer.hpp
+++ b/include/caffe/layers/data_layer.hpp
@@ -4,7 +4,6 @@
 #include <vector>
 
 #include "caffe/blob.hpp"
-#include "caffe/data_reader.hpp"
 #include "caffe/data_transformer.hpp"
 #include "caffe/internal_thread.hpp"
 #include "caffe/layer.hpp"
@@ -29,9 +28,13 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
   virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
+  void Next();
+  bool Skip();
   virtual void load_batch(Batch<Dtype>* batch);
 
-  DataReader reader_;
+  shared_ptr<db::DB> db_;
+  shared_ptr<db::Cursor> cursor_;
+  uint64_t offset_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp
index b04cf8e..650a3fb 100644
--- a/include/caffe/layers/hdf5_data_layer.hpp
+++ b/include/caffe/layers/hdf5_data_layer.hpp
@@ -23,7 +23,7 @@ template <typename Dtype>
 class HDF5DataLayer : public Layer<Dtype> {
  public:
   explicit HDF5DataLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
+      : Layer<Dtype>(param), offset_() {}
   virtual ~HDF5DataLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
@@ -38,6 +38,9 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual inline int MinTopBlobs() const { return 1; }
 
  protected:
+  void Next();
+  bool Skip();
+
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
@@ -55,6 +58,7 @@ class HDF5DataLayer : public Layer<Dtype> {
   std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
   std::vector<unsigned int> data_permutation_;
   std::vector<unsigned int> file_permutation_;
+  uint64_t offset_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp
index 66dbbdf..10c4bfd 100644
--- a/include/caffe/layers/python_layer.hpp
+++ b/include/caffe/layers/python_layer.hpp
@@ -21,8 +21,8 @@ class PythonLayer : public Layer<Dtype> {
     // Disallow PythonLayer in MultiGPU training stage, due to GIL issues
     // Details: https://github.com/BVLC/caffe/issues/2936
     if (this->phase_ == TRAIN && Caffe::solver_count() > 1
-        && !ShareInParallel()) {
-      LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training";
+        && !Caffe::multiprocess()) {
+      LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py";
     }
     self_.attr("param_str") = bp::str(
         this->layer_param_.python_param().param_str());
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 493bdf2..d3c9306 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -23,10 +23,9 @@ namespace caffe {
 template <typename Dtype>
 class Net {
  public:
-  explicit Net(const NetParameter& param, const Net* root_net = NULL);
+  explicit Net(const NetParameter& param);
   explicit Net(const string& param_file, Phase phase,
-      const int level = 0, const vector<string>* stages = NULL,
-      const Net* root_net = NULL);
+      const int level = 0, const vector<string>* stages = NULL);
   virtual ~Net() {}
 
   /// @brief Initialize a network with a NetParameter.
@@ -228,6 +227,31 @@ class Net {
   static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
       const string& layer_name);
 
+  // Invoked at specific points during an iteration
+  class Callback {
+   protected:
+    virtual void run(int layer) = 0;
+
+    template <typename T>
+    friend class Net;
+  };
+  const vector<Callback*>& before_forward() const { return before_forward_; }
+  void add_before_forward(Callback* value) {
+    before_forward_.push_back(value);
+  }
+  const vector<Callback*>& after_forward() const { return after_forward_; }
+  void add_after_forward(Callback* value) {
+    after_forward_.push_back(value);
+  }
+  const vector<Callback*>& before_backward() const { return before_backward_; }
+  void add_before_backward(Callback* value) {
+    before_backward_.push_back(value);
+  }
+  const vector<Callback*>& after_backward() const { return after_backward_; }
+  void add_after_backward(Callback* value) {
+    after_backward_.push_back(value);
+  }
+
  protected:
   // Helpers for Init.
   /// @brief Append a new top blob to the net.
@@ -306,9 +330,13 @@ class Net {
   size_t memory_used_;
   /// Whether to compute and display debug info for the net.
   bool debug_info_;
-  /// The root net that actually holds the shared layers in data parallelism
-  const Net* const root_net_;
-  DISABLE_COPY_AND_ASSIGN(Net);
+  // Callbacks
+  vector<Callback*> before_forward_;
+  vector<Callback*> after_forward_;
+  vector<Callback*> before_backward_;
+  vector<Callback*> after_backward_;
+
+DISABLE_COPY_AND_ASSIGN(Net);
 };
 
 
diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp
index 6c496c8..64bb48e 100644
--- a/include/caffe/parallel.hpp
+++ b/include/caffe/parallel.hpp
@@ -1,8 +1,11 @@
 #ifndef CAFFE_PARALLEL_HPP_
 #define CAFFE_PARALLEL_HPP_
 
-#include <boost/date_time/posix_time/posix_time.hpp>
+#ifdef USE_NCCL
 
+#include <boost/thread.hpp>
+
+#include <string>
 #include <vector>
 
 #include "caffe/blob.hpp"
@@ -13,6 +16,7 @@
 #include "caffe/solver.hpp"
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/blocking_queue.hpp"
+#include "caffe/util/nccl.hpp"
 
 namespace caffe {
 
@@ -51,7 +55,7 @@ class GPUParams : public Params<Dtype> {
   GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device);
   virtual ~GPUParams();
 
-  void configure(Solver<Dtype>* solver) const;
+  void Configure(Solver<Dtype>* solver) const;
 
  protected:
   using Params<Dtype>::size_;
@@ -59,58 +63,55 @@ class GPUParams : public Params<Dtype> {
   using Params<Dtype>::diff_;
 };
 
-class DevicePair {
- public:
-  DevicePair(int parent, int device)
-      : parent_(parent),
-        device_(device) {
-  }
-  inline int parent() {
-    return parent_;
-  }
-  inline int device() {
-    return device_;
-  }
-
-  // Group GPUs in pairs, by proximity depending on machine's topology
-  static void compute(const vector<int> devices, vector<DevicePair>* pairs);
-
- protected:
-  int parent_;
-  int device_;
-};
-
-// Synchronous data parallelism using map-reduce between local GPUs.
 template<typename Dtype>
-class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
-    public InternalThread {
+class NCCL : public GPUParams<Dtype>,
+             public Solver<Dtype>::Callback,
+             public Net<Dtype>::Callback {
  public:
-  explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
-                   P2PSync<Dtype>* parent, const SolverParameter& param);
-  virtual ~P2PSync();
-
-  inline const shared_ptr<Solver<Dtype> >& solver() const {
-    return solver_;
-  }
-
-  void Run(const vector<int>& gpus);
-  void Prepare(const vector<int>& gpus,
-               vector<shared_ptr<P2PSync<Dtype> > >* syncs);
-  inline const int initial_iter() const { return initial_iter_; }
+  /**
+   * Single process version.
+   */
+  explicit NCCL(shared_ptr<Solver<Dtype> > solver);
+  /**
+   * In multi-process settings, first create a NCCL id (new_uid), then
+   * pass it to each process to create connected instances.
+   */
+  NCCL(shared_ptr<Solver<Dtype> > solver, const string& uid);
+  ~NCCL();
+
+  boost::barrier* barrier();
+  void set_barrier(boost::barrier* value);
+
+  /**
+   * In single process settings, create instances without uids and
+   * call this to connect them.
+   */
+  static void InitSingleProcess(vector<NCCL<Dtype>*>* nccls);
+
+  static string new_uid();
+
+  /**
+   * Broadcast weights from rank 0 to other solvers.
+   */
+  void Broadcast();
+
+  /**
+   * Single process multi-GPU.
+   */
+  void Run(const vector<int>& gpus, const char* restore);
 
  protected:
-  void on_start();
+  void Init();
+  void on_start() {}
+  void run(int layer);  // Net callback
   void on_gradients_ready();
 
-  void InternalThreadEntry();
+  ncclComm_t comm_;
+  cudaStream_t stream_;
 
-  P2PSync<Dtype>* parent_;
-  vector<P2PSync<Dtype>*> children_;
-  BlockingQueue<P2PSync<Dtype>*> queue_;
-  const int initial_iter_;
-  Dtype* parent_grads_;
   shared_ptr<Solver<Dtype> > solver_;
-
+  // Should not be necessary, https://github.com/NVIDIA/nccl/issues/37
+  boost::barrier* barrier_;
   using Params<Dtype>::size_;
   using Params<Dtype>::data_;
   using Params<Dtype>::diff_;
@@ -118,4 +119,5 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
 
 }  // namespace caffe
 
-#endif
+#endif  // USE_NCCL
+#endif  // header
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index eafcee3..a28d8cb 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -6,6 +6,7 @@
 
 #include "caffe/net.hpp"
 #include "caffe/solver_factory.hpp"
+#include "caffe/util/benchmark.hpp"
 
 namespace caffe {
 
@@ -40,9 +41,8 @@ typedef boost::function<SolverAction::Enum()> ActionCallback;
 template <typename Dtype>
 class Solver {
  public:
-  explicit Solver(const SolverParameter& param,
-      const Solver* root_solver = NULL);
-  explicit Solver(const string& param_file, const Solver* root_solver = NULL);
+  explicit Solver(const SolverParameter& param);
+  explicit Solver(const string& param_file);
   void Init(const SolverParameter& param);
   void InitTrainNet();
   void InitTestNets();
@@ -72,7 +72,7 @@ class Solver {
   inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
     return test_nets_;
   }
-  int iter() { return iter_; }
+  int iter() const { return iter_; }
 
   // Invoked at specific points during an iteration
   class Callback {
@@ -118,10 +118,6 @@ class Solver {
   vector<Dtype> losses_;
   Dtype smoothed_loss_;
 
-  // The root solver that holds root nets (actually containing shared layers)
-  // in data parallelism
-  const Solver* const root_solver_;
-
   // A function that can be set by a client of the Solver to provide indication
   // that it wants a snapshot saved and/or to exit early.
   ActionCallback action_request_function_;
@@ -129,31 +125,11 @@ class Solver {
   // True iff a request to stop early was received.
   bool requested_early_exit_;
 
-  DISABLE_COPY_AND_ASSIGN(Solver);
-};
+  // Timing information, handy to tune e.g. the number of GPUs
+  Timer iteration_timer_;
+  float iterations_last_;
 
-/**
- * @brief Solver that only computes gradients, used as worker
- *        for multi-GPU training.
- */
-template <typename Dtype>
-class WorkerSolver : public Solver<Dtype> {
- public:
-  explicit WorkerSolver(const SolverParameter& param,
-      const Solver<Dtype>* root_solver = NULL)
-      : Solver<Dtype>(param, root_solver) {}
-
- protected:
-  void ApplyUpdate() {}
-  void SnapshotSolverState(const string& model_filename) {
-    LOG(FATAL) << "Should not be called on worker solver.";
-  }
-  void RestoreSolverStateFromBinaryProto(const string& state_file) {
-    LOG(FATAL) << "Should not be called on worker solver.";
-  }
-  void RestoreSolverStateFromHDF5(const string& state_file) {
-    LOG(FATAL) << "Should not be called on worker solver.";
-  }
+  DISABLE_COPY_AND_ASSIGN(Solver);
 };
 
 }  // namespace caffe
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 38ee466..317ce29 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -3,6 +3,10 @@
 
 #include <cstdlib>
 
+#ifdef USE_MKL
+  #include "mkl.h"
+#endif
+
 #include "caffe/common.hpp"
 
 namespace caffe {
@@ -20,7 +24,11 @@ inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
     return;
   }
 #endif
+#ifdef USE_MKL
+  *ptr = mkl_malloc(size ? size:1, 64);
+#else
   *ptr = malloc(size);
+#endif
   *use_cuda = false;
   CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
@@ -32,7 +40,11 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) {
     return;
   }
 #endif
+#ifdef USE_MKL
+  mkl_free(ptr);
+#else
   free(ptr);
+#endif
 }
 
 
@@ -44,14 +56,8 @@ inline void CaffeFreeHost(void* ptr, bool use_cuda) {
  */
 class SyncedMemory {
  public:
-  SyncedMemory()
-      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
-        gpu_device_(-1) {}
-  explicit SyncedMemory(size_t size)
-      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
-        gpu_device_(-1) {}
+  SyncedMemory();
+  explicit SyncedMemory(size_t size);
   ~SyncedMemory();
   const void* cpu_data();
   void set_cpu_data(void* data);
@@ -68,6 +74,8 @@ class SyncedMemory {
 #endif
 
  private:
+  void check_device();
+
   void to_cpu();
   void to_gpu();
   void* cpu_ptr_;
@@ -77,7 +85,7 @@ class SyncedMemory {
   bool own_cpu_data_;
   bool cpu_malloc_use_cuda_;
   bool own_gpu_data_;
-  int gpu_device_;
+  int device_;
 
   DISABLE_COPY_AND_ASSIGN(SyncedMemory);
 };  // class SyncedMemory
diff --git a/include/caffe/util/db_leveldb.hpp b/include/caffe/util/db_leveldb.hpp
index e9fa0d3..4cdb6db 100644
--- a/include/caffe/util/db_leveldb.hpp
+++ b/include/caffe/util/db_leveldb.hpp
@@ -14,7 +14,10 @@ namespace caffe { namespace db {
 class LevelDBCursor : public Cursor {
  public:
   explicit LevelDBCursor(leveldb::Iterator* iter)
-    : iter_(iter) { SeekToFirst(); }
+    : iter_(iter) {
+    SeekToFirst();
+    CHECK(iter_->status().ok()) << iter_->status().ToString();
+  }
   ~LevelDBCursor() { delete iter_; }
   virtual void SeekToFirst() { iter_->SeekToFirst(); }
   virtual void Next() { iter_->Next(); }
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 6f6d3fe..51068fe 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -185,6 +185,11 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
 template <typename Dtype>
 void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cudaStream_t str);
+#endif
+
 template <typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
diff --git a/include/caffe/util/nccl.hpp b/include/caffe/util/nccl.hpp
new file mode 100644
index 0000000..e01fb74
--- /dev/null
+++ b/include/caffe/util/nccl.hpp
@@ -0,0 +1,37 @@
+#ifndef CAFFE_UTIL_NCCL_H_
+#define CAFFE_UTIL_NCCL_H_
+#ifdef USE_NCCL
+
+#include <nccl.h>
+
+#include "caffe/common.hpp"
+
+#define NCCL_CHECK(condition) \
+{ \
+  ncclResult_t result = condition; \
+  CHECK_EQ(result, ncclSuccess) << " " \
+    << ncclGetErrorString(result); \
+}
+
+namespace caffe {
+
+namespace nccl {
+
+template <typename Dtype> class dataType;
+
+template<> class dataType<float> {
+ public:
+  static const ncclDataType_t type = ncclFloat;
+};
+template<> class dataType<double> {
+ public:
+  static const ncclDataType_t type = ncclDouble;
+};
+
+}  // namespace nccl
+
+}  // namespace caffe
+
+#endif  // end USE_NCCL
+
+#endif  // CAFFE_UTIL_NCCL_H_
diff --git a/models/bvlc_googlenet/train_val.prototxt b/models/bvlc_googlenet/train_val.prototxt
old mode 100644
new mode 100755
index 5dee3ab..5fe367f
--- a/models/bvlc_googlenet/train_val.prototxt
+++ b/models/bvlc_googlenet/train_val.prototxt
@@ -1692,7 +1692,7 @@ layer {
   type: "SoftmaxWithLoss"
   bottom: "loss2/classifier"
   bottom: "label"
-  top: "loss2/loss1"
+  top: "loss2/loss2"
   loss_weight: 0.3
 }
 layer {
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index 35868a4..43a0c49 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,5 @@
-from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
-from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed
+from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer
+from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess
 from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index bdee75a..3589e47 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -51,6 +51,19 @@ const int NPY_DTYPE = NPY_FLOAT32;
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
 void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
 
+void InitLog(int level) {
+  FLAGS_logtostderr = 1;
+  FLAGS_minloglevel = level;
+  ::google::InitGoogleLogging("");
+  ::google::InstallFailureSignalHandler();
+}
+void InitLogInfo() {
+  InitLog(google::INFO);
+}
+void Log(const string& s) {
+  LOG(INFO) << s;
+}
+
 void set_random_seed(unsigned int seed) { Caffe::set_random_seed(seed); }
 
 // For convenience, check that input files can be opened, and raise an
@@ -254,12 +267,12 @@ bp::object BlobVec_add_blob(bp::tuple args, bp::dict kwargs) {
 }
 
 template<typename Dtype>
-class PythonCallback: public Solver<Dtype>::Callback {
+class SolverCallback: public Solver<Dtype>::Callback {
  protected:
   bp::object on_start_, on_gradients_ready_;
 
  public:
-  PythonCallback(bp::object on_start, bp::object on_gradients_ready)
+  SolverCallback(bp::object on_start, bp::object on_gradients_ready)
     : on_start_(on_start), on_gradients_ready_(on_gradients_ready) { }
   virtual void on_gradients_ready() {
     on_gradients_ready_();
@@ -271,9 +284,61 @@ class PythonCallback: public Solver<Dtype>::Callback {
 template<typename Dtype>
 void Solver_add_callback(Solver<Dtype> * solver, bp::object on_start,
   bp::object on_gradients_ready) {
-  solver->add_callback(new PythonCallback<Dtype>(on_start, on_gradients_ready));
+  solver->add_callback(new SolverCallback<Dtype>(on_start, on_gradients_ready));
+}
+
+// Seems boost cannot call the base method directly
+void Solver_add_nccl(SGDSolver<Dtype>* solver
+#ifdef USE_NCCL
+  , NCCL<Dtype>* nccl
+#endif
+) {
+#ifdef USE_NCCL
+  solver->add_callback(nccl);
+#endif
 }
 
+template<typename Dtype>
+class NetCallback: public Net<Dtype>::Callback {
+ public:
+  explicit NetCallback(bp::object run) : run_(run) {}
+
+ protected:
+  virtual void run(int layer) {
+    run_(layer);
+  }
+  bp::object run_;
+};
+void Net_before_forward(Net<Dtype>* net, bp::object run) {
+  net->add_before_forward(new NetCallback<Dtype>(run));
+}
+void Net_after_forward(Net<Dtype>* net, bp::object run) {
+  net->add_after_forward(new NetCallback<Dtype>(run));
+}
+void Net_before_backward(Net<Dtype>* net, bp::object run) {
+  net->add_before_backward(new NetCallback<Dtype>(run));
+}
+void Net_after_backward(Net<Dtype>* net, bp::object run) {
+  net->add_after_backward(new NetCallback<Dtype>(run));
+}
+
+void Net_add_nccl(Net<Dtype>* net
+#ifdef USE_NCCL
+  , NCCL<Dtype>* nccl
+#endif
+) {
+#ifdef USE_NCCL
+  net->add_after_backward(nccl);
+#endif
+}
+#ifndef USE_NCCL
+template<typename Dtype>
+class NCCL {
+ public:
+  NCCL(shared_ptr<Solver<Dtype> > solver, const string& uid) {}
+};
+#endif
+
 BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
 
 BOOST_PYTHON_MODULE(_caffe) {
@@ -283,10 +348,18 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::scope().attr("__version__") = AS_STRING(CAFFE_VERSION);
 
   // Caffe utility functions
+  bp::def("init_log", &InitLog);
+  bp::def("init_log", &InitLogInfo);
+  bp::def("log", &Log);
   bp::def("set_mode_cpu", &set_mode_cpu);
   bp::def("set_mode_gpu", &set_mode_gpu);
   bp::def("set_random_seed", &set_random_seed);
   bp::def("set_device", &Caffe::SetDevice);
+  bp::def("solver_count", &Caffe::solver_count);
+  bp::def("set_solver_count", &Caffe::set_solver_count);
+  bp::def("solver_rank", &Caffe::solver_rank);
+  bp::def("set_solver_rank", &Caffe::set_solver_rank);
+  bp::def("set_multiprocess", &Caffe::set_multiprocess);
 
   bp::def("layer_type_list", &LayerRegistry<Dtype>::LayerTypeList);
 
@@ -330,7 +403,12 @@ BOOST_PYTHON_MODULE(_caffe) {
         bp::with_custodian_and_ward<1, 2, bp::with_custodian_and_ward<1, 3> >())
     .def("save", &Net_Save)
     .def("save_hdf5", &Net_SaveHDF5)
-    .def("load_hdf5", &Net_LoadHDF5);
+    .def("load_hdf5", &Net_LoadHDF5)
+    .def("before_forward", &Net_before_forward)
+    .def("after_forward", &Net_after_forward)
+    .def("before_backward", &Net_before_backward)
+    .def("after_backward", &Net_after_backward)
+    .def("after_backward", &Net_add_nccl);
   BP_REGISTER_SHARED_PTR_TO_PYTHON(Net<Dtype>);
 
   bp::class_<Blob<Dtype>, shared_ptr<Blob<Dtype> >, boost::noncopyable>(
@@ -362,6 +440,10 @@ BOOST_PYTHON_MODULE(_caffe) {
     .add_property("type", bp::make_function(&Layer<Dtype>::type));
   BP_REGISTER_SHARED_PTR_TO_PYTHON(Layer<Dtype>);
 
+  bp::class_<SolverParameter>("SolverParameter", bp::no_init)
+    .add_property("max_iter", &SolverParameter::max_iter)
+    .add_property("display", &SolverParameter::display)
+    .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce);
   bp::class_<LayerParameter>("LayerParameter", bp::no_init);
 
   bp::class_<Solver<Dtype>, shared_ptr<Solver<Dtype> >, boost::noncopyable>(
@@ -371,11 +453,14 @@ BOOST_PYTHON_MODULE(_caffe) {
           bp::return_internal_reference<>()))
     .add_property("iter", &Solver<Dtype>::iter)
     .def("add_callback", &Solver_add_callback<Dtype>)
+    .def("add_callback", &Solver_add_nccl)
     .def("solve", static_cast<void (Solver<Dtype>::*)(const char*)>(
           &Solver<Dtype>::Solve), SolveOverloads())
     .def("step", &Solver<Dtype>::Step)
     .def("restore", &Solver<Dtype>::Restore)
-    .def("snapshot", &Solver<Dtype>::Snapshot);
+    .def("snapshot", &Solver<Dtype>::Snapshot)
+    .add_property("param", bp::make_function(&Solver<Dtype>::param,
+              bp::return_value_policy<bp::copy_const_reference>()));
   BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver<Dtype>);
 
   bp::class_<SGDSolver<Dtype>, bp::bases<Solver<Dtype> >,
@@ -419,6 +504,24 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::class_<vector<bool> >("BoolVec")
     .def(bp::vector_indexing_suite<vector<bool> >());
 
+  bp::class_<NCCL<Dtype>, shared_ptr<NCCL<Dtype> >,
+    boost::noncopyable>("NCCL",
+                        bp::init<shared_ptr<Solver<Dtype> >, const string&>())
+#ifdef USE_NCCL
+    .def("new_uid", &NCCL<Dtype>::new_uid).staticmethod("new_uid")
+    .def("bcast", &NCCL<Dtype>::Broadcast)
+#endif
+    /* NOLINT_NEXT_LINE(whitespace/semicolon) */
+  ;
+  BP_REGISTER_SHARED_PTR_TO_PYTHON(NCCL<Dtype>);
+
+  bp::class_<Timer, shared_ptr<Timer>, boost::noncopyable>(
+    "Timer", bp::init<>())
+    .def("start", &Timer::Start)
+    .def("stop", &Timer::Stop)
+    .add_property("ms", &Timer::MilliSeconds);
+  BP_REGISTER_SHARED_PTR_TO_PYTHON(Timer);
+
   // boost python expects a void (missing) return value, while import_array
   // returns NULL for python3. import_array1() forces a void return value.
   import_array1();
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index 5bae18d..1880381 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -11,7 +11,7 @@ except:
 import numpy as np
 
 from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
-        RMSPropSolver, AdaDeltaSolver, AdamSolver
+        RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer
 import caffe.io
 
 import six
diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py
index e109093..a0739fb 100644
--- a/python/caffe/test/test_net.py
+++ b/python/caffe/test/test_net.py
@@ -173,12 +173,12 @@ layer {
 """
 
     def setUp(self):
-        self.f = tempfile.NamedTemporaryFile(mode='w+')
+        self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
         self.f.write(self.TEST_NET)
-        self.f.flush()
+        self.f.close()
 
     def tearDown(self):
-        self.f.close()
+        os.remove(self.f.name)
 
     def check_net(self, net, blobs):
         net_blobs = [b for b in net.blobs.keys() if 'data' not in b]
@@ -238,12 +238,12 @@ layer {
 """
 
     def setUp(self):
-        self.f = tempfile.NamedTemporaryFile(mode='w+')
+        self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
         self.f.write(self.TEST_NET)
-        self.f.flush()
+        self.f.close()
 
     def tearDown(self):
-        self.f.close()
+        os.remove(self.f.name)
 
     def check_net(self, net, blobs):
         net_blobs = [b for b in net.blobs.keys() if 'data' not in b]
@@ -320,12 +320,12 @@ layer {
 """
 
     def setUp(self):
-        self.f = tempfile.NamedTemporaryFile(mode='w+')
+        self.f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
         self.f.write(self.TEST_NET)
-        self.f.flush()
+        self.f.close()
 
     def tearDown(self):
-        self.f.close()
+        os.remove(self.f.name)
 
     def check_net(self, net, outputs):
         self.assertEqual(list(net.blobs['data'].shape), [1,1,10,10])
diff --git a/python/train.py b/python/train.py
new file mode 100644
index 0000000..5897f5d
--- /dev/null
+++ b/python/train.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+"""
+Trains a model using one or more GPUs.
+"""
+from multiprocessing import Process
+
+import caffe
+
+
+def train(
+        solver,  # solver proto definition
+        snapshot,  # solver snapshot to restore
+        gpus,  # list of device ids
+        timing=False,  # show timing info for compute and communications
+):
+    # NCCL uses a uid to identify a session
+    uid = caffe.NCCL.new_uid()
+
+    caffe.init_log()
+    caffe.log('Using devices %s' % str(gpus))
+
+    procs = []
+    for rank in range(len(gpus)):
+        p = Process(target=solve,
+                    args=(solver, snapshot, gpus, timing, uid, rank))
+        p.daemon = True
+        p.start()
+        procs.append(p)
+    for p in procs:
+        p.join()
+
+
+def time(solver, nccl):
+    fprop = []
+    bprop = []
+    total = caffe.Timer()
+    allrd = caffe.Timer()
+    for _ in range(len(solver.net.layers)):
+        fprop.append(caffe.Timer())
+        bprop.append(caffe.Timer())
+    display = solver.param.display
+
+    def show_time():
+        if solver.iter % display == 0:
+            s = '\n'
+            for i in range(len(solver.net.layers)):
+                s += 'forw %3d %8s ' % (i, solver.net._layer_names[i])
+                s += ': %.2f\n' % fprop[i].ms
+            for i in range(len(solver.net.layers) - 1, -1, -1):
+                s += 'back %3d %8s ' % (i, solver.net._layer_names[i])
+                s += ': %.2f\n' % bprop[i].ms
+            s += 'solver total: %.2f\n' % total.ms
+            s += 'allreduce: %.2f\n' % allrd.ms
+            caffe.log(s)
+
+    solver.net.before_forward(lambda layer: fprop[layer].start())
+    solver.net.after_forward(lambda layer: fprop[layer].stop())
+    solver.net.before_backward(lambda layer: bprop[layer].start())
+    solver.net.after_backward(lambda layer: bprop[layer].stop())
+    solver.add_callback(lambda: total.start(), lambda: (total.stop(), allrd.start()))
+    solver.add_callback(nccl)
+    solver.add_callback(lambda: '', lambda: (allrd.stop(), show_time()))
+
+
+def solve(proto, snapshot, gpus, timing, uid, rank):
+    caffe.set_mode_gpu()
+    caffe.set_device(gpus[rank])
+    caffe.set_solver_count(len(gpus))
+    caffe.set_solver_rank(rank)
+    caffe.set_multiprocess(True)
+
+    solver = caffe.SGDSolver(proto)
+    if snapshot and len(snapshot) != 0:
+        solver.restore(snapshot)
+
+    nccl = caffe.NCCL(solver, uid)
+    nccl.bcast()
+
+    if timing and rank == 0:
+        time(solver, nccl)
+    else:
+        solver.add_callback(nccl)
+
+    if solver.param.layer_wise_reduce:
+        solver.net.after_backward(nccl)
+    solver.step(solver.param.max_iter)
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--solver", required=True, help="Solver proto definition.")
+    parser.add_argument("--snapshot", help="Solver snapshot to restore.")
+    parser.add_argument("--gpus", type=int, nargs='+', default=[0],
+                        help="List of device ids.")
+    parser.add_argument("--timing", action='store_true', help="Show timing info.")
+    args = parser.parse_args()
+
+    train(args.solver, args.snapshot, args.gpus, args.timing)
diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh
index 0e28bd7..4837587 100755
--- a/scripts/build_docs.sh
+++ b/scripts/build_docs.sh
@@ -12,6 +12,9 @@ cd $ROOT_DIR
 # Gather docs.
 scripts/gather_examples.sh
 
+# Split caffe.proto for inclusion by layer catalogue.
+scripts/split_caffe_proto.py
+
 # Generate developer docs.
 make docs
 
diff --git a/scripts/download_model_binary.py b/scripts/download_model_binary.py
index fcdbb5a..a72fd5d 100755
--- a/scripts/download_model_binary.py
+++ b/scripts/download_model_binary.py
@@ -3,10 +3,11 @@ import os
 import sys
 import time
 import yaml
-import urllib
 import hashlib
 import argparse
 
+from six.moves import urllib
+
 required_keys = ['caffemodel', 'caffemodel_url', 'sha1']
 
 
@@ -69,7 +70,7 @@ if __name__ == '__main__':
         sys.exit(0)
 
     # Download and verify model.
-    urllib.urlretrieve(
+    urllib.request.urlretrieve(
         frontmatter['caffemodel_url'], model_filename, reporthook)
     if not model_checks_out():
         print('ERROR: model did not download correctly! Run this again.')
diff --git a/scripts/split_caffe_proto.py b/scripts/split_caffe_proto.py
new file mode 100755
index 0000000..7e9dc3e
--- /dev/null
+++ b/scripts/split_caffe_proto.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import mmap
+import re
+import os
+import errno
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+
+# a regex to match the parameter definitions in caffe.proto
+r = re.compile(r'(?://.*\n)*message ([^ ]*) \{\n(?: .*\n|\n)*\}')
+
+# create directory to put caffe.proto fragments
+try:
+    os.mkdir(
+        os.path.join(script_path,
+                     '../docs/_includes/'))
+    os.mkdir(
+        os.path.join(script_path,
+                     '../docs/_includes/proto/'))
+except OSError as exception:
+    if exception.errno != errno.EEXIST:
+        raise
+
+caffe_proto_fn = os.path.join(
+    script_path,
+    '../src/caffe/proto/caffe.proto')
+
+with open(caffe_proto_fn, 'r') as fin:
+
+    for m in r.finditer(fin.read()):
+        fn = os.path.join(
+            script_path,
+            '../docs/_includes/proto/%s.txt' % m.group(1))
+        with open(fn, 'w') as fout:
+            fout.write(m.group(0))
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 4a34e4c..603e52f 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -89,6 +89,12 @@ const Dtype* Blob<Dtype>::cpu_data() const {
 template <typename Dtype>
 void Blob<Dtype>::set_cpu_data(Dtype* data) {
   CHECK(data);
+  // Make sure CPU and GPU sizes remain equal
+  size_t size = count_ * sizeof(Dtype);
+  if (data_->size() != size) {
+    data_.reset(new SyncedMemory(size));
+    diff_.reset(new SyncedMemory(size));
+  }
   data_->set_cpu_data(data);
 }
 
@@ -99,6 +105,18 @@ const Dtype* Blob<Dtype>::gpu_data() const {
 }
 
 template <typename Dtype>
+void Blob<Dtype>::set_gpu_data(Dtype* data) {
+  CHECK(data);
+  // Make sure CPU and GPU sizes remain equal
+  size_t size = count_ * sizeof(Dtype);
+  if (data_->size() != size) {
+    data_.reset(new SyncedMemory(size));
+    diff_.reset(new SyncedMemory(size));
+  }
+  data_->set_gpu_data(data);
+}
+
+template <typename Dtype>
 const Dtype* Blob<Dtype>::cpu_diff() const {
   CHECK(diff_);
   return (const Dtype*)diff_->cpu_data();
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index dee6816..4f6f9bc 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -53,7 +53,7 @@ void GlobalInit(int* pargc, char*** pargv) {
 
 Caffe::Caffe()
     : random_generator_(), mode_(Caffe::CPU),
-      solver_count_(1), root_solver_(true) { }
+      solver_count_(1), solver_rank_(0), multiprocess_(false) { }
 
 Caffe::~Caffe() { }
 
@@ -106,7 +106,8 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
     : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
-    mode_(Caffe::CPU), solver_count_(1), root_solver_(true) {
+    mode_(Caffe::CPU),
+    solver_count_(1), solver_rank_(0), multiprocess_(false) {
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp
deleted file mode 100644
index 9f019bb..0000000
--- a/src/caffe/data_reader.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#include <boost/thread.hpp>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/data_reader.hpp"
-#include "caffe/layers/data_layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-
-namespace caffe {
-
-using boost::weak_ptr;
-
-map<const string, weak_ptr<DataReader::Body> > DataReader::bodies_;
-static boost::mutex bodies_mutex_;
-
-DataReader::DataReader(const LayerParameter& param)
-    : queue_pair_(new QueuePair(  //
-        param.data_param().prefetch() * param.data_param().batch_size())) {
-  // Get or create a body
-  boost::mutex::scoped_lock lock(bodies_mutex_);
-  string key = source_key(param);
-  weak_ptr<Body>& weak = bodies_[key];
-  body_ = weak.lock();
-  if (!body_) {
-    body_.reset(new Body(param));
-    bodies_[key] = weak_ptr<Body>(body_);
-  }
-  body_->new_queue_pairs_.push(queue_pair_);
-}
-
-DataReader::~DataReader() {
-  string key = source_key(body_->param_);
-  body_.reset();
-  boost::mutex::scoped_lock lock(bodies_mutex_);
-  if (bodies_[key].expired()) {
-    bodies_.erase(key);
-  }
-}
-
-//
-
-DataReader::QueuePair::QueuePair(int size) {
-  // Initialize the free queue with requested number of datums
-  for (int i = 0; i < size; ++i) {
-    free_.push(new Datum());
-  }
-}
-
-DataReader::QueuePair::~QueuePair() {
-  Datum* datum;
-  while (free_.try_pop(&datum)) {
-    delete datum;
-  }
-  while (full_.try_pop(&datum)) {
-    delete datum;
-  }
-}
-
-//
-
-DataReader::Body::Body(const LayerParameter& param)
-    : param_(param),
-      new_queue_pairs_() {
-  StartInternalThread();
-}
-
-DataReader::Body::~Body() {
-  StopInternalThread();
-}
-
-void DataReader::Body::InternalThreadEntry() {
-  shared_ptr<db::DB> db(db::GetDB(param_.data_param().backend()));
-  db->Open(param_.data_param().source(), db::READ);
-  shared_ptr<db::Cursor> cursor(db->NewCursor());
-  vector<shared_ptr<QueuePair> > qps;
-  try {
-    int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1;
-
-    // To ensure deterministic runs, only start running once all solvers
-    // are ready. But solvers need to peek on one item during initialization,
-    // so read one item, then wait for the next solver.
-    for (int i = 0; i < solver_count; ++i) {
-      shared_ptr<QueuePair> qp(new_queue_pairs_.pop());
-      read_one(cursor.get(), qp.get());
-      qps.push_back(qp);
-    }
-    // Main loop
-    while (!must_stop()) {
-      for (int i = 0; i < solver_count; ++i) {
-        read_one(cursor.get(), qps[i].get());
-      }
-      // Check no additional readers have been created. This can happen if
-      // more than one net is trained at a time per process, whether single
-      // or multi solver. It might also happen if two data layers have same
-      // name and same source.
-      CHECK_EQ(new_queue_pairs_.size(), 0);
-    }
-  } catch (boost::thread_interrupted&) {
-    // Interrupted exception is expected on shutdown
-  }
-}
-
-void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) {
-  Datum* datum = qp->free_.pop();
-  // TODO deserialize in-place instead of copy?
-  datum->ParseFromString(cursor->value());
-  qp->full_.push(datum);
-
-  // go to the next iter
-  cursor->Next();
-  if (!cursor->valid()) {
-    DLOG(INFO) << "Restarting data prefetching from start.";
-    cursor->SeekToFirst();
-  }
-}
-
-}  // namespace caffe
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 7189d67..3012251 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -130,7 +130,7 @@ void DataTransformer<Dtype>::Transform(const Datum& datum,
 template<typename Dtype>
 void DataTransformer<Dtype>::Transform(const Datum& datum,
                                        Blob<Dtype>* transformed_blob) {
-  // If datum is encoded, decoded and transform the cv::image.
+  // If datum is encoded, decode and transform the cv::image.
   if (datum.encoded()) {
 #ifdef USE_OPENCV
     CHECK(!(param_.force_color() && param_.force_gray()))
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index 104884e..11de497 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -28,25 +28,27 @@ void InternalThread::StartInternalThread() {
   Caffe::Brew mode = Caffe::mode();
   int rand_seed = caffe_rng_rand();
   int solver_count = Caffe::solver_count();
-  bool root_solver = Caffe::root_solver();
+  int solver_rank = Caffe::solver_rank();
+  bool multiprocess = Caffe::multiprocess();
 
   try {
     thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode,
-          rand_seed, solver_count, root_solver));
+          rand_seed, solver_count, solver_rank, multiprocess));
   } catch (std::exception& e) {
     LOG(FATAL) << "Thread exception: " << e.what();
   }
 }
 
 void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
-    int solver_count, bool root_solver) {
+    int solver_count, int solver_rank, bool multiprocess) {
 #ifndef CPU_ONLY
   CUDA_CHECK(cudaSetDevice(device));
 #endif
   Caffe::set_mode(mode);
   Caffe::set_random_seed(rand_seed);
   Caffe::set_solver_count(solver_count);
-  Caffe::set_root_solver(root_solver);
+  Caffe::set_solver_rank(solver_rank);
+  Caffe::set_multiprocess(multiprocess);
 
   InternalThreadEntry();
 }
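
StartInternalThread() above now snapshots solver_rank and multiprocess on the launching thread and re-applies them in entry(), because Caffe keeps its mode, seed, and solver settings in per-thread state. A standalone illustration of why such settings do not carry over into a new thread automatically (illustrative names only, not Caffe's API):

    #include <iostream>
    #include <thread>

    // Per-thread setting, standing in for Caffe's thread-local singleton.
    thread_local int tls_solver_rank = 0;

    void entry(int solver_rank) {
      tls_solver_rank = solver_rank;         // mirrors Caffe::set_solver_rank()
      std::cout << "worker thread sees rank " << tls_solver_rank << '\n';
    }

    int main() {
      const int solver_rank = 3;             // snapshot taken on the launching thread
      std::thread t(entry, solver_rank);
      t.join();
      std::cout << "main thread still sees rank " << tls_solver_rank << '\n';  // prints 0
      return 0;
    }
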
diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp
index 3b91289..684ae88 100644
--- a/src/caffe/layer.cpp
+++ b/src/caffe/layer.cpp
@@ -1,27 +1,7 @@
-#include <boost/thread.hpp>
 #include "caffe/layer.hpp"
 
 namespace caffe {
 
-template <typename Dtype>
-void Layer<Dtype>::InitMutex() {
-  forward_mutex_.reset(new boost::mutex());
-}
-
-template <typename Dtype>
-void Layer<Dtype>::Lock() {
-  if (IsShared()) {
-    forward_mutex_->lock();
-  }
-}
-
-template <typename Dtype>
-void Layer<Dtype>::Unlock() {
-  if (IsShared()) {
-    forward_mutex_->unlock();
-  }
-}
-
 INSTANTIATE_CLASS(Layer);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 989319f..93a798f 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -36,9 +36,11 @@ template <typename Dtype>
 BasePrefetchingDataLayer<Dtype>::BasePrefetchingDataLayer(
     const LayerParameter& param)
     : BaseDataLayer<Dtype>(param),
-      prefetch_free_(), prefetch_full_() {
-  for (int i = 0; i < PREFETCH_COUNT; ++i) {
-    prefetch_free_.push(&prefetch_[i]);
+      prefetch_(param.data_param().prefetch()),
+      prefetch_free_(), prefetch_full_(), prefetch_current_() {
+  for (int i = 0; i < prefetch_.size(); ++i) {
+    prefetch_[i].reset(new Batch<Dtype>());
+    prefetch_free_.push(prefetch_[i].get());
   }
 }
 
@@ -46,22 +48,23 @@ template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   BaseDataLayer<Dtype>::LayerSetUp(bottom, top);
+
   // Before starting the prefetch thread, we make cpu_data and gpu_data
   // calls so that the prefetch thread does not accidentally make simultaneous
   // cudaMalloc calls when the main thread is running. In some GPUs this
   // seems to cause failures if we do not so.
-  for (int i = 0; i < PREFETCH_COUNT; ++i) {
-    prefetch_[i].data_.mutable_cpu_data();
+  for (int i = 0; i < prefetch_.size(); ++i) {
+    prefetch_[i]->data_.mutable_cpu_data();
     if (this->output_labels_) {
-      prefetch_[i].label_.mutable_cpu_data();
+      prefetch_[i]->label_.mutable_cpu_data();
     }
   }
 #ifndef CPU_ONLY
   if (Caffe::mode() == Caffe::GPU) {
-    for (int i = 0; i < PREFETCH_COUNT; ++i) {
-      prefetch_[i].data_.mutable_gpu_data();
+    for (int i = 0; i < prefetch_.size(); ++i) {
+      prefetch_[i]->data_.mutable_gpu_data();
       if (this->output_labels_) {
-        prefetch_[i].label_.mutable_gpu_data();
+        prefetch_[i]->label_.mutable_gpu_data();
       }
     }
   }
@@ -88,6 +91,9 @@ void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
 #ifndef CPU_ONLY
       if (Caffe::mode() == Caffe::GPU) {
         batch->data_.data().get()->async_gpu_push(stream);
+        if (this->output_labels_) {
+          batch->label_.data().get()->async_gpu_push(stream);
+        }
         CUDA_CHECK(cudaStreamSynchronize(stream));
       }
 #endif
@@ -106,22 +112,18 @@ void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
+  if (prefetch_current_) {
+    prefetch_free_.push(prefetch_current_);
+  }
+  prefetch_current_ = prefetch_full_.pop("Waiting for data");
   // Reshape to loaded data.
-  top[0]->ReshapeLike(batch->data_);
-  // Copy the data
-  caffe_copy(batch->data_.count(), batch->data_.cpu_data(),
-             top[0]->mutable_cpu_data());
-  DLOG(INFO) << "Prefetch copied";
+  top[0]->ReshapeLike(prefetch_current_->data_);
+  top[0]->set_cpu_data(prefetch_current_->data_.mutable_cpu_data());
   if (this->output_labels_) {
     // Reshape to loaded labels.
-    top[1]->ReshapeLike(batch->label_);
-    // Copy the labels.
-    caffe_copy(batch->label_.count(), batch->label_.cpu_data(),
-        top[1]->mutable_cpu_data());
+    top[1]->ReshapeLike(prefetch_current_->label_);
+    top[1]->set_cpu_data(prefetch_current_->label_.mutable_cpu_data());
   }
-
-  prefetch_free_.push(batch);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu
index 4056d36..64c621a 100644
--- a/src/caffe/layers/base_data_layer.cu
+++ b/src/caffe/layers/base_data_layer.cu
@@ -7,23 +7,18 @@ namespace caffe {
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
+  if (prefetch_current_) {
+    prefetch_free_.push(prefetch_current_);
+  }
+  prefetch_current_ = prefetch_full_.pop("Waiting for data");
   // Reshape to loaded data.
-  top[0]->ReshapeLike(batch->data_);
-  // Copy the data
-  caffe_copy(batch->data_.count(), batch->data_.gpu_data(),
-      top[0]->mutable_gpu_data());
+  top[0]->ReshapeLike(prefetch_current_->data_);
+  top[0]->set_gpu_data(prefetch_current_->data_.mutable_gpu_data());
   if (this->output_labels_) {
     // Reshape to loaded labels.
-    top[1]->ReshapeLike(batch->label_);
-    // Copy the labels.
-    caffe_copy(batch->label_.count(), batch->label_.gpu_data(),
-        top[1]->mutable_gpu_data());
+    top[1]->ReshapeLike(prefetch_current_->label_);
+    top[1]->set_gpu_data(prefetch_current_->label_.mutable_gpu_data());
   }
-  // Ensure the copy is synchronous wrt the host, so that the next batch isn't
-  // copied in meanwhile.
-  CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
-  prefetch_free_.push(batch);
 }
 
 INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer);
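
With the two Forward_* changes above, the top blobs now alias the prefetched batch via set_cpu_data()/set_gpu_data() instead of copying it, and the batch consumed on the previous call is recycled into the free queue. A toy, single-threaded sketch of that recycle-then-pop handoff (the real code uses BlockingQueue and a prefetch thread; everything below is illustrative):

    #include <cassert>
    #include <queue>
    #include <vector>

    struct Batch { std::vector<float> data; };

    struct Prefetcher {
      std::queue<Batch*> free_, full_;
      Batch* current_ = nullptr;

      const Batch* forward() {
        if (current_) free_.push(current_);  // hand the previous batch back
        assert(!full_.empty());              // the real code blocks: "Waiting for data"
        current_ = full_.front();
        full_.pop();
        return current_;                     // a top blob would alias this memory
      }
    };

    int main() {
      Prefetcher p;
      Batch a, b;
      p.full_.push(&a);
      p.full_.push(&b);
      p.forward();                           // consumes a
      p.forward();                           // recycles a into free_, consumes b
      return p.free_.front() == &a ? 0 : 1;
    }
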
diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp
index d36b61c..ef8c177 100644
--- a/src/caffe/layers/crop_layer.cpp
+++ b/src/caffe/layers/crop_layer.cpp
@@ -86,27 +86,25 @@ void CropLayer<Dtype>::crop_copy(const vector<Blob<Dtype>*>& bottom,
     }
   } else {
     // We are at the last dimensions, which is stored continuously in memory
-    for (int i = 0; i < top[0]->shape(cur_dim); ++i) {
-      // prepare index vector reduced(red) and with offsets(off)
-      std::vector<int> ind_red(cur_dim, 0);
-      std::vector<int> ind_off(cur_dim+1, 0);
-      for (int j = 0; j < cur_dim; ++j) {
-          ind_red[j] = indices[j];
-          ind_off[j] = indices[j] + offsets[j];
-      }
-      ind_off[cur_dim] = offsets[cur_dim];
-      // do the copy
-      if (is_forward) {
-        caffe_copy(top[0]->shape(cur_dim),
-            src_data + bottom[0]->offset(ind_off),
-            dest_data + top[0]->offset(ind_red));
-      } else {
-        // in the backwards pass the src_data is top_diff
-        // and the dest_data is bottom_diff
-        caffe_copy(top[0]->shape(cur_dim),
-            src_data + top[0]->offset(ind_red),
-            dest_data + bottom[0]->offset(ind_off));
-      }
+    // prepare index vector reduced(red) and with offsets(off)
+    std::vector<int> ind_red(cur_dim, 0);
+    std::vector<int> ind_off(cur_dim+1, 0);
+    for (int j = 0; j < cur_dim; ++j) {
+      ind_red[j] = indices[j];
+      ind_off[j] = indices[j] + offsets[j];
+    }
+    ind_off[cur_dim] = offsets[cur_dim];
+    // do the copy
+    if (is_forward) {
+      caffe_copy(top[0]->shape(cur_dim),
+          src_data + bottom[0]->offset(ind_off),
+          dest_data + top[0]->offset(ind_red));
+    } else {
+      // in the backwards pass the src_data is top_diff
+      // and the dest_data is bottom_diff
+      caffe_copy(top[0]->shape(cur_dim),
+          src_data + top[0]->offset(ind_red),
+          dest_data + bottom[0]->offset(ind_off));
     }
   }
 }
diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu
index 1ea1325..677077c 100644
--- a/src/caffe/layers/crop_layer.cu
+++ b/src/caffe/layers/crop_layer.cu
@@ -8,14 +8,12 @@ namespace caffe {
 // strides in the last two dimensions.
 template <typename Dtype>
 __global__ void copy_kernel(const int n, const int height, const int width,
-    const int src_outer_stride, const int src_inner_stride,
-    const int dest_outer_stride, const int dest_inner_stride,
+    const int src_inner_stride,
+    const int dest_inner_stride,
     const Dtype* src, Dtype* dest) {
   CUDA_KERNEL_LOOP(index, n) {
-    int src_start = index / height * src_outer_stride
-                  + index % height * src_inner_stride;
-    int dest_start = index / height * dest_outer_stride
-                   + index % height * dest_inner_stride;
+    int src_start = index * src_inner_stride;
+    int dest_start = index * dest_inner_stride;
     for (int i = 0; i < width; ++i) {
       dest[dest_start + i] = src[src_start + i];
     }
@@ -53,11 +51,7 @@ void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
     ind_off[cur_dim] = offsets[cur_dim];
     ind_off[cur_dim+1] = offsets[cur_dim+1];
     // Compute copy strides
-    const int src_outer_stride =
-        bottom[0]->shape(cur_dim)*bottom[0]->shape(cur_dim+1);
     const int src_inner_stride = bottom[0]->shape(cur_dim+1);
-    const int dest_outer_stride =
-        top[0]->shape(cur_dim)*top[0]->shape(cur_dim+1);
     const int dest_inner_stride = top[0]->shape(cur_dim+1);
 
     if (is_forward) {
@@ -68,8 +62,8 @@ void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
       // NOLINT_NEXT_LINE(whitespace/operators)
       copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
           lines, height, width,
-          src_outer_stride, src_inner_stride,
-          dest_outer_stride, dest_inner_stride,
+          src_inner_stride,
+          dest_inner_stride,
           bottom_data, top_data);
 
     } else {
@@ -80,8 +74,8 @@ void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
       // NOLINT_NEXT_LINE(whitespace/operators)
       copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
           lines, height, width,
-          dest_outer_stride, dest_inner_stride,
-          src_outer_stride, src_inner_stride,
+          dest_inner_stride,
+          src_inner_stride,
           top_diff, bottom_diff);
     }
   }
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 66e6301..0f1296b 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -14,7 +14,10 @@ namespace caffe {
 template <typename Dtype>
 DataLayer<Dtype>::DataLayer(const LayerParameter& param)
   : BasePrefetchingDataLayer<Dtype>(param),
-    reader_(param) {
+    offset_() {
+  db_.reset(db::GetDB(param.data_param().backend()));
+  db_->Open(param.data_param().source(), db::READ);
+  cursor_.reset(db_->NewCursor());
 }
 
 template <typename Dtype>
@@ -27,7 +30,8 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const int batch_size = this->layer_param_.data_param().batch_size();
   // Read a data point, and use it to initialize the top blob.
-  Datum& datum = *(reader_.full().peek());
+  Datum datum;
+  datum.ParseFromString(cursor_->value());
 
   // Use data_transformer to infer the expected blob shape from datum.
   vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
@@ -35,22 +39,44 @@ void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // Reshape top[0] and prefetch_data according to the batch_size.
   top_shape[0] = batch_size;
   top[0]->Reshape(top_shape);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].data_.Reshape(top_shape);
+  for (int i = 0; i < this->prefetch_.size(); ++i) {
+    this->prefetch_[i]->data_.Reshape(top_shape);
   }
-  LOG(INFO) << "output data size: " << top[0]->num() << ","
+  LOG_IF(INFO, Caffe::root_solver())
+      << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
       << top[0]->width();
   // label
   if (this->output_labels_) {
     vector<int> label_shape(1, batch_size);
     top[1]->Reshape(label_shape);
-    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-      this->prefetch_[i].label_.Reshape(label_shape);
+    for (int i = 0; i < this->prefetch_.size(); ++i) {
+      this->prefetch_[i]->label_.Reshape(label_shape);
     }
   }
 }
 
+template <typename Dtype>
+bool DataLayer<Dtype>::Skip() {
+  int size = Caffe::solver_count();
+  int rank = Caffe::solver_rank();
+  bool keep = (offset_ % size) == rank ||
+              // In test mode, only rank 0 runs, so avoid skipping
+              this->layer_param_.phase() == TEST;
+  return !keep;
+}
+
+template<typename Dtype>
+void DataLayer<Dtype>::Next() {
+  cursor_->Next();
+  if (!cursor_->valid()) {
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Restarting data prefetching from start.";
+    cursor_->SeekToFirst();
+  }
+  offset_++;
+}
+
 // This function is called on prefetch thread
 template<typename Dtype>
 void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
@@ -61,41 +87,41 @@ void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   CPUTimer timer;
   CHECK(batch->data_.count());
   CHECK(this->transformed_data_.count());
-
-  // Reshape according to the first datum of each batch
-  // on single input batches allows for inputs of varying dimension.
   const int batch_size = this->layer_param_.data_param().batch_size();
-  Datum& datum = *(reader_.full().peek());
-  // Use data_transformer to infer the expected blob shape from datum.
-  vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
-  this->transformed_data_.Reshape(top_shape);
-  // Reshape batch according to the batch_size.
-  top_shape[0] = batch_size;
-  batch->data_.Reshape(top_shape);
-
-  Dtype* top_data = batch->data_.mutable_cpu_data();
-  Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
 
-  if (this->output_labels_) {
-    top_label = batch->label_.mutable_cpu_data();
-  }
+  Datum datum;
   for (int item_id = 0; item_id < batch_size; ++item_id) {
     timer.Start();
-    // get a datum
-    Datum& datum = *(reader_.full().pop("Waiting for data"));
+    while (Skip()) {
+      Next();
+    }
+    datum.ParseFromString(cursor_->value());
     read_time += timer.MicroSeconds();
-    timer.Start();
+
+    if (item_id == 0) {
+      // Reshape according to the first datum of each batch
+      // on single input batches allows for inputs of varying dimension.
+      // Use data_transformer to infer the expected blob shape from datum.
+      vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
+      this->transformed_data_.Reshape(top_shape);
+      // Reshape batch according to the batch_size.
+      top_shape[0] = batch_size;
+      batch->data_.Reshape(top_shape);
+    }
+
     // Apply data transformations (mirror, scale, crop...)
+    timer.Start();
     int offset = batch->data_.offset(item_id);
+    Dtype* top_data = batch->data_.mutable_cpu_data();
     this->transformed_data_.set_cpu_data(top_data + offset);
     this->data_transformer_->Transform(datum, &(this->transformed_data_));
     // Copy label.
     if (this->output_labels_) {
+      Dtype* top_label = batch->label_.mutable_cpu_data();
       top_label[item_id] = datum.label();
     }
     trans_time += timer.MicroSeconds();
-
-    reader_.free().push(const_cast<Datum*>(&datum));
+    Next();
   }
   timer.Stop();
   batch_timer.Stop();
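
DataLayer now owns its DB cursor directly and shards work across solvers with Skip()/Next(): with solver_count ranks, rank r keeps only the records whose running offset satisfies offset % solver_count == r (the TEST phase keeps everything, since only rank 0 tests). A small standalone illustration of that round-robin rule:

    #include <iostream>

    int main() {
      const int solver_count = 2;            // number of training solver ranks
      for (int rank = 0; rank < solver_count; ++rank) {
        std::cout << "rank " << rank << " keeps:";
        for (int offset = 0; offset < 8; ++offset) {
          if (offset % solver_count == rank) std::cout << ' ' << offset;
        }
        std::cout << '\n';                   // rank 0: 0 2 4 6   rank 1: 1 3 5 7
      }
      return 0;
    }
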
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index c957451..b9a071c 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -125,27 +125,45 @@ void HDF5DataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 }
 
 template <typename Dtype>
+bool HDF5DataLayer<Dtype>::Skip() {
+  int size = Caffe::solver_count();
+  int rank = Caffe::solver_rank();
+  bool keep = (offset_ % size) == rank ||
+              // In test mode, only rank 0 runs, so avoid skipping
+              this->layer_param_.phase() == TEST;
+  return !keep;
+}
+
+template<typename Dtype>
+void HDF5DataLayer<Dtype>::Next() {
+  if (++current_row_ == hdf_blobs_[0]->shape(0)) {
+    if (num_files_ > 1) {
+      ++current_file_;
+      if (current_file_ == num_files_) {
+        current_file_ = 0;
+        if (this->layer_param_.hdf5_data_param().shuffle()) {
+          std::random_shuffle(file_permutation_.begin(),
+                              file_permutation_.end());
+        }
+        DLOG(INFO) << "Looping around to first file.";
+      }
+      LoadHDF5FileData(
+        hdf_filenames_[file_permutation_[current_file_]].c_str());
+    }
+    current_row_ = 0;
+    if (this->layer_param_.hdf5_data_param().shuffle())
+      std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+  }
+  offset_++;
+}
+
+template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        ++current_file_;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+  for (int i = 0; i < batch_size; ++i) {
+    while (Skip()) {
+      Next();
     }
     for (int j = 0; j < this->layer_param_.top_size(); ++j) {
       int data_dim = top[j]->count() / top[j]->shape(0);
@@ -153,6 +171,7 @@ void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
           &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
             * data_dim], &top[j]->mutable_cpu_data()[i * data_dim]);
     }
+    Next();
   }
 }
 
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu
index 595d223..33eebd4 100644
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -17,24 +17,9 @@ template <typename Dtype>
 void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   const int batch_size = this->layer_param_.hdf5_data_param().batch_size();
-  for (int i = 0; i < batch_size; ++i, ++current_row_) {
-    if (current_row_ == hdf_blobs_[0]->shape(0)) {
-      if (num_files_ > 1) {
-        current_file_ += 1;
-        if (current_file_ == num_files_) {
-          current_file_ = 0;
-          if (this->layer_param_.hdf5_data_param().shuffle()) {
-            std::random_shuffle(file_permutation_.begin(),
-                                file_permutation_.end());
-          }
-          DLOG(INFO) << "Looping around to first file.";
-        }
-        LoadHDF5FileData(
-            hdf_filenames_[file_permutation_[current_file_]].c_str());
-      }
-      current_row_ = 0;
-      if (this->layer_param_.hdf5_data_param().shuffle())
-        std::random_shuffle(data_permutation_.begin(), data_permutation_.end());
+  for (int i = 0; i < batch_size; ++i) {
+    while (Skip()) {
+      Next();
     }
     for (int j = 0; j < this->layer_param_.top_size(); ++j) {
       int data_dim = top[j]->count() / top[j]->shape(0);
@@ -42,6 +27,7 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
           &hdf_blobs_[j]->cpu_data()[data_permutation_[current_row_]
             * data_dim], &top[j]->mutable_gpu_data()[i * data_dim]);
     }
+    Next();
   }
 }
 
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 7ee7dc4..ec0fc5b 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -54,6 +54,11 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const unsigned int prefetch_rng_seed = caffe_rng_rand();
     prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
     ShuffleImages();
+  } else {
+    if (this->phase_ == TRAIN && Caffe::solver_rank() > 0 &&
+        this->layer_param_.image_data_param().rand_skip() == 0) {
+      LOG(WARNING) << "Shuffling or skipping recommended for multi-GPU";
+    }
   }
   LOG(INFO) << "A total of " << lines_.size() << " images.";
 
@@ -77,8 +82,8 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   const int batch_size = this->layer_param_.image_data_param().batch_size();
   CHECK_GT(batch_size, 0) << "Positive batch size required";
   top_shape[0] = batch_size;
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].data_.Reshape(top_shape);
+  for (int i = 0; i < this->prefetch_.size(); ++i) {
+    this->prefetch_[i]->data_.Reshape(top_shape);
   }
   top[0]->Reshape(top_shape);
 
@@ -88,8 +93,8 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].label_.Reshape(label_shape);
+  for (int i = 0; i < this->prefetch_.size(); ++i) {
+    this->prefetch_[i]->label_.Reshape(label_shape);
   }
 }
 
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index 103dd4b..1bf3760 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -173,8 +173,8 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_GT(crop_size, 0);
   const int batch_size = this->layer_param_.window_data_param().batch_size();
   top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i)
-    this->prefetch_[i].data_.Reshape(
+  for (int i = 0; i < this->prefetch_.size(); ++i)
+    this->prefetch_[i]->data_.Reshape(
         batch_size, channels, crop_size, crop_size);
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
@@ -183,8 +183,8 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
-    this->prefetch_[i].label_.Reshape(label_shape);
+  for (int i = 0; i < this->prefetch_.size(); ++i) {
+    this->prefetch_[i]->label_.Reshape(label_shape);
   }
 
   // data mean
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 644cb7e..aa9e8f2 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -22,16 +22,13 @@
 namespace caffe {
 
 template <typename Dtype>
-Net<Dtype>::Net(const NetParameter& param, const Net* root_net)
-    : root_net_(root_net) {
+Net<Dtype>::Net(const NetParameter& param) {
   Init(param);
 }
 
 template <typename Dtype>
 Net<Dtype>::Net(const string& param_file, Phase phase,
-    const int level, const vector<string>* stages,
-    const Net* root_net)
-    : root_net_(root_net) {
+    const int level, const vector<string>* stages) {
   NetParameter param;
   ReadNetParamsFromTextFileOrDie(param_file, &param);
   // Set phase, stages and level
@@ -47,8 +44,6 @@ Net<Dtype>::Net(const string& param_file, Phase phase,
 
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
-  CHECK(Caffe::root_solver() || root_net_)
-      << "root_net_ needs to be set for all non-root solvers";
   // Set phase from the state.
   phase_ = in_param.state().phase();
   // Filter layers based on their include/exclude rules and
@@ -74,9 +69,6 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   top_id_vecs_.resize(param.layer_size());
   bottom_need_backward_.resize(param.layer_size());
   for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
-    // For non-root solvers, whether this layer is shared from root_net_.
-    bool share_from_root = !Caffe::root_solver()
-        && root_net_->layers_[layer_id]->ShareInParallel();
     // Inherit phase from net if unset.
     if (!param.layer(layer_id).has_phase()) {
       param.mutable_layer(layer_id)->set_phase(phase_);
@@ -89,13 +81,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
           << "propagate_down param must be specified "
           << "either 0 or bottom_size times ";
     }
-    if (share_from_root) {
-      LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net";
-      layers_.push_back(root_net_->layers_[layer_id]);
-      layers_[layer_id]->SetShared(true);
-    } else {
-      layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
-    }
+    layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
     layer_names_.push_back(layer_param.name());
     LOG_IF(INFO, Caffe::root_solver())
         << "Creating Layer " << layer_param.name();
@@ -134,19 +120,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
       }
     }
     // After this layer is connected, set it up.
-    if (share_from_root) {
-      // Set up size of top blobs using root_net_
-      const vector<Blob<Dtype>*>& base_top = root_net_->top_vecs_[layer_id];
-      const vector<Blob<Dtype>*>& this_top = this->top_vecs_[layer_id];
-      for (int top_id = 0; top_id < base_top.size(); ++top_id) {
-        this_top[top_id]->ReshapeLike(*base_top[top_id]);
-        LOG(INFO) << "Created top blob " << top_id << " (shape: "
-            << this_top[top_id]->shape_string() <<  ") for shared layer "
-            << layer_param.name();
-      }
-    } else {
-      layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
-    }
+    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
     LOG_IF(INFO, Caffe::root_solver())
         << "Setting up " << layer_names_[layer_id];
     for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
@@ -546,10 +520,15 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
   CHECK_LT(end, layers_.size());
   Dtype loss = 0;
   for (int i = start; i <= end; ++i) {
-    // LOG(ERROR) << "Forwarding " << layer_names_[i];
+    for (int c = 0; c < before_forward_.size(); ++c) {
+      before_forward_[c]->run(i);
+    }
     Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
     loss += layer_loss;
     if (debug_info_) { ForwardDebugInfo(i); }
+    for (int c = 0; c < after_forward_.size(); ++c) {
+      after_forward_[c]->run(i);
+    }
   }
   return loss;
 }
@@ -591,11 +570,17 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
   CHECK_GE(end, 0);
   CHECK_LT(start, layers_.size());
   for (int i = start; i >= end; --i) {
+    for (int c = 0; c < before_backward_.size(); ++c) {
+      before_backward_[c]->run(i);
+    }
     if (layer_need_backward_[i]) {
       layers_[i]->Backward(
           top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
       if (debug_info_) { BackwardDebugInfo(i); }
     }
+    for (int c = 0; c < after_backward_.size(); ++c) {
+      after_backward_[c]->run(i);
+    }
   }
 }
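
ForwardFromTo() and BackwardFromTo() above gain per-layer callback hooks (before_forward_, after_forward_, before_backward_, after_backward_); the NCCL code later in this patch registers itself as an after-backward callback to reduce gradients layer by layer. A minimal sketch of the hook pattern (illustrative, not Caffe's Net interface):

    #include <iostream>
    #include <vector>

    struct Callback {
      virtual ~Callback() {}
      virtual void run(int layer) = 0;
    };

    struct PrintHook : Callback {
      void run(int layer) { std::cout << "after layer " << layer << '\n'; }
    };

    int main() {
      std::vector<Callback*> after_backward;
      PrintHook hook;
      after_backward.push_back(&hook);
      for (int i = 2; i >= 0; --i) {          // backward pass over three layers
        // ... the layer's Backward() would run here ...
        for (int c = 0; c < static_cast<int>(after_backward.size()); ++c) {
          after_backward[c]->run(i);          // e.g. layer-wise gradient reduction
        }
      }
      return 0;
    }
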
 
diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp
index 5bc41c6..d943391 100644
--- a/src/caffe/parallel.cpp
+++ b/src/caffe/parallel.cpp
@@ -1,16 +1,15 @@
-#ifndef CPU_ONLY
+#ifdef USE_NCCL
+
 #include <cuda_runtime.h>
-#endif
 #include <glog/logging.h>
 #include <stdio.h>
-
 #include <sstream>
 #include <string>
 #include <vector>
 
-#include "boost/thread.hpp"
 #include "caffe/caffe.hpp"
 #include "caffe/parallel.hpp"
+#include "caffe/sgd_solvers.hpp"
 
 namespace caffe {
 
@@ -68,15 +67,14 @@ static size_t total_size(const vector<Blob<Dtype>*>& params) {
 
 template<typename Dtype>
 Params<Dtype>::Params(shared_ptr<Solver<Dtype> > root_solver)
-    : size_(total_size<Dtype>(root_solver->net()->learnable_params())),
-      data_(),
-      diff_() {
+  : size_(total_size<Dtype>(root_solver->net()->learnable_params())),
+    data_(),
+    diff_() {
 }
 
 template<typename Dtype>
 GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
-    : Params<Dtype>(root_solver) {
-#ifndef CPU_ONLY
+  : Params<Dtype>(root_solver) {
   int initial_device;
   CUDA_CHECK(cudaGetDevice(&initial_device));
 
@@ -86,358 +84,288 @@ GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
 
   // Copy blob values
   const vector<Blob<Dtype>*>& net =
-      root_solver->net()->learnable_params();
+    root_solver->net()->learnable_params();
   apply_buffers(net, data_, size_, copy);
 
   CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype)));
   caffe_gpu_set(size_, Dtype(0), diff_);
 
   CUDA_CHECK(cudaSetDevice(initial_device));
-#else
-  NO_GPU;
-#endif
 }
 
 template<typename Dtype>
 GPUParams<Dtype>::~GPUParams() {
-#ifndef CPU_ONLY
   CUDA_CHECK(cudaFree(data_));
   CUDA_CHECK(cudaFree(diff_));
-#endif
 }
 
 template<typename Dtype>
-void GPUParams<Dtype>::configure(Solver<Dtype>* solver) const {
+void GPUParams<Dtype>::Configure(Solver<Dtype>* solver) const {
   const vector<Blob<Dtype>*>& net =
-      solver->net()->learnable_params();
+    solver->net()->learnable_params();
   apply_buffers(net, data_, size_, replace_gpu);
   apply_buffers(net, diff_, size_, replace_gpu_diff);
 }
 
-void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
-#ifndef CPU_ONLY
-  vector<int> remaining(devices);
-
-  // Depth for reduction tree
-  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));
-
-  // Group GPUs by board
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-      for (int j = i + 1; j < remaining.size(); ++j) {
-        cudaDeviceProp a, b;
-        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
-        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
-        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
-          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
-            pairs->push_back(DevicePair(remaining[i], remaining[j]));
-            DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
-            remaining.erase(remaining.begin() + j);
-            break;
-          }
-        }
-      }
-    }
-  }
-  ostringstream s;
-  for (int i = 0; i < remaining.size(); ++i) {
-    s << (i ? ", " : "") << remaining[i];
-  }
-  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();
-
-  // Group by P2P accessibility
-  remaining_depth = ceil(log2(remaining.size()));
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-      for (int j = i + 1; j < remaining.size(); ++j) {
-        int access;
-        CUDA_CHECK(
-            cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
-        if (access) {
-          pairs->push_back(DevicePair(remaining[i], remaining[j]));
-          DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
-          remaining.erase(remaining.begin() + j);
-          break;
-        }
-      }
-    }
-  }
-  s.str("");
-  for (int i = 0; i < remaining.size(); ++i) {
-    s << (i ? ", " : "") << remaining[i];
-  }
-  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();
-
-  // Group remaining
-  remaining_depth = ceil(log2(remaining.size()));
-  for (int d = 0; d < remaining_depth; ++d) {
-    for (int i = 0; i < remaining.size(); ++i) {
-      pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
-      DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
-                 << remaining[i + 1];
-      remaining.erase(remaining.begin() + i + 1);
-    }
-  }
+static int getDevice() {
+  int device = 0;
+  CUDA_CHECK(cudaGetDevice(&device));
+  return device;
+}
 
-  // Should only be the parent node remaining
-  CHECK_EQ(remaining.size(), 1);
+template<typename Dtype>
+NCCL<Dtype>::NCCL(shared_ptr<Solver<Dtype> > solver)
+  : GPUParams<Dtype>(solver, getDevice()),
+    comm_(), solver_(solver), barrier_() {
+  this->Configure(solver.get());
+  Init();
+}
 
-  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));
+template<typename Dtype>
+NCCL<Dtype>::NCCL(shared_ptr<Solver<Dtype> > solver, const string& uid)
+  : GPUParams<Dtype>(solver, getDevice()),
+    solver_(solver), barrier_() {
+  this->Configure(solver.get());
+  Caffe::set_multiprocess(true);
+  ncclUniqueId nccl_uid;
+  memcpy(&nccl_uid, &uid[0], NCCL_UNIQUE_ID_BYTES);  // NOLINT(caffe/alt_fn)
+  NCCL_CHECK(ncclCommInitRank(&comm_,
+                              Caffe::solver_count(),
+                              nccl_uid,
+                              Caffe::solver_rank()));
+  Init();
+}
 
-  CHECK(pairs->size() == devices.size());
-  for (int i = 0; i < pairs->size(); ++i) {
-    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
-    for (int j = i + 1; j < pairs->size(); ++j) {
-      CHECK((*pairs)[i].device() != (*pairs)[j].device());
-    }
+template<typename Dtype>
+void NCCL<Dtype>::Init() {
+  if (solver_->param().layer_wise_reduce()) {
+    CUDA_CHECK(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
   }
-#else
-  NO_GPU;
-#endif
 }
 
-//
-
 template<typename Dtype>
-P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
-                        P2PSync<Dtype>* parent, const SolverParameter& param)
-    : GPUParams<Dtype>(root_solver, param.device_id()),
-      parent_(parent),
-      children_(),
-      queue_(),
-      initial_iter_(root_solver->iter()),
-      solver_() {
-#ifndef CPU_ONLY
-  int initial_device;
-  CUDA_CHECK(cudaGetDevice(&initial_device));
-  const int self = param.device_id();
-  CUDA_CHECK(cudaSetDevice(self));
-
-  if (parent == NULL) {
-    solver_ = root_solver;
-  } else {
-    Caffe::set_root_solver(false);
-    solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
-    Caffe::set_root_solver(true);
+NCCL<Dtype>::~NCCL() {
+  if (solver_->param().layer_wise_reduce()) {
+    CUDA_CHECK(cudaStreamDestroy(stream_));
   }
-  this->configure(solver_.get());
-  solver_->add_callback(this);
-
-  if (parent) {
-    // Enable p2p access between devices
-    const int peer = parent->solver_->param().device_id();
-    int access;
-    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
-    if (access) {
-      CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
-    } else {
-      LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
-    }
-    // Allocate receiving buffer on parent
-    CUDA_CHECK(cudaSetDevice(peer));
-    CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
-    CUDA_CHECK(cudaSetDevice(self));
+  if (comm_) {
+    ncclCommDestroy(comm_);
   }
-
-  CUDA_CHECK(cudaSetDevice(initial_device));
-#else
-  NO_GPU;
-#endif
 }
 
 template<typename Dtype>
-P2PSync<Dtype>::~P2PSync() {
-#ifndef CPU_ONLY
-  int initial_device;
-  CUDA_CHECK(cudaGetDevice(&initial_device));
-  const int self = solver_->param().device_id();
-  CUDA_CHECK(cudaSetDevice(self));
-
-  if (parent_) {
-    CUDA_CHECK(cudaFree(parent_grads_));
-    const int peer = parent_->solver_->param().device_id();
-    int access;
-    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
-    if (access) {
-      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
-    }
-  }
-
-  CUDA_CHECK(cudaSetDevice(initial_device));
-#endif
+boost::barrier* NCCL<Dtype>::barrier() {
+  return barrier_;
+}
+template<typename Dtype>
+void NCCL<Dtype>::set_barrier(boost::barrier* value) {
+  barrier_ = value;
 }
 
 template<typename Dtype>
-void P2PSync<Dtype>::InternalThreadEntry() {
-  Caffe::SetDevice(solver_->param().device_id());
-  CHECK(Caffe::root_solver());
-  Caffe::set_root_solver(false);
-  // See if there is a defined seed and reset random state if so
-  if (solver_->param().random_seed() >= 0) {
-    // Fetch random seed and modulate by device ID to make sure
-    // everyone doesn't have the same seed.  We seem to have some
-    // solver instability if we have everyone with the same seed
-    Caffe::set_random_seed(
-        solver_->param().random_seed() + solver_->param().device_id());
+void NCCL<Dtype>::InitSingleProcess(vector<NCCL<Dtype>*>* nccls) {
+  ncclComm_t* comms = new ncclComm_t[nccls->size()];
+  int* gpu_list = new int[nccls->size()];
+  for (int i = 0; i < nccls->size(); ++i) {
+    gpu_list[i] = (*nccls)[i]->solver_->param().device_id();
+  }
+  NCCL_CHECK(ncclCommInitAll(comms, static_cast<int>(nccls->size()), gpu_list));
+  for (int i = 0; i < nccls->size(); ++i) {
+    (*nccls)[i]->comm_ = comms[i];
   }
-  solver_->Step(solver_->param().max_iter() - initial_iter_);
 }
 
 template<typename Dtype>
-void P2PSync<Dtype>::on_start() {
-#ifndef CPU_ONLY
-#ifdef DEBUG
-  int device;
-  CUDA_CHECK(cudaGetDevice(&device));
-  CHECK(device == solver_->param().device_id());
-#else
-//  CHECK(false);
-#endif
+string NCCL<Dtype>::new_uid() {
+  string uid;
+  uid.resize(NCCL_UNIQUE_ID_BYTES);
+  ncclUniqueId nccl_uid;
+  NCCL_CHECK(ncclGetUniqueId(&nccl_uid));
+  memcpy(&uid[0], &nccl_uid, NCCL_UNIQUE_ID_BYTES);  // NOLINT(caffe/alt_fn)
+  return uid;
+}
 
-  // Wait for update from parent
-  if (parent_) {
-    P2PSync<Dtype> *parent = queue_.pop();
-    CHECK(parent == parent_);
+template<typename Dtype>
+void NCCL<Dtype>::Broadcast() {
+  if (barrier_) {  // NULL in multi process case
+    barrier_->wait();
   }
-
-  // Update children
-  for (int i = children_.size() - 1; i >= 0; i--) {
-    Dtype* src = data_;
-    Dtype* dst = children_[i]->data_;
-
-#ifdef DEBUG
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == children_[i]->solver_->param().device_id());
-#endif
-
-    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),
-        cudaMemcpyDeviceToDevice, cudaStreamDefault));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
-    children_[i]->queue_.push(this);
+  NCCL_CHECK(ncclBcast(data_, static_cast<int>(size_),
+                       nccl::dataType<Dtype>::type, 0,
+                       comm_, cudaStreamDefault));
+  if (barrier_) {
+    barrier_->wait();
   }
-#endif
 }
 
 template<typename Dtype>
-void P2PSync<Dtype>::on_gradients_ready() {
-#ifndef CPU_ONLY
+void NCCL<Dtype>::run(int layer) {
+  CHECK(solver_->param().layer_wise_reduce());
+  vector<shared_ptr<Blob<Dtype> > >& blobs =
+    solver_->net()->layers()[layer]->blobs();
 #ifdef DEBUG
-  int device;
-  CUDA_CHECK(cudaGetDevice(&device));
-  CHECK(device == solver_->param().device_id());
+  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
+  for (int i = 1; i < blobs.size(); ++i) {
+    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
+             blobs[i + 0]->gpu_diff());
+  }
 #endif
+  if (blobs.size() > 0) {
+    // Make sure default stream is done computing gradients. Could be
+    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
+    // blocking the default stream, but it's actually slower.
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
 
-  // Sum children gradients as they appear in the queue
-  for (int i = 0; i < children_.size(); ++i) {
-    P2PSync<Dtype> *child = queue_.pop();
-    Dtype* src = child->parent_grads_;
-    Dtype* dst = diff_;
-
-#ifdef DEBUG
-    bool ok = false;
-    for (int j = 0; j < children_.size(); ++j) {
-      if (child == children_[j]) {
-        ok = true;
-      }
+    // Reduce asynchronously
+    int size = 0;
+    for (int i = 0; i < blobs.size(); ++i) {
+      size += blobs[i]->count();
     }
-    CHECK(ok);
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == device);
-#endif
-
-    caffe_gpu_add(size_, src, dst, dst);
+    if (barrier_) {  // NULL in multi process case
+      barrier_->wait();
+    }
+    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
+                             blobs[0]->mutable_gpu_diff(),
+                             size,
+                             nccl::dataType<Dtype>::type,
+                             ncclSum, comm_, stream_));
+    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
+                   blobs[0]->mutable_gpu_diff(), stream_);
   }
+}
 
-  // Send gradients to parent
-  if (parent_) {
-    Dtype* src = diff_;
-    Dtype* dst = parent_grads_;
-
-#ifdef DEBUG
-    cudaPointerAttributes attributes;
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
-    CHECK(attributes.device == device);
-    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
-    CHECK(attributes.device == parent_->solver_->param().device_id());
-#endif
-
-    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
-        cudaMemcpyDeviceToDevice, cudaStreamDefault));
-    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
-    parent_->queue_.push(this);
+template<typename Dtype>
+void NCCL<Dtype>::on_gradients_ready() {
+  if (solver_->param().layer_wise_reduce()) {
+    CHECK_EQ(solver_->net()->params().size(),
+             solver_->net()->learnable_params().size())
+      << "Layer-wise reduce is not supported for nets with shared weights.";
+
+    // Make sure reduction is done before applying gradients
+    CUDA_CHECK(cudaStreamSynchronize(stream_));
   } else {
-    // Loss functions divide gradients by the batch size, so to compensate
-    // for split batch, the root solver divides by number of solvers.
-    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
+    if (barrier_) {  // NULL in multi process case
+      barrier_->wait();
+    }
+    NCCL_CHECK(ncclAllReduce(diff_, diff_, static_cast<int>(size_),
+                             nccl::dataType<Dtype>::type, ncclSum, comm_,
+                             cudaStreamDefault));
+    caffe_gpu_scal(static_cast<int>(size_),
+                   (Dtype) 1.0 / Caffe::solver_count(), diff_);
   }
-#endif
 }
 
 template<typename Dtype>
-void P2PSync<Dtype>::Prepare(const vector<int>& gpus,
-            vector<shared_ptr<P2PSync<Dtype> > >* syncs) {
-  // Pair devices for map-reduce synchronization
-  vector<DevicePair> pairs;
-  DevicePair::compute(gpus, &pairs);
-  ostringstream s;
-  for (int i = 1; i < pairs.size(); ++i) {
-    s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device();
+class Worker : public InternalThread {
+ public:
+  explicit Worker(shared_ptr<Solver<Dtype> > rank0, int device,
+                  boost::barrier* barrier, vector<NCCL<Dtype>*>* nccls,
+                  const char* restore)
+    : rank0_(rank0), device_(device), barrier_(barrier),
+      nccls_(nccls), restore_(restore) {
   }
-  LOG(INFO)<< "GPUs pairs " << s.str();
-
-  SolverParameter param(solver_->param());
-
-  // Build the GPU tree by finding the parent for each solver
-  for (int attempts = 0; attempts < pairs.size(); ++attempts) {
-    for (int i = 1; i < pairs.size(); ++i) {
-      if (!syncs->at(i).get()) {
-        P2PSync<Dtype>* parent = NULL;
-        for (int j = 0; j < syncs->size(); ++j) {
-          P2PSync<Dtype>* sync = j == 0 ? this : syncs->at(j).get();
-          if (sync) {
-            const SolverParameter& p = sync->solver()->param();
-            if (p.device_id() == pairs[i].parent()) {
-              parent = sync;
-            }
-          }
-        }
-        if (parent) {
-          param.set_device_id(pairs[i].device());
-          syncs->at(i).reset(new P2PSync<Dtype>(solver_, parent, param));
-          parent->children_.push_back((P2PSync<Dtype>*) syncs->at(i).get());
-        }
+  virtual ~Worker() {}
+
+ protected:
+  void InternalThreadEntry() {
+    // Create solver and install callbacks
+    SolverParameter param(rank0_->param());
+    param.set_device_id(device_);
+#ifdef DEBUG
+    int device;
+    CUDA_CHECK(cudaGetDevice(&device));
+    CHECK_EQ(device, device_);
+#endif
+    param.set_type(rank0_->type());
+    shared_ptr<Solver<Dtype> > s(SolverRegistry<Dtype>::CreateSolver(param));
+    CHECK_EQ(s->type(), rank0_->type());
+    if (restore_) {
+      // Could not make NCCL broadcast solver state, it seems to crash
+      // if called in a tight loop, regardless of barriers etc. so
+      // restore all solvers from file.
+      s->Restore(restore_);
+    }
+    NCCL<Dtype> nccl(s);
+    nccl.set_barrier(barrier_);
+    s->add_callback(&nccl);
+    if (s->param().layer_wise_reduce()) {
+      s->net()->add_after_backward(&nccl);
+    }
+    (*nccls_)[Caffe::solver_rank()] = &nccl;
+    // Wait for other threads
+    barrier_->wait();
+    // Wait for NCCL init
+    barrier_->wait();
+    // Broadcast rank 0 state
+    nccl.Broadcast();
+    // Solve
+    s->Step(param.max_iter() - s->iter());
+    barrier_->wait();
+#ifdef DEBUG
+    // Check all solvers have same state
+    SGDSolver<Dtype>* sa = static_cast<SGDSolver<Dtype>*>(rank0_.get());
+    SGDSolver<Dtype>* sb = static_cast<SGDSolver<Dtype>*>(s.get());
+    for (int h = 0; h < sa->history().size(); ++h) {
+      CUDA_CHECK(cudaSetDevice(sa->param().device_id()));
+      const Dtype* a = sa->history()[h]->cpu_data();
+      CUDA_CHECK(cudaSetDevice(sb->param().device_id()));
+      const Dtype* b = sb->history()[h]->cpu_data();
+      for (int v = 0; v < sa->history()[h]->count(); ++v) {
+        CHECK_DOUBLE_EQ(a[v], b[v]);
       }
     }
+#endif
   }
-}
-
-template<typename Dtype>
-void P2PSync<Dtype>::Run(const vector<int>& gpus) {
-  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
-  Prepare(gpus, &syncs);
 
-  LOG(INFO)<< "Starting Optimization";
+  shared_ptr<Solver<Dtype> > rank0_;
+  int device_;
+  boost::barrier* barrier_;
+  vector<NCCL<Dtype>*>* nccls_;
+  const char* restore_;
+};
 
-  for (int i = 1; i < syncs.size(); ++i) {
-    syncs[i]->StartInternalThread();
+template<typename Dtype>
+void NCCL<Dtype>::Run(const vector<int>& gpus, const char* restore) {
+  boost::barrier barrier(static_cast<int>(gpus.size()));
+  vector<NCCL<Dtype>*> nccls(gpus.size());
+  // Create workers
+  vector<shared_ptr<Worker<Dtype> > > workers(gpus.size());
+  for (int i = 1; i < gpus.size(); ++i) {
+    CUDA_CHECK(cudaSetDevice(gpus[i]));
+    Caffe::set_solver_rank(i);
+    Worker<Dtype>* w = new Worker<Dtype>(solver_, gpus[i], &barrier,
+                                         &nccls, restore);
+    w->StartInternalThread();
+    workers[i].reset(w);
   }
-
-  // Run root solver on current thread
+  CUDA_CHECK(cudaSetDevice(gpus[0]));
+  Caffe::set_solver_rank(0);
+  barrier_ = &barrier;
+  solver_->add_callback(this);
+  if (solver_->param().layer_wise_reduce()) {
+    solver_->net()->add_after_backward(this);
+  }
+  nccls[0] = this;
+  // Wait for workers
+  barrier.wait();
+  // Init NCCL
+  InitSingleProcess(&nccls);
+  barrier.wait();
+  // Run first solver on current thread
+  Broadcast();
   solver_->Solve();
-
-  for (int i = 1; i < syncs.size(); ++i) {
-    syncs[i]->StopInternalThread();
+  barrier.wait();  // Hangs without it when running tests
+  // Wait for shutdown
+  for (int i = 1; i < gpus.size(); ++i) {
+    workers[i]->StopInternalThread();
   }
 }
 
 INSTANTIATE_CLASS(Params);
 INSTANTIATE_CLASS(GPUParams);
-INSTANTIATE_CLASS(P2PSync);
+INSTANTIATE_CLASS(Worker);
+INSTANTIATE_CLASS(NCCL);
 
 }  // namespace caffe
+
+#endif  // USE_NCCL
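
The rewritten parallel.cpp replaces the P2P reduction tree with NCCL: gradients are summed across ranks with ncclAllReduce and then scaled by 1/solver_count, so the split batch behaves like one large batch. The arithmetic that the sum-then-scale achieves, written out on the host for two ranks (no NCCL involved; purely illustrative):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      const int solver_count = 2;
      std::vector<float> grad_rank0 = {0.2f, -0.4f};   // gradient on GPU 0
      std::vector<float> grad_rank1 = {0.6f,  0.0f};   // gradient on GPU 1
      std::vector<float> averaged(grad_rank0.size());
      for (std::size_t i = 0; i < averaged.size(); ++i) {
        float sum = grad_rank0[i] + grad_rank1[i];     // allreduce (ncclSum)
        averaged[i] = sum / solver_count;              // scale by 1/solver_count
      }
      std::printf("%.2f %.2f\n", averaged[0], averaged[1]);  // 0.40 -0.20
      return 0;
    }
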
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 430a0de..1c85f69 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -98,7 +98,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 41 (last added: type)
+// SolverParameter next available ID: 42 (last added: layer_wise_reduce)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -239,6 +239,9 @@ message SolverParameter {
   }
   // DEPRECATED: use type instead of solver_type
   optional SolverType solver_type = 30 [default = SGD];
+
+  // Overlap compute and communication for data parallel training
+  optional bool layer_wise_reduce = 41 [default = true];
 }
 
 // A message that stores the solver snapshots
@@ -655,8 +658,8 @@ message DataParameter {
   optional bool mirror = 6 [default = false];
   // Force the encoded image to have 3 color channels
   optional bool force_encoded_color = 9 [default = false];
-  // Prefetch queue (Number of batches to prefetch to host memory, increase if
-  // data access bandwidth varies).
+  // Prefetch queue (Increase if data feeding bandwidth varies, within the
+  // limit of device memory for GPU training)
   optional uint32 prefetch = 10 [default = 4];
 }
 
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index ece3913..1c1a9e5 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -26,16 +26,14 @@ SolverAction::Enum Solver<Dtype>::GetRequestedAction() {
 }
 
 template <typename Dtype>
-Solver<Dtype>::Solver(const SolverParameter& param, const Solver* root_solver)
-    : net_(), callbacks_(), root_solver_(root_solver),
-      requested_early_exit_(false) {
+Solver<Dtype>::Solver(const SolverParameter& param)
+    : net_(), callbacks_(), requested_early_exit_(false) {
   Init(param);
 }
 
 template <typename Dtype>
-Solver<Dtype>::Solver(const string& param_file, const Solver* root_solver)
-    : net_(), callbacks_(), root_solver_(root_solver),
-      requested_early_exit_(false) {
+Solver<Dtype>::Solver(const string& param_file)
+    : net_(), callbacks_(), requested_early_exit_(false) {
   SolverParameter param;
   ReadSolverParamsFromTextFileOrDie(param_file, &param);
   Init(param);
@@ -43,15 +41,13 @@ Solver<Dtype>::Solver(const string& param_file, const Solver* root_solver)
 
 template <typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
-  CHECK(Caffe::root_solver() || root_solver_)
-      << "root_solver_ needs to be set for all non-root solvers";
   LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: "
     << std::endl << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
   CheckSnapshotWritePermissions();
-  if (Caffe::root_solver() && param_.random_seed() >= 0) {
-    Caffe::set_random_seed(param_.random_seed());
+  if (param_.random_seed() >= 0) {
+    Caffe::set_random_seed(param_.random_seed() + Caffe::solver_rank());
   }
   // Scaffolding code
   InitTrainNet();
@@ -101,11 +97,7 @@ void Solver<Dtype>::InitTrainNet() {
   net_state.MergeFrom(net_param.state());
   net_state.MergeFrom(param_.train_state());
   net_param.mutable_state()->CopyFrom(net_state);
-  if (Caffe::root_solver()) {
-    net_.reset(new Net<Dtype>(net_param));
-  } else {
-    net_.reset(new Net<Dtype>(net_param, root_solver_->net_.get()));
-  }
+  net_.reset(new Net<Dtype>(net_param));
 }
 
 template <typename Dtype>
@@ -180,12 +172,7 @@ void Solver<Dtype>::InitTestNets() {
     net_params[i].mutable_state()->CopyFrom(net_state);
     LOG(INFO)
         << "Creating test net (#" << i << ") specified by " << sources[i];
-    if (Caffe::root_solver()) {
-      test_nets_[i].reset(new Net<Dtype>(net_params[i]));
-    } else {
-      test_nets_[i].reset(new Net<Dtype>(net_params[i],
-          root_solver_->test_nets_[i].get()));
-    }
+    test_nets_[i].reset(new Net<Dtype>(net_params[i]));
     test_nets_[i]->set_debug_info(param_.debug_info());
   }
 }
@@ -197,14 +184,16 @@ void Solver<Dtype>::Step(int iters) {
   int average_loss = this->param_.average_loss();
   losses_.clear();
   smoothed_loss_ = 0;
+  iteration_timer_.Start();
 
   while (iter_ < stop_iter) {
     // zero-init the params
     net_->ClearParamDiffs();
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
-        && (iter_ > 0 || param_.test_initialization())
-        && Caffe::root_solver()) {
-      TestAll();
+        && (iter_ > 0 || param_.test_initialization())) {
+      if (Caffe::root_solver()) {
+        TestAll();
+      }
       if (requested_early_exit_) {
         // Break out of the while loop because stop was requested while testing.
         break;
@@ -225,8 +214,13 @@ void Solver<Dtype>::Step(int iters) {
     // average the loss across iterations for smoothed reporting
     UpdateSmoothedLoss(loss, start_iter, average_loss);
     if (display) {
+      float lapse = iteration_timer_.Seconds();
+      float per_s = (iter_ - iterations_last_) / (lapse ? lapse : 1);
       LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
-          << ", loss = " << smoothed_loss_;
+          << " (" << per_s << " iter/s, " << lapse << "s/"
+          << param_.display() << " iters), loss = " << smoothed_loss_;
+      iteration_timer_.Start();
+      iterations_last_ = iter_;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
       int score_index = 0;
       for (int j = 0; j < result.size(); ++j) {
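
Two things change in the solver above: Init() now offsets the random seed by Caffe::solver_rank(), so each data-parallel worker draws different random numbers, and the display block in Step() reports throughput by dividing the iterations since the last display by the elapsed wall time, guarding against a zero lapse. A self-contained sketch of that timing bookkeeping with std::chrono (Caffe uses its own Timer class; the loop body here is a stand-in):

    #include <chrono>
    #include <iostream>

    int main() {
      using clock = std::chrono::steady_clock;
      const int display = 20;              // report every 'display' iterations
      int iterations_last = 0;
      auto window_start = clock::now();
      for (int iter = 1; iter <= 100; ++iter) {
        // ... forward / backward / ApplyUpdate would run here ...
        if (iter % display == 0) {
          float lapse =
              std::chrono::duration<float>(clock::now() - window_start).count();
          float per_s = (iter - iterations_last) / (lapse ? lapse : 1.f);
          std::cout << "Iteration " << iter << " (" << per_s << " iter/s, "
                    << lapse << "s/" << display << " iters)" << std::endl;
          window_start = clock::now();     // restart the window, as Step() does
          iterations_last = iter;
        }
      }
      return 0;
    }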
diff --git a/src/caffe/solvers/adagrad_solver.cpp b/src/caffe/solvers/adagrad_solver.cpp
index e78eadc..d8107e1 100644
--- a/src/caffe/solvers/adagrad_solver.cpp
+++ b/src/caffe/solvers/adagrad_solver.cpp
@@ -12,7 +12,6 @@ void adagrad_update_gpu(int N, Dtype* g, Dtype* h, Dtype delta,
 
 template <typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype delta = this->param_.delta();
diff --git a/src/caffe/solvers/nesterov_solver.cpp b/src/caffe/solvers/nesterov_solver.cpp
index 23ab2d4..7c1fac1 100644
--- a/src/caffe/solvers/nesterov_solver.cpp
+++ b/src/caffe/solvers/nesterov_solver.cpp
@@ -12,7 +12,6 @@ void nesterov_update_gpu(int N, Dtype* g, Dtype* h, Dtype momentum,
 
 template <typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  CHECK(Caffe::root_solver());
   const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype momentum = this->param_.momentum();
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index f30f316..ad6abe5 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -100,10 +100,10 @@ void SGDSolver<Dtype>::ClipGradients() {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
-  CHECK(Caffe::root_solver());
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
-    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
+    LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << this->iter_
+        << ", lr = " << rate;
   }
   ClipGradients();
   for (int param_id = 0; param_id < this->net_->learnable_params().size();
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 4d35641..88d9b78 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -3,26 +3,41 @@
 #include "caffe/util/math_functions.hpp"
 
 namespace caffe {
+SyncedMemory::SyncedMemory()
+  : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
+    own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) {
+#ifndef CPU_ONLY
+#ifdef DEBUG
+  CUDA_CHECK(cudaGetDevice(&device_));
+#endif
+#endif
+}
+
+SyncedMemory::SyncedMemory(size_t size)
+  : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
+    own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false) {
+#ifndef CPU_ONLY
+#ifdef DEBUG
+  CUDA_CHECK(cudaGetDevice(&device_));
+#endif
+#endif
+}
 
 SyncedMemory::~SyncedMemory() {
+  check_device();
   if (cpu_ptr_ && own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
   }
 
 #ifndef CPU_ONLY
   if (gpu_ptr_ && own_gpu_data_) {
-    int initial_device;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
     CUDA_CHECK(cudaFree(gpu_ptr_));
-    cudaSetDevice(initial_device);
   }
 #endif  // CPU_ONLY
 }
 
 inline void SyncedMemory::to_cpu() {
+  check_device();
   switch (head_) {
   case UNINITIALIZED:
     CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
@@ -49,10 +64,10 @@ inline void SyncedMemory::to_cpu() {
 }
 
 inline void SyncedMemory::to_gpu() {
+  check_device();
 #ifndef CPU_ONLY
   switch (head_) {
   case UNINITIALIZED:
-    CUDA_CHECK(cudaGetDevice(&gpu_device_));
     CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
@@ -60,7 +75,6 @@ inline void SyncedMemory::to_gpu() {
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaGetDevice(&gpu_device_));
       CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
       own_gpu_data_ = true;
     }
@@ -77,11 +91,13 @@ inline void SyncedMemory::to_gpu() {
 }
 
 const void* SyncedMemory::cpu_data() {
+  check_device();
   to_cpu();
   return (const void*)cpu_ptr_;
 }
 
 void SyncedMemory::set_cpu_data(void* data) {
+  check_device();
   CHECK(data);
   if (own_cpu_data_) {
     CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
@@ -92,6 +108,7 @@ void SyncedMemory::set_cpu_data(void* data) {
 }
 
 const void* SyncedMemory::gpu_data() {
+  check_device();
 #ifndef CPU_ONLY
   to_gpu();
   return (const void*)gpu_ptr_;
@@ -102,16 +119,11 @@ const void* SyncedMemory::gpu_data() {
 }
 
 void SyncedMemory::set_gpu_data(void* data) {
+  check_device();
 #ifndef CPU_ONLY
   CHECK(data);
   if (own_gpu_data_) {
-    int initial_device;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
     CUDA_CHECK(cudaFree(gpu_ptr_));
-    cudaSetDevice(initial_device);
   }
   gpu_ptr_ = data;
   head_ = HEAD_AT_GPU;
@@ -122,12 +134,14 @@ void SyncedMemory::set_gpu_data(void* data) {
 }
 
 void* SyncedMemory::mutable_cpu_data() {
+  check_device();
   to_cpu();
   head_ = HEAD_AT_CPU;
   return cpu_ptr_;
 }
 
 void* SyncedMemory::mutable_gpu_data() {
+  check_device();
 #ifndef CPU_ONLY
   to_gpu();
   head_ = HEAD_AT_GPU;
@@ -140,9 +154,9 @@ void* SyncedMemory::mutable_gpu_data() {
 
 #ifndef CPU_ONLY
 void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
+  check_device();
   CHECK(head_ == HEAD_AT_CPU);
   if (gpu_ptr_ == NULL) {
-    CUDA_CHECK(cudaGetDevice(&gpu_device_));
     CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
     own_gpu_data_ = true;
   }
@@ -153,5 +167,20 @@ void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
 }
 #endif
 
+void SyncedMemory::check_device() {
+#ifndef CPU_ONLY
+#ifdef DEBUG
+  int device;
+  cudaGetDevice(&device);
+  CHECK(device == device_);
+  if (gpu_ptr_ && own_gpu_data_) {
+    cudaPointerAttributes attributes;
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, gpu_ptr_));
+    CHECK(attributes.device == device_);
+  }
+#endif
+#endif
+}
+
 }  // namespace caffe
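
SyncedMemory now records the active CUDA device at construction (DEBUG builds only) and asserts on every access that the caller is still on that device and that the GPU pointer really lives there, replacing the old gpu_device_ save/restore dance around cudaFree. A stripped-down sketch of the same guard, using only CUDA runtime calls; the class below is illustrative, not Caffe's:

    #include <cassert>
    #include <cstddef>
    #include <cuda_runtime.h>

    // Remembers the device that was current at construction and checks that
    // later accesses happen on the same device, like check_device() above.
    class DeviceGuardedBuffer {
     public:
      explicit DeviceGuardedBuffer(size_t size) : ptr_(NULL), size_(size) {
        cudaGetDevice(&device_);
      }
      ~DeviceGuardedBuffer() { if (ptr_) cudaFree(ptr_); }
      void* data() {
        check_device();
        if (!ptr_) cudaMalloc(&ptr_, size_);
        return ptr_;
      }
     private:
      void check_device() {
        int device;
        cudaGetDevice(&device);
        assert(device == device_);         // caller switched devices: a bug
        if (ptr_) {
          cudaPointerAttributes attributes;
          cudaPointerGetAttributes(&attributes, ptr_);
          assert(attributes.device == device_);
        }
      }
      void* ptr_;
      size_t size_;
      int device_;
    };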
 
diff --git a/src/caffe/test/test_data_layer.cpp b/src/caffe/test/test_data_layer.cpp
index 3e8d113..3835af1 100644
--- a/src/caffe/test/test_data_layer.cpp
+++ b/src/caffe/test/test_data_layer.cpp
@@ -105,6 +105,32 @@ class DataLayerTest : public MultiDeviceTest<TypeParam> {
     }
   }
 
+  void TestSkip() {
+    LayerParameter param;
+    param.set_phase(TRAIN);
+    DataParameter* data_param = param.mutable_data_param();
+    int batch_size = 5;
+    data_param->set_batch_size(batch_size);
+    data_param->set_source(filename_->c_str());
+    data_param->set_backend(backend_);
+    Caffe::set_solver_count(8);
+    for (int dev = 0; dev < Caffe::solver_count(); ++dev) {
+      Caffe::set_solver_rank(dev);
+      DataLayer<Dtype> layer(param);
+      layer.SetUp(blob_bottom_vec_, blob_top_vec_);
+      int label = dev;
+      for (int iter = 0; iter < 10; ++iter) {
+        layer.Forward(blob_bottom_vec_, blob_top_vec_);
+        for (int i = 0; i < batch_size; ++i) {
+          EXPECT_EQ(label % batch_size, blob_top_label_->cpu_data()[i]);
+          label += Caffe::solver_count();
+        }
+      }
+    }
+    Caffe::set_solver_count(1);
+    Caffe::set_solver_rank(0);
+  }
+
   void TestReshape(DataParameter_DB backend) {
     const int num_inputs = 5;
     // Save data of varying shapes.
@@ -356,6 +382,11 @@ TYPED_TEST(DataLayerTest, TestReadLevelDB) {
   this->TestRead();
 }
 
+TYPED_TEST(DataLayerTest, TestSkipLevelDB) {
+  this->Fill(false, DataParameter_DB_LEVELDB);
+  this->TestSkip();
+}
+
 TYPED_TEST(DataLayerTest, TestReshapeLevelDB) {
   this->TestReshape(DataParameter_DB_LEVELDB);
 }
@@ -396,6 +427,11 @@ TYPED_TEST(DataLayerTest, TestReadLMDB) {
   this->TestRead();
 }
 
+TYPED_TEST(DataLayerTest, TestSkipLMDB) {
+  this->Fill(false, DataParameter_DB_LMDB);
+  this->TestSkip();
+}
+
 TYPED_TEST(DataLayerTest, TestReshapeLMDB) {
   this->TestReshape(DataParameter_DB_LMDB);
 }
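
TestSkip pins down the sharding the data layer does for multi-GPU runs: with solver_count workers, the worker with rank r reads item r, r + solver_count, r + 2 * solver_count, ... so the ranks see disjoint examples, and the expected label sequence is modular arithmetic over the five-item fixture. The arithmetic on its own (no Caffe types; the numbers match the test above):

    #include <iostream>

    int main() {
      const int solver_count = 8;   // Caffe::set_solver_count(8) in the test
      const int batch_size = 5;
      const int num_items = 5;      // the LevelDB/LMDB fixture holds labels 0..4
      for (int rank = 0; rank < solver_count; ++rank) {
        int label = rank;           // each rank starts 'rank' items into the DB
        std::cout << "rank " << rank << ":";
        for (int i = 0; i < batch_size; ++i) {
          std::cout << " " << label % num_items;   // wraps around the fixture
          label += solver_count;    // skip items consumed by the other ranks
        }
        std::cout << std::endl;
      }
      return 0;
    }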
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 975a8f0..6ad0d8f 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -36,7 +36,9 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
 
   string snapshot_prefix_;
   shared_ptr<SGDSolver<Dtype> > solver_;
-  shared_ptr<P2PSync<Dtype> > sync_;
+#ifdef USE_NCCL
+  shared_ptr<NCCL<Dtype> > nccl_;
+#endif
   int seed_;
   // Dimensions are determined by generate_sample_data.py
   // TODO this is brittle and the hdf5 file should be checked instead.
@@ -85,6 +87,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
        "lr_policy: 'fixed' "
        "iter_size: " << iter_size << " "
        "device_id: " << device_id << " "
+       "layer_wise_reduce: " << (!share_) << " "
        "net_param { "
        "  name: 'TestNetwork' "
        "  layer { "
@@ -183,7 +186,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     }
     Caffe::set_random_seed(this->seed_);
     this->InitSolverFromProtoString(proto.str());
-    if (from_snapshot != NULL) {
+    if (from_snapshot) {
       this->solver_->Restore(from_snapshot);
       for (int i = 0; i < this->solver_->iter(); ++i) {
         this->solver_->net()->Forward();
@@ -202,9 +205,10 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
           gpus.push_back(i);
       }
       Caffe::set_solver_count(gpus.size());
-      this->sync_.reset(new P2PSync<Dtype>(
-          this->solver_, NULL, this->solver_->param()));
-      this->sync_->Run(gpus);
+#ifdef USE_NCCL
+      this->nccl_.reset(new NCCL<Dtype>(this->solver_));
+      this->nccl_->Run(gpus, from_snapshot);
+#endif
       Caffe::set_solver_count(1);
     }
     if (snapshot) {
@@ -457,12 +461,28 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     const int kIterSize = 1;
     // Test over all numbers of devices.
     int available_devices = 1;
-#ifndef CPU_ONLY
+#ifdef USE_NCCL
     if (Caffe::mode() == Caffe::GPU) {
       CUDA_CHECK(cudaGetDeviceCount(&available_devices));
     }
 #endif
-    for (int devices = 1; devices <= available_devices; ++devices) {
+    // Testing every device count in every test takes too long, so only try a
+    // sparse set of sizes
+    vector<int> sizes;
+    sizes.push_back(1);
+    if (available_devices >= 2) {
+      sizes.push_back(2);
+    }
+    if (available_devices >= 3) {
+      sizes.push_back(3);
+    }
+    if (available_devices >= 8) {
+      sizes.push_back(8);
+    }
+    if (available_devices >= 16) {
+      sizes.push_back(16);
+    }
+    for (int i = 0; i < sizes.size(); ++i) {
+      int devices = sizes[i];
       // Configure batch size for single / multi device equivalence.
       // Constant data is needed for multi device as for accumulation.
       num_ = kNum * devices;
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index 8884ce9..68e1028 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -133,4 +133,34 @@ TYPED_TEST(HDF5DataLayerTest, TestRead) {
   }
 }
 
+TYPED_TEST(HDF5DataLayerTest, TestSkip) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter param;
+  param.add_top("data");
+  param.add_top("label");
+
+  HDF5DataParameter* hdf5_data_param = param.mutable_hdf5_data_param();
+  int batch_size = 5;
+  hdf5_data_param->set_batch_size(batch_size);
+  hdf5_data_param->set_source(*(this->filename));
+
+  Caffe::set_solver_count(8);
+  for (int dev = 0; dev < Caffe::solver_count(); ++dev) {
+    Caffe::set_solver_rank(dev);
+
+    HDF5DataLayer<Dtype> layer(param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    int label = dev;
+    for (int iter = 0; iter < 1; ++iter) {
+      layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+      for (int i = 0; i < batch_size; ++i) {
+        EXPECT_EQ(1 + label, this->blob_top_label_->cpu_data()[i]);
+        label = (label + Caffe::solver_count()) % (batch_size * 2);
+      }
+    }
+  }
+  Caffe::set_solver_count(1);
+  Caffe::set_solver_rank(0);
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp
index f1ec233..6d84d29 100644
--- a/src/caffe/test/test_inner_product_layer.cpp
+++ b/src/caffe/test/test_inner_product_layer.cpp
@@ -60,9 +60,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) {
   EXPECT_EQ(this->blob_top_->channels(), 10);
 }
 
-/** @brief TestSetUp while toggling tranpose flag
+/** @brief TestSetUp while toggling transpose flag
  */
-TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) {
+TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeFalse) {
   typedef typename TypeParam::Dtype Dtype;
   this->blob_bottom_vec_.push_back(this->blob_bottom_);
   LayerParameter layer_param;
@@ -82,9 +82,9 @@ TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) {
   EXPECT_EQ(60, layer->blobs()[0]->shape(1));
 }
 
-/** @brief TestSetUp while toggling tranpose flag
+/** @brief TestSetUp while toggling transpose flag
  */
-TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) {
+TYPED_TEST(InnerProductLayerTest, TestSetUpTransposeTrue) {
   typedef typename TypeParam::Dtype Dtype;
   this->blob_bottom_vec_.push_back(this->blob_bottom_);
   LayerParameter layer_param;
@@ -339,7 +339,7 @@ TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) {
     // copy bottom diffs
     Blob<Dtype>* const bottom_diff = new Blob<Dtype>();
     bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true);
-    // repeat original top with tranposed ip
+    // repeat original top with transposed ip
     this->blob_top_vec_.clear();
     this->blob_top_vec_.push_back(new Blob<Dtype>());
     inner_product_param->set_transpose(true);
diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp
index 058668f..f69d210 100644
--- a/src/caffe/util/blocking_queue.cpp
+++ b/src/caffe/util/blocking_queue.cpp
@@ -1,7 +1,6 @@
 #include <boost/thread.hpp>
 #include <string>
 
-#include "caffe/data_reader.hpp"
 #include "caffe/layers/base_data_layer.hpp"
 #include "caffe/parallel.hpp"
 #include "caffe/util/blocking_queue.hpp"
@@ -88,9 +87,5 @@ size_t BlockingQueue<T>::size() const {
 
 template class BlockingQueue<Batch<float>*>;
 template class BlockingQueue<Batch<double>*>;
-template class BlockingQueue<Datum*>;
-template class BlockingQueue<shared_ptr<DataReader::QueuePair> >;
-template class BlockingQueue<P2PSync<float>*>;
-template class BlockingQueue<P2PSync<double>*>;
 
 }  // namespace caffe
diff --git a/src/caffe/util/db_lmdb.cpp b/src/caffe/util/db_lmdb.cpp
index fb1d495..491a9bd 100644
--- a/src/caffe/util/db_lmdb.cpp
+++ b/src/caffe/util/db_lmdb.cpp
@@ -32,7 +32,7 @@ void LMDB::Open(const string& source, Mode mode) {
     MDB_CHECK(rc);
   }
 #endif
-  LOG(INFO) << "Opened lmdb " << source;
+  LOG_IF(INFO, Caffe::root_solver()) << "Opened lmdb " << source;
 }
 
 LMDBCursor* LMDB::NewCursor() {
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index 7730e76..d255877 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -29,10 +29,10 @@ void hdf5_load_nd_dataset_helper(
   CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
   switch (class_) {
   case H5T_FLOAT:
-    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT";
+    { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_FLOAT"; }
     break;
   case H5T_INTEGER:
-    LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER";
+    { LOG_FIRST_N(INFO, 1) << "Datatype class: H5T_INTEGER"; }
     break;
   case H5T_TIME:
     LOG(FATAL) << "Unsupported datatype class: H5T_TIME";
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 4c58753..6d00102 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -91,6 +91,26 @@ void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
 }
 
 template <>
+void caffe_gpu_scal<float>(const int N, const float alpha, float* X,
+                           cudaStream_t str) {
+  cudaStream_t initial_stream;
+  CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream));
+  CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str));
+  CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+  CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream));
+}
+
+template <>
+void caffe_gpu_scal<double>(const int N, const double alpha, double* X,
+                            cudaStream_t str) {
+  cudaStream_t initial_stream;
+  CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream));
+  CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str));
+  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
+  CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream));
+}
+
+template <>
 void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
     const float beta, float* Y) {
   caffe_gpu_scal<float>(N, beta, Y);
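
The new caffe_gpu_scal overloads make the cuBLAS scal run on a caller-chosen stream by saving the handle's current stream, switching, and restoring it afterwards, so existing callers on the default stream are unaffected. The save/set/restore pattern in isolation, as a sketch against plain cuBLAS (no Caffe handle wrapper):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // Scale n floats on 'stream' without disturbing whatever stream the
    // handle was already bound to -- the same pattern as the hunk above.
    void scal_on_stream(cublasHandle_t handle, int n, float alpha,
                        float* x, cudaStream_t stream) {
      cudaStream_t initial_stream;
      cublasGetStream(handle, &initial_stream);
      cublasSetStream(handle, stream);
      cublasSscal(handle, n, &alpha, x, 1);
      cublasSetStream(handle, initial_stream);
    }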
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index a0aacbe..94771c8 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -1018,7 +1018,13 @@ void UpgradeNetBatchNorm(NetParameter* net_param) {
     // the previous BatchNorm layer definition.
     if (net_param->layer(i).type() == "BatchNorm"
         && net_param->layer(i).param_size() == 3) {
-      net_param->mutable_layer(i)->clear_param();
+      // Set lr_mult and decay_mult to zero; leave all other param fields intact.
+      for (int ip = 0; ip < net_param->layer(i).param_size(); ip++) {
+        ParamSpec* fixed_param_spec =
+          net_param->mutable_layer(i)->mutable_param(ip);
+        fixed_param_spec->set_lr_mult(0.f);
+        fixed_param_spec->set_decay_mult(0.f);
+      }
     }
   }
 }
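
Instead of clearing the three param specs on legacy BatchNorm layers (which lost any name or sharing settings they carried), the upgrade path now freezes them by zeroing lr_mult and decay_mult. The same freeze as a small helper, a sketch using only the generated proto API seen in the hunk above:

    #include "caffe/proto/caffe.pb.h"

    // Freeze a layer's learnable blobs in place rather than dropping its
    // ParamSpec entries, mirroring what UpgradeNetBatchNorm now does.
    void FreezeLayerParams(caffe::LayerParameter* layer) {
      for (int ip = 0; ip < layer->param_size(); ++ip) {
        caffe::ParamSpec* spec = layer->mutable_param(ip);
        spec->set_lr_mult(0.f);      // no gradient-based update by the solver
        spec->set_decay_mult(0.f);   // and no weight decay
      }
    }

In prototxt terms this amounts to writing param { lr_mult: 0 decay_mult: 0 } once per blob on the BatchNorm layer; its mean, variance and moving-average-factor blobs are maintained by the layer itself, not by the solver.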
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 9bf4214..3587d8a 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -195,6 +195,7 @@ int train() {
   // If the gpus flag is not provided, allow the mode and device to be set
   // in the solver prototxt.
   if (FLAGS_gpu.size() == 0
+      && solver_param.has_solver_mode()
       && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
       if (solver_param.has_device_id()) {
           FLAGS_gpu = "" +
@@ -244,11 +245,15 @@ int train() {
     CopyLayers(solver.get(), FLAGS_weights);
   }
 
+  LOG(INFO) << "Starting Optimization";
   if (gpus.size() > 1) {
-    caffe::P2PSync<float> sync(solver, NULL, solver->param());
-    sync.Run(gpus);
+#ifdef USE_NCCL
+    caffe::NCCL<float> nccl(solver);
+    nccl.Run(gpus, FLAGS_snapshot.size() > 0 ? FLAGS_snapshot.c_str() : NULL);
+#else
+    LOG(FATAL) << "Multi-GPU execution not available - rebuild with USE_NCCL";
+#endif
   } else {
-    LOG(INFO) << "Starting Optimization";
     solver->Solve();
   }
   LOG(INFO) << "Optimization Done.";
diff --git a/tools/extra/extract_seconds.py b/tools/extra/extract_seconds.py
index 591a51f..68af69a 100755
--- a/tools/extra/extract_seconds.py
+++ b/tools/extra/extract_seconds.py
@@ -48,11 +48,19 @@ def extract_seconds(input_file, output_file):
     start_datetime = get_start_time(lines, log_created_year)
     assert start_datetime, 'Start time not found'
 
+    last_dt = start_datetime
     out = open(output_file, 'w')
     for line in lines:
         line = line.strip()
         if line.find('Iteration') != -1:
             dt = extract_datetime_from_line(line, log_created_year)
+
+            # if it's another year
+            if dt.month < last_dt.month:
+                log_created_year += 1
+                dt = extract_datetime_from_line(line, log_created_year)
+            last_dt = dt
+
             elapsed_seconds = (dt - start_datetime).total_seconds()
             out.write('%f\n' % elapsed_seconds)
     out.close()
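
Caffe's glog lines carry a month and day but no year, so both log-parsing scripts now bump the assumed year whenever the month decreases between consecutive 'Iteration' lines; otherwise a run that crosses New Year would yield a large negative elapsed time. The heuristic on its own, as an illustrative C++ sketch (the real scripts are Python and operate on datetime objects):

    #include <iostream>

    struct LogTime { int year, month, day; };

    // If the month went backwards relative to the previous entry, the log has
    // crossed into a new year, so advance the inferred year -- the same rule
    // extract_seconds.py and parse_log.py apply.
    LogTime infer_year(const LogTime& previous, int month, int day) {
      int year = previous.year + (month < previous.month ? 1 : 0);
      return LogTime{year, month, day};
    }

    int main() {
      LogTime last = {2016, 12, 31};
      LogTime next = infer_year(last, 1, 1);   // Jan 1 following Dec 31
      std::cout << next.year << "-" << next.month << "-" << next.day << std::endl;
      return 0;
    }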
diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py
index 017306b..b47ffd0 100755
--- a/tools/extra/parse_log.py
+++ b/tools/extra/parse_log.py
@@ -38,6 +38,7 @@ def parse_log(path_to_log):
     logfile_year = extract_seconds.get_log_created_year(path_to_log)
     with open(path_to_log) as f:
         start_time = extract_seconds.get_start_time(f, logfile_year)
+        last_time = start_time
 
         for line in f:
             iteration_match = regex_iteration.search(line)
@@ -55,6 +56,12 @@ def parse_log(path_to_log):
                 # Skip lines with bad formatting, for example when resuming solver
                 continue
 
+            # if the month went backwards, the log rolled over into a new year
+            if time.month < last_time.month:
+                logfile_year += 1
+                time = extract_seconds.extract_datetime_from_line(line, logfile_year)
+            last_time = time
+
             seconds = (time - start_time).total_seconds()
 
             learning_rate_match = regex_learning_rate.search(line)
