[caffe-contrib] 189/362: Imported Upstream version 0.9999~rc2+git20150902+e8e660d3

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue May 3 09:24:30 UTC 2016


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe-contrib.

commit a488509266df161283568bc78c878a05622e66a6
Author: Zhou Mo <cdluminate at gmail.com>
Date:   Wed Sep 2 13:30:44 2015 +0000

    Imported Upstream version 0.9999~rc2+git20150902+e8e660d3
---
 .gitignore                                         |   2 +
 CONTRIBUTING.md                                    |  30 +
 Makefile                                           |   6 +-
 cmake/Misc.cmake                                   |   4 +-
 cmake/Modules/FindOpenBLAS.cmake                   |   2 +
 docs/tutorial/interfaces.md                        |   7 +
 docs/tutorial/solver.md                            |  79 +-
 examples/CMakeLists.txt                            |   2 +-
 examples/cifar10/cifar10_full.prototxt             |  10 +-
 examples/cifar10/cifar10_full_solver.prototxt      |   1 +
 examples/cifar10/cifar10_full_solver_lr1.prototxt  |   1 +
 examples/cifar10/cifar10_full_solver_lr2.prototxt  |   1 +
 examples/cifar10/cifar10_quick.prototxt            |  10 +-
 examples/cifar10/cifar10_quick_solver.prototxt     |   1 +
 examples/cifar10/cifar10_quick_solver_lr1.prototxt |   1 +
 examples/cifar10/train_full.sh                     |   4 +-
 examples/cifar10/train_quick.sh                    |   2 +-
 examples/cpp_classification/classification.cpp     |   2 +
 examples/imagenet/resume_training.sh               |   2 +-
 examples/mnist/convert_mnist_data.cpp              |   2 +-
 examples/mnist/lenet.prototxt                      |  10 +-
 .../lenet_adadelta_solver.prototxt}                |  17 +-
 .../lenet_solver_adam.prototxt}                    |  19 +-
 .../lenet_solver_rmsprop.prototxt}                 |  20 +-
 .../mnist_autoencoder_solver_adadelta.prototxt     |  19 +
 examples/mnist/train_lenet_adam.sh                 |   3 +
 examples/mnist/train_lenet_rmsprop.sh              |   3 +
 examples/mnist/train_mnist_autoencoder_adadelta.sh |   4 +
 .../net_surgery/bvlc_caffenet_full_conv.prototxt   |  10 +-
 examples/net_surgery/conv.prototxt                 |  10 +-
 examples/pycaffe/caffenet.py                       |   2 +-
 examples/siamese/convert_mnist_siamese_data.cpp    |   2 +-
 examples/siamese/mnist_siamese.prototxt            |  10 +-
 include/caffe/blob.hpp                             |   4 +-
 include/caffe/caffe.hpp                            |   1 +
 include/caffe/common.hpp                           |  20 +-
 include/caffe/common_layers.hpp                    |  67 ++
 include/caffe/data_layers.hpp                      |  50 +-
 include/caffe/data_reader.hpp                      |  82 ++
 include/caffe/internal_thread.hpp                  |  19 +-
 include/caffe/layer.hpp                            |  54 +-
 include/caffe/layer_factory.hpp                    |  35 +-
 include/caffe/loss_layers.hpp                      |   9 +-
 include/caffe/net.hpp                              |  53 +-
 include/caffe/parallel.hpp                         | 118 +++
 include/caffe/python_layer.hpp                     |  35 +-
 include/caffe/solver.hpp                           | 173 +++-
 include/caffe/syncedmem.hpp                        |  42 +-
 include/caffe/test/test_gradient_check_util.hpp    |  11 +-
 include/caffe/util/blocking_queue.hpp              |  47 ++
 include/caffe/util/gpu_util.cuh                    |  35 +
 include/caffe/util/hdf5.hpp                        |  39 +
 include/caffe/util/io.hpp                          |  18 -
 include/caffe/util/signal_handler.h                |  24 +
 include/caffe/vision_layers.hpp                    |  10 +-
 matlab/CMakeLists.txt                              |   4 +-
 models/bvlc_alexnet/deploy.prototxt                |  10 +-
 models/bvlc_googlenet/deploy.prototxt              |  10 +-
 models/bvlc_reference_caffenet/deploy.prototxt     |  10 +-
 .../bvlc_reference_rcnn_ilsvrc13/deploy.prototxt   |  10 +-
 models/finetune_flickr_style/deploy.prototxt       |  10 +-
 python/CMakeLists.txt                              |   2 +-
 python/caffe/__init__.py                           |   2 +-
 python/caffe/_caffe.cpp                            |  29 +-
 python/caffe/draw.py                               |   6 +-
 python/caffe/net_spec.py                           |  38 +-
 python/caffe/pycaffe.py                            |  10 +
 python/caffe/test/test_layer_type_list.py          |  10 +
 python/caffe/test/test_net_spec.py                 |  15 +
 python/caffe/test/test_python_layer.py             |  76 ++
 .../caffe/test/test_python_layer_with_param_str.py |  59 ++
 scripts/download_model_binary.py                   |   2 +-
 scripts/download_model_from_gist.sh                |   6 +-
 src/caffe/blob.cpp                                 |  49 +-
 src/caffe/common.cpp                               |  19 +-
 src/caffe/data_reader.cpp                          | 119 +++
 src/caffe/data_transformer.cpp                     |   4 +-
 src/caffe/internal_thread.cpp                      |  58 +-
 src/caffe/layer.cpp                                |  27 +
 src/caffe/layers/accuracy_layer.cpp                |  20 +
 src/caffe/layers/base_data_layer.cpp               |  94 ++-
 src/caffe/layers/base_data_layer.cu                |  17 +-
 src/caffe/layers/concat_layer.cpp                  |  13 +-
 src/caffe/layers/concat_layer.cu                   |  17 +-
 src/caffe/layers/data_layer.cpp                    |  81 +-
 src/caffe/layers/deconv_layer.cu                   |   3 +-
 src/caffe/layers/embed_layer.cpp                   | 122 +++
 src/caffe/layers/embed_layer.cu                    |  84 ++
 src/caffe/layers/hdf5_data_layer.cpp               |   2 +-
 src/caffe/layers/hdf5_output_layer.cpp             |   2 +-
 src/caffe/layers/hdf5_output_layer.cu              |   1 -
 src/caffe/layers/image_data_layer.cpp              |  31 +-
 src/caffe/layers/inner_product_layer.cu            |  19 +-
 src/caffe/layers/mvn_layer.cpp                     |  15 +-
 src/caffe/layers/mvn_layer.cu                      |   7 +-
 src/caffe/layers/spp_layer.cpp                     |  36 +
 src/caffe/layers/tile_layer.cpp                    |  62 ++
 src/caffe/layers/tile_layer.cu                     |  67 ++
 src/caffe/layers/window_data_layer.cpp             |  20 +-
 src/caffe/net.cpp                                  | 468 ++++++++---
 src/caffe/parallel.cpp                             | 441 +++++++++++
 src/caffe/proto/caffe.proto                        |  86 +-
 src/caffe/solver.cpp                               | 669 ++++++++++++++--
 src/caffe/syncedmem.cpp                            |  46 +-
 src/caffe/test/test_accuracy_layer.cpp             | 107 +++
 src/caffe/test/test_concat_layer.cpp               |   9 +
 src/caffe/test/test_data/generate_sample_data.py   |  28 +-
 src/caffe/test/test_data/solver_data.h5            | Bin 0 -> 11776 bytes
 src/caffe/test/test_data/solver_data_list.txt      |   1 +
 src/caffe/test/test_embed_layer.cpp                | 183 +++++
 src/caffe/test/test_gradient_based_solver.cpp      | 876 +++++++++++++++++++--
 src/caffe/test/test_hdf5_output_layer.cpp          |   1 +
 src/caffe/test/test_inner_product_layer.cpp        |  43 +-
 src/caffe/test/test_internal_thread.cpp            |  34 +-
 src/caffe/test/test_layer_factory.cpp              |  14 +-
 src/caffe/test/test_mvn_layer.cpp                  |  13 +-
 src/caffe/test/test_net.cpp                        |  29 +-
 src/caffe/test/test_tile_layer.cpp                 | 162 ++++
 src/caffe/test/test_upgrade_proto.cpp              |  12 +
 src/caffe/util/blocking_queue.cpp                  |  96 +++
 src/caffe/util/hdf5.cpp                            | 160 ++++
 src/caffe/util/insert_splits.cpp                   |   3 +-
 src/caffe/util/io.cpp                              |  74 --
 src/caffe/util/signal_handler.cpp                  | 115 +++
 src/caffe/util/upgrade_proto.cpp                   |  20 +-
 tools/caffe.cpp                                    | 153 +++-
 tools/convert_imageset.cpp                         |   6 +-
 tools/extra/parse_log.py                           |   2 +-
 128 files changed, 5568 insertions(+), 825 deletions(-)

diff --git a/.gitignore b/.gitignore
index 28f2aca..53c1fb0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,9 @@ Makefile.config
 data/*
 models/*
 *.caffemodel
+*.caffemodel.h5
 *.solverstate
+*.solverstate.h5
 *.binaryproto
 *leveldb
 *lmdb
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..8cd5e56
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,30 @@
+# Contributing
+
+## Issues
+
+Specific Caffe design and development issues, bugs, and feature requests are maintained by GitHub Issues.
+
+_Please do not post usage, installation, or modeling questions, or other requests for help to Issues._
+Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead. This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe.
+
+When reporting a bug, it's most helpful to provide the following information, where applicable:
+
+* What steps reproduce the bug?
+* Can you reproduce the bug using the latest [master](https://github.com/BVLC/caffe/tree/master), compiled with the `DEBUG` make option?
+* What hardware and operating system/distribution are you running?
+* If the bug is a crash, provide the backtrace (usually printed by Caffe; always obtainable with `gdb`).
+
+Try to give your issue a title that is succinct and specific. The devs will rename issues as needed to keep track of them.
+
+## Pull Requests
+
+Caffe welcomes all contributions.
+
+See the [contributing guide](http://caffe.berkeleyvision.org/development.html) for details.
+
+Briefly: read commit by commit, a PR should tell a clean, compelling story of _one_ improvement to Caffe. In particular:
+
+* A PR should do one clear thing that obviously improves Caffe, and nothing more. Making many smaller PRs is better than making one large PR; review effort is superlinear in the amount of code involved.
+* Similarly, each commit should be a small, atomic change representing one step in development. PRs should be made of many commits where appropriate.
+* Please do rewrite PR history to be clean rather than chronological. Within-PR bugfixes, style cleanups, reversions, etc. should be squashed and should not appear in merged PR history.
+* Anything nonobvious from the code should be explained in comments, commit messages, or the PR description, as appropriate.
diff --git a/Makefile b/Makefile
index 05b783a..80bc373 100644
--- a/Makefile
+++ b/Makefile
@@ -386,11 +386,13 @@ endif
 ##############################
 # Define build targets
 ##############################
-.PHONY: all test clean docs linecount lint lintclean tools examples $(DIST_ALIASES) \
+.PHONY: all lib test clean docs linecount lint lintclean tools examples $(DIST_ALIASES) \
 	py mat py$(PROJECT) mat$(PROJECT) proto runtest \
 	superclean supercleanlist supercleanfiles warn everything
 
-all: $(STATIC_NAME) $(DYNAMIC_NAME) tools examples
+all: lib tools examples
+
+lib: $(STATIC_NAME) $(DYNAMIC_NAME)
 
 everything: $(EVERYTHING_TARGETS)
 
diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake
index 7676754..9dd2609 100644
--- a/cmake/Misc.cmake
+++ b/cmake/Misc.cmake
@@ -46,7 +46,7 @@ endif()
 # ---[ Set debug postfix
 set(Caffe_DEBUG_POSTFIX "-d")
 
-set(CAffe_POSTFIX "")
+set(Caffe_POSTFIX "")
 if(CMAKE_BUILD_TYPE MATCHES "Debug")
-  set(CAffe_POSTFIX ${Caffe_DEBUG_POSTFIX})
+  set(Caffe_POSTFIX ${Caffe_DEBUG_POSTFIX})
 endif()
diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake
index b843492..a6512ae 100644
--- a/cmake/Modules/FindOpenBLAS.cmake
+++ b/cmake/Modules/FindOpenBLAS.cmake
@@ -2,8 +2,10 @@
 
 SET(Open_BLAS_INCLUDE_SEARCH_PATHS
   /usr/include
+  /usr/include/openblas
   /usr/include/openblas-base
   /usr/local/include
+  /usr/local/include/openblas
   /usr/local/include/openblas-base
   /opt/OpenBLAS/include
   $ENV{OpenBLAS_HOME}
diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md
index 4060294..9006179 100644
--- a/docs/tutorial/interfaces.md
+++ b/docs/tutorial/interfaces.md
@@ -50,6 +50,13 @@ For a full example of fine-tuning, see examples/finetuning_on_flickr_style, but
     # query the first device
     caffe device_query -gpu 0
 
+**Parallelism**: the `-gpu` flag to the `caffe` tool can take a comma separated list of IDs to run on multiple GPUs. A solver and net will be instantiated for each GPU so the batch size is effectively multiplied by the number of GPUs. To reproduce single GPU training, reduce the batch size in the network definition accordingly.
+
+    # train on GPUs 0 & 1 (doubling the batch size)
+    caffe train -solver examples/mnist/lenet_solver.prototxt -gpu 0,1
+    # train on all GPUs (multiplying batch size by number of devices)
+    caffe train -solver examples/mnist/lenet_solver.prototxt -gpu all
+
 ## Python
 
 The Python interface -- pycaffe -- is the `caffe` module and its scripts in caffe/python. `import caffe` to load models, do forward and backward, handle IO, visualize networks, and even instrument model solving. All model data, derivatives, and parameters are exposed for reading and writing.
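
As a concrete sketch of the pycaffe workflow just described, the following minimal Python snippet loads the deploy `lenet.prototxt` that appears later in this patch and runs a forward pass; the path and the `data` blob name come from that file, and everything else (trained weights, real inputs) would need to come from your own setup:

    import caffe

    caffe.set_mode_cpu()
    # Deploy definition only (no trained weights), so the outputs are untrained.
    net = caffe.Net('examples/mnist/lenet.prototxt', caffe.TEST)

    # Blobs are exposed for reading and writing as numpy arrays.
    net.blobs['data'].data[...] = 0.0   # fill the input blob (zeros here)
    out = net.forward()                 # run inference; returns the output blobs
    print(out.keys())
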
diff --git a/docs/tutorial/solver.md b/docs/tutorial/solver.md
index 17f793e..b150f64 100644
--- a/docs/tutorial/solver.md
+++ b/docs/tutorial/solver.md
@@ -6,7 +6,14 @@ title: Solver / Model Optimization
 The solver orchestrates model optimization by coordinating the network's forward inference and backward gradients to form parameter updates that attempt to improve the loss.
 The responsibilities of learning are divided between the Solver for overseeing the optimization and generating parameter updates and the Net for yielding loss and gradients.
 
-The Caffe solvers are Stochastic Gradient Descent (SGD), Adaptive Gradient (ADAGRAD), and Nesterov's Accelerated Gradient (NESTEROV).
+The Caffe solvers are:
+
+- Stochastic Gradient Descent (`SGD`),
+- AdaDelta (`ADADELTA`),
+- Adaptive Gradient (`ADAGRAD`),
+- Adam (`ADAM`),
+- Nesterov's Accelerated Gradient (`NESTEROV`), and
+- RMSprop (`RMSPROP`).
 
 The solver
 
@@ -104,6 +111,32 @@ If learning diverges (e.g., you start to see very large or `NaN` or `inf` loss v
     [ImageNet Classification with Deep Convolutional Neural Networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
     *Advances in Neural Information Processing Systems*, 2012.
 
+### AdaDelta
+
+The **AdaDelta** (`solver_type: ADADELTA`) method (M. Zeiler [1]) is a "robust learning rate method". It is a gradient-based optimization method (like SGD). The update formulas are
+
+$$
+\begin{align}
+(v_t)_i &= \frac{\operatorname{RMS}((v_{t-1})_i)}{\operatorname{RMS}\left( \nabla L(W_t) \right)_{i}} \left( \nabla L(W_{t'}) \right)_i
+\\
+\operatorname{RMS}\left( \nabla L(W_t) \right)_{i} &= \sqrt{E[g^2] + \varepsilon}
+\\
+E[g^2]_t &= \delta{E[g^2]_{t-1} } + (1-\delta)g_{t}^2
+\end{align}
+$$
+
+and 
+
+$$
+(W_{t+1})_i =
+(W_t)_i - \alpha
+(v_t)_i.
+$$
+
+[1] M. Zeiler
+    [ADADELTA: AN ADAPTIVE LEARNING RATE METHOD](http://arxiv.org/pdf/1212.5701.pdf).
+    *arXiv preprint*, 2012.
+
 ### AdaGrad
 
 The **adaptive gradient** (`solver_type: ADAGRAD`) method (Duchi et al. [1]) is a gradient-based optimization method (like SGD) that attempts to "find needles in haystacks in the form of very predictive but rarely seen features," in Duchi et al.'s words.
@@ -124,6 +157,28 @@ Note that in practice, for weights $$ W \in \mathcal{R}^d $$, AdaGrad implementa
     [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.magicbroom.info/Papers/DuchiHaSi10.pdf).
     *The Journal of Machine Learning Research*, 2011.
 
+### Adam
+
+The **Adam** solver (`solver_type: ADAM`), proposed in Kingma et al. [1], is a gradient-based optimization method (like SGD). It includes "adaptive moment estimation" ($$m_t, v_t$$) and can be regarded as a generalization of AdaGrad. The update formulas are
+
+$$
+(m_t)_i = \beta_1 (m_{t-1})_i + (1-\beta_1)(\nabla L(W_t))_i,\\
+(v_t)_i = \beta_2 (v_{t-1})_i + (1-\beta_2)(\nabla L(W_t))_i^2
+$$
+
+and
+
+$$
+(W_{t+1})_i =
+(W_t)_i - \alpha \frac{\sqrt{1-(\beta_2)_i^t}}{1-(\beta_1)_i^t}\frac{(m_t)_i}{\sqrt{(v_t)_i}+\varepsilon}.
+$$
+
+Kingma et al. [1] proposed to use $$\beta_1 = 0.9, \beta_2 = 0.999, \varepsilon = 10^{-8}$$ as default values. Caffe uses the values of `momentum, momentum2, delta` for $$\beta_1, \beta_2, \varepsilon$$, respectively.
+
+[1] D. Kingma, J. Ba.
+    [Adam: A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980).
+    *International Conference for Learning Representations*, 2015.
+
 ### NAG
 
 **Nesterov's accelerated gradient** (`solver_type: NESTEROV`) was proposed by Nesterov [1] as an "optimal" method of convex optimization, achieving a convergence rate of $$ \mathcal{O}(1/t^2) $$ rather than the $$ \mathcal{O}(1/t) $$.
@@ -149,6 +204,28 @@ What distinguishes the method from SGD is the weight setting $$ W $$ on which we
     [On the Importance of Initialization and Momentum in Deep Learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf).
     *Proceedings of the 30th International Conference on Machine Learning*, 2013.
 
+### RMSprop
+
+The **RMSprop** solver (`solver_type: RMSPROP`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). The update formulas are
+
+$$
+(v_t)_i = 
+\begin{cases}
+(v_{t-1})_i + \delta, &(\nabla L(W_t))_i(\nabla L(W_{t-1}))_i > 0\\
+(v_{t-1})_i \cdot (1-\delta), & \text{else}
+\end{cases}
+$$
+
+$$
+(W_{t+1})_i =(W_t)_i - \alpha (v_t)_i,
+$$
+
+If the gradient updates result in oscillations, the gradient is reduced by a factor of $$1-\delta$$. Otherwise it will be increased by $$\delta$$. The default value of $$\delta$$ (`rms_decay`) is set to $$\delta = 0.02$$.
+
+[1] T. Tieleman, and G. Hinton.
+    [RMSProp: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+    *COURSERA: Neural Networks for Machine Learning. Technical report*, 2012.
+
 ## Scaffolding
 
 The solver scaffolding prepares the optimization method and initializes the model to be learned in `Solver::Presolve()`.
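
To make the Adam update above concrete, here is a minimal NumPy sketch of a single step that follows the formulas and the default $$\beta_1, \beta_2, \varepsilon$$ values quoted from Kingma et al.; it only mirrors the equations and is not Caffe's implementation:

    import numpy as np

    def adam_step(W, grad, m, v, t, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        m = beta1 * m + (1 - beta1) * grad        # first moment estimate
        v = beta2 * v + (1 - beta2) * grad ** 2   # second moment estimate
        # bias-corrected step size, as in the update formula above
        lr_t = alpha * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        W = W - lr_t * m / (np.sqrt(v) + eps)
        return W, m, v

The AdaDelta and RMSprop rules described earlier can be sketched the same way by swapping in their respective accumulator updates.
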
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f29fc7e..663d736 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,7 +24,7 @@ foreach(source_file ${examples_srcs})
   if(UNIX OR APPLE)
     # Funny command to make tutorials work
     # TODO: remove in future as soon as naming is standartaized everywhere
-    set(__outname ${PROJECT_BINARY_DIR}/examples/${folder}/${name}${CAffe_POSTFIX})
+    set(__outname ${PROJECT_BINARY_DIR}/examples/${folder}/${name}${Caffe_POSTFIX})
     add_custom_command(TARGET ${name} POST_BUILD
                        COMMAND ln -sf "${__outname}" "${__outname}.bin")
   endif()
diff --git a/examples/cifar10/cifar10_full.prototxt b/examples/cifar10/cifar10_full.prototxt
index c16f7dc..446479d 100644
--- a/examples/cifar10/cifar10_full.prototxt
+++ b/examples/cifar10/cifar10_full.prototxt
@@ -2,10 +2,12 @@ name: "CIFAR10_full_deploy"
 # N.B. input image must be in CIFAR-10 format
 # as described at http://www.cs.toronto.edu/~kriz/cifar.html
 input: "data"
-input_dim: 1
-input_dim: 3
-input_dim: 32
-input_dim: 32
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 32
+  dim: 32
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/examples/cifar10/cifar10_full_solver.prototxt b/examples/cifar10/cifar10_full_solver.prototxt
index f30b398..882daa2 100644
--- a/examples/cifar10/cifar10_full_solver.prototxt
+++ b/examples/cifar10/cifar10_full_solver.prototxt
@@ -21,6 +21,7 @@ display: 200
 max_iter: 60000
 # snapshot intermediate results
 snapshot: 10000
+snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_full"
 # solver mode: CPU or GPU
 solver_mode: GPU
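
With `snapshot_format: HDF5`, snapshots are written as `*.caffemodel.h5` and `*.solverstate.h5` files rather than binary protos, which is why the resume scripts below now point at `.h5` solver states. A minimal h5py sketch for inspecting such a snapshot (the path is illustrative, and the exact group layout should be checked against your own file):

    import h5py

    # Point this at a snapshot produced by one of your own runs.
    with h5py.File('examples/cifar10/cifar10_full_iter_60000.caffemodel.h5', 'r') as f:
        def show(name, obj):
            # Print every stored dataset with its shape and dtype.
            if isinstance(obj, h5py.Dataset):
                print(name, obj.shape, obj.dtype)
        f.visititems(show)
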
diff --git a/examples/cifar10/cifar10_full_solver_lr1.prototxt b/examples/cifar10/cifar10_full_solver_lr1.prototxt
index 59bc572..55f4be4 100644
--- a/examples/cifar10/cifar10_full_solver_lr1.prototxt
+++ b/examples/cifar10/cifar10_full_solver_lr1.prototxt
@@ -21,6 +21,7 @@ display: 200
 max_iter: 65000
 # snapshot intermediate results
 snapshot: 5000
+snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_full"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/cifar10_full_solver_lr2.prototxt b/examples/cifar10/cifar10_full_solver_lr2.prototxt
index d4ed5d8..7c3d2da 100644
--- a/examples/cifar10/cifar10_full_solver_lr2.prototxt
+++ b/examples/cifar10/cifar10_full_solver_lr2.prototxt
@@ -21,6 +21,7 @@ display: 200
 max_iter: 70000
 # snapshot intermediate results
 snapshot: 5000
+snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_full"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/cifar10_quick.prototxt b/examples/cifar10/cifar10_quick.prototxt
index 1ad190e..9352fbf 100644
--- a/examples/cifar10/cifar10_quick.prototxt
+++ b/examples/cifar10/cifar10_quick.prototxt
@@ -1,9 +1,11 @@
 name: "CIFAR10_quick_test"
 input: "data"
-input_dim: 1
-input_dim: 3
-input_dim: 32
-input_dim: 32
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 32
+  dim: 32
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt
index 14b4401..5de276f 100644
--- a/examples/cifar10/cifar10_quick_solver.prototxt
+++ b/examples/cifar10/cifar10_quick_solver.prototxt
@@ -20,6 +20,7 @@ display: 100
 max_iter: 4000
 # snapshot intermediate results
 snapshot: 4000
+snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_quick"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/cifar10_quick_solver_lr1.prototxt b/examples/cifar10/cifar10_quick_solver_lr1.prototxt
index d3af70c..f8f1efd 100644
--- a/examples/cifar10/cifar10_quick_solver_lr1.prototxt
+++ b/examples/cifar10/cifar10_quick_solver_lr1.prototxt
@@ -20,6 +20,7 @@ display: 100
 max_iter: 5000
 # snapshot intermediate results
 snapshot: 5000
+snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_quick"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh
index 4285a5d..ef112e1 100755
--- a/examples/cifar10/train_full.sh
+++ b/examples/cifar10/train_full.sh
@@ -8,9 +8,9 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate
+    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5
 
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate
+    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5
diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh
index 2830c40..6b7d228 100755
--- a/examples/cifar10/train_quick.sh
+++ b/examples/cifar10/train_quick.sh
@@ -8,4 +8,4 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10 after 8 epochs
 $TOOLS/caffe train \
   --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \
-  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate
+  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5
diff --git a/examples/cpp_classification/classification.cpp b/examples/cpp_classification/classification.cpp
index 1c6371e..dc8b863 100644
--- a/examples/cpp_classification/classification.cpp
+++ b/examples/cpp_classification/classification.cpp
@@ -2,6 +2,7 @@
 #include <opencv2/core/core.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
+#include <algorithm>
 #include <iosfwd>
 #include <memory>
 #include <string>
@@ -101,6 +102,7 @@ static std::vector<int> Argmax(const std::vector<float>& v, int N) {
 std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) {
   std::vector<float> output = Predict(img);
 
+  N = std::min<int>(labels_.size(), N);
   std::vector<int> maxN = Argmax(output, N);
   std::vector<Prediction> predictions;
   for (int i = 0; i < N; ++i) {
diff --git a/examples/imagenet/resume_training.sh b/examples/imagenet/resume_training.sh
index d1febff..bf7945c 100755
--- a/examples/imagenet/resume_training.sh
+++ b/examples/imagenet/resume_training.sh
@@ -2,4 +2,4 @@
 
 ./build/tools/caffe train \
     --solver=models/bvlc_reference_caffenet/solver.prototxt \
-    --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate
+    --snapshot=models/bvlc_reference_caffenet/caffenet_train_10000.solverstate.h5
diff --git a/examples/mnist/convert_mnist_data.cpp b/examples/mnist/convert_mnist_data.cpp
index 2749e45..54443f1 100644
--- a/examples/mnist/convert_mnist_data.cpp
+++ b/examples/mnist/convert_mnist_data.cpp
@@ -166,7 +166,7 @@ void convert_dataset(const char* image_filename, const char* label_filename,
     }
     LOG(ERROR) << "Processed " << count << " files.";
   }
-  delete pixels;
+  delete[] pixels;
 }
 
 int main(int argc, char** argv) {
diff --git a/examples/mnist/lenet.prototxt b/examples/mnist/lenet.prototxt
index cb42610..dff7123 100644
--- a/examples/mnist/lenet.prototxt
+++ b/examples/mnist/lenet.prototxt
@@ -1,9 +1,11 @@
 name: "LeNet"
 input: "data"
-input_dim: 64
-input_dim: 1
-input_dim: 28
-input_dim: 28
+input_shape {
+  dim: 64
+  dim: 1
+  dim: 28
+  dim: 28
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/examples/cifar10/cifar10_quick_solver_lr1.prototxt b/examples/mnist/lenet_adadelta_solver.prototxt
similarity index 68%
copy from examples/cifar10/cifar10_quick_solver_lr1.prototxt
copy to examples/mnist/lenet_adadelta_solver.prototxt
index d3af70c..776d1e0 100644
--- a/examples/cifar10/cifar10_quick_solver_lr1.prototxt
+++ b/examples/mnist/lenet_adadelta_solver.prototxt
@@ -1,7 +1,5 @@
-# reduce the learning rate after 8 epochs (4000 iters) by a factor of 10
-
 # The train/test net protocol buffer definition
-net: "examples/cifar10/cifar10_quick_train_test.prototxt"
+net: "examples/mnist/lenet_train_test.prototxt"
 # test_iter specifies how many forward passes the test should carry out.
 # In the case of MNIST, we have test batch size 100 and 100 test iterations,
 # covering the full 10,000 testing images.
@@ -9,17 +7,18 @@ test_iter: 100
 # Carry out testing every 500 training iterations.
 test_interval: 500
 # The base learning rate, momentum and the weight decay of the network.
-base_lr: 0.0001
-momentum: 0.9
-weight_decay: 0.004
-# The learning rate policy
+base_lr: 1.0
 lr_policy: "fixed"
+momentum: 0.95
+weight_decay: 0.0005
 # Display every 100 iterations
 display: 100
 # The maximum number of iterations
-max_iter: 5000
+max_iter: 10000
 # snapshot intermediate results
 snapshot: 5000
-snapshot_prefix: "examples/cifar10/cifar10_quick"
+snapshot_prefix: "examples/mnist/lenet_adadelta"
 # solver mode: CPU or GPU
 solver_mode: GPU
+solver_type: ADADELTA
+delta: 1e-6
diff --git a/examples/cifar10/cifar10_quick_solver_lr1.prototxt b/examples/mnist/lenet_solver_adam.prototxt
similarity index 59%
copy from examples/cifar10/cifar10_quick_solver_lr1.prototxt
copy to examples/mnist/lenet_solver_adam.prototxt
index d3af70c..d22c571 100644
--- a/examples/cifar10/cifar10_quick_solver_lr1.prototxt
+++ b/examples/mnist/lenet_solver_adam.prototxt
@@ -1,25 +1,26 @@
-# reduce the learning rate after 8 epochs (4000 iters) by a factor of 10
-
 # The train/test net protocol buffer definition
-net: "examples/cifar10/cifar10_quick_train_test.prototxt"
+# this follows "ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION"
+net: "examples/mnist/lenet_train_test.prototxt"
 # test_iter specifies how many forward passes the test should carry out.
 # In the case of MNIST, we have test batch size 100 and 100 test iterations,
 # covering the full 10,000 testing images.
 test_iter: 100
 # Carry out testing every 500 training iterations.
 test_interval: 500
-# The base learning rate, momentum and the weight decay of the network.
-base_lr: 0.0001
+# All parameters are from the cited paper above
+base_lr: 0.001
 momentum: 0.9
-weight_decay: 0.004
-# The learning rate policy
+momentum2: 0.999
+# since Adam dynamically changes the learning rate, we set the base learning
+# rate to a fixed value
 lr_policy: "fixed"
 # Display every 100 iterations
 display: 100
 # The maximum number of iterations
-max_iter: 5000
+max_iter: 10000
 # snapshot intermediate results
 snapshot: 5000
-snapshot_prefix: "examples/cifar10/cifar10_quick"
+snapshot_prefix: "examples/mnist/lenet"
 # solver mode: CPU or GPU
+solver_type: ADAM
 solver_mode: GPU
diff --git a/examples/cifar10/cifar10_quick_solver_lr1.prototxt b/examples/mnist/lenet_solver_rmsprop.prototxt
similarity index 69%
copy from examples/cifar10/cifar10_quick_solver_lr1.prototxt
copy to examples/mnist/lenet_solver_rmsprop.prototxt
index d3af70c..74dadc5 100644
--- a/examples/cifar10/cifar10_quick_solver_lr1.prototxt
+++ b/examples/mnist/lenet_solver_rmsprop.prototxt
@@ -1,7 +1,5 @@
-# reduce the learning rate after 8 epochs (4000 iters) by a factor of 10
-
 # The train/test net protocol buffer definition
-net: "examples/cifar10/cifar10_quick_train_test.prototxt"
+net: "examples/mnist/lenet_train_test.prototxt"
 # test_iter specifies how many forward passes the test should carry out.
 # In the case of MNIST, we have test batch size 100 and 100 test iterations,
 # covering the full 10,000 testing images.
@@ -9,17 +7,21 @@ test_iter: 100
 # Carry out testing every 500 training iterations.
 test_interval: 500
 # The base learning rate, momentum and the weight decay of the network.
-base_lr: 0.0001
-momentum: 0.9
-weight_decay: 0.004
+base_lr: 0.01
+momentum: 0.0
+weight_decay: 0.0005
 # The learning rate policy
-lr_policy: "fixed"
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
 # Display every 100 iterations
 display: 100
 # The maximum number of iterations
-max_iter: 5000
+max_iter: 10000
 # snapshot intermediate results
 snapshot: 5000
-snapshot_prefix: "examples/cifar10/cifar10_quick"
+snapshot_prefix: "examples/mnist/lenet_rmsprop"
 # solver mode: CPU or GPU
 solver_mode: GPU
+solver_type: RMSPROP
+rms_decay: 0.98
diff --git a/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt b/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
new file mode 100644
index 0000000..065647d
--- /dev/null
+++ b/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
@@ -0,0 +1,19 @@
+net: "examples/mnist/mnist_autoencoder.prototxt"
+test_state: { stage: 'test-on-train' }
+test_iter: 500
+test_state: { stage: 'test-on-test' }
+test_iter: 100
+test_interval: 500
+test_compute_loss: true
+base_lr: 1.0
+lr_policy: "fixed"
+momentum: 0.95
+delta: 1e-8
+display: 100
+max_iter: 65000
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
+# solver mode: CPU or GPU
+solver_mode: GPU
+solver_type: ADADELTA
diff --git a/examples/mnist/train_lenet_adam.sh b/examples/mnist/train_lenet_adam.sh
new file mode 100755
index 0000000..a32ecf2
--- /dev/null
+++ b/examples/mnist/train_lenet_adam.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train --solver=examples/mnist/lenet_solver_adam.prototxt
diff --git a/examples/mnist/train_lenet_rmsprop.sh b/examples/mnist/train_lenet_rmsprop.sh
new file mode 100755
index 0000000..621cab2
--- /dev/null
+++ b/examples/mnist/train_lenet_rmsprop.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+
+./build/tools/caffe train --solver=examples/mnist/lenet_solver_rmsprop.prototxt
diff --git a/examples/mnist/train_mnist_autoencoder_adadelta.sh b/examples/mnist/train_mnist_autoencoder_adadelta.sh
new file mode 100755
index 0000000..4be0ebd
--- /dev/null
+++ b/examples/mnist/train_mnist_autoencoder_adadelta.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+./build/tools/caffe train \
+  --solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
diff --git a/examples/net_surgery/bvlc_caffenet_full_conv.prototxt b/examples/net_surgery/bvlc_caffenet_full_conv.prototxt
index 3c95197..0cadde9 100644
--- a/examples/net_surgery/bvlc_caffenet_full_conv.prototxt
+++ b/examples/net_surgery/bvlc_caffenet_full_conv.prototxt
@@ -1,10 +1,12 @@
 # Fully convolutional network version of CaffeNet.
 name: "CaffeNetConv"
 input: "data"
-input_dim: 1
-input_dim: 3
-input_dim: 451
-input_dim: 451
+input_shape {
+  dim: 1
+  dim: 3
+  dim: 451
+  dim: 451
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/examples/net_surgery/conv.prototxt b/examples/net_surgery/conv.prototxt
index 9444c63..6b3e5c7 100644
--- a/examples/net_surgery/conv.prototxt
+++ b/examples/net_surgery/conv.prototxt
@@ -1,10 +1,12 @@
 # Simple single-layer network to showcase editing model parameters.
 name: "convolution"
 input: "data"
-input_dim: 1
-input_dim: 1
-input_dim: 100
-input_dim: 100
+input_shape {
+  dim: 1
+  dim: 1
+  dim: 100
+  dim: 100
+}
 layer {
   name: "conv"
   type: "Convolution"
diff --git a/examples/pycaffe/caffenet.py b/examples/pycaffe/caffenet.py
index 06c5a02..82af229 100644
--- a/examples/pycaffe/caffenet.py
+++ b/examples/pycaffe/caffenet.py
@@ -1,6 +1,6 @@
+from __future__ import print_function
 from caffe import layers as L, params as P, to_proto
 from caffe.proto import caffe_pb2
-from __future__ import print_function
 
 # helper function for common structures
 
diff --git a/examples/siamese/convert_mnist_siamese_data.cpp b/examples/siamese/convert_mnist_siamese_data.cpp
index 71c56a0..8008b44 100644
--- a/examples/siamese/convert_mnist_siamese_data.cpp
+++ b/examples/siamese/convert_mnist_siamese_data.cpp
@@ -102,7 +102,7 @@ void convert_dataset(const char* image_filename, const char* label_filename,
   }
 
   delete db;
-  delete pixels;
+  delete [] pixels;
 }
 
 int main(int argc, char** argv) {
diff --git a/examples/siamese/mnist_siamese.prototxt b/examples/siamese/mnist_siamese.prototxt
index 0e903f8..332731b 100644
--- a/examples/siamese/mnist_siamese.prototxt
+++ b/examples/siamese/mnist_siamese.prototxt
@@ -1,9 +1,11 @@
 name: "mnist_siamese"
 input: "data"
-input_dim: 10000
-input_dim: 1
-input_dim: 28
-input_dim: 28
+input_shape {
+  dim: 10000
+  dim: 1
+  dim: 28
+  dim: 28
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index 472cc18..dda7b1f 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -10,7 +10,7 @@
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
 
-const int kMaxBlobAxes = INT_MAX;
+const int kMaxBlobAxes = 32;
 
 namespace caffe {
 
@@ -109,7 +109,7 @@ class Blob {
    * @brief Returns the 'canonical' version of a (usually) user-specified axis,
    *        allowing for negative indexing (e.g., -1 for the last axis).
    *
-   * @param index the axis index.
+   * @param axis_index the axis index.
    *        If 0 <= index < num_axes(), return index.
    *        If -num_axes <= index <= -1, return (num_axes() - (-index)),
    *        e.g., the last axis index (num_axes() - 1) if index == -1,
diff --git a/include/caffe/caffe.hpp b/include/caffe/caffe.hpp
index 3c829f2..68a5e1d 100644
--- a/include/caffe/caffe.hpp
+++ b/include/caffe/caffe.hpp
@@ -10,6 +10,7 @@
 #include "caffe/layer.hpp"
 #include "caffe/layer_factory.hpp"
 #include "caffe/net.hpp"
+#include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
 #include "caffe/util/benchmark.hpp"
diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
index 5f86bc2..1df6b9a 100644
--- a/include/caffe/common.hpp
+++ b/include/caffe/common.hpp
@@ -98,12 +98,12 @@ void GlobalInit(int* pargc, char*** pargv);
 class Caffe {
  public:
   ~Caffe();
-  inline static Caffe& Get() {
-    if (!singleton_.get()) {
-      singleton_.reset(new Caffe());
-    }
-    return *singleton_;
-  }
+
+  // Thread local context for Caffe. Moved to common.cpp instead of
+  // including boost/thread.hpp to avoid boost/NVCC issues (#1009, #1010)
+  // on OSX. Also fails on Linux with CUDA 7.0.18.
+  static Caffe& Get();
+
   enum Brew { CPU, GPU };
 
   // This random number generator facade hides boost and CUDA rng
@@ -149,6 +149,11 @@ class Caffe {
   static void SetDevice(const int device_id);
   // Prints the current GPU status.
   static void DeviceQuery();
+  // Parallel training info
+  inline static int solver_count() { return Get().solver_count_; }
+  inline static void set_solver_count(int val) { Get().solver_count_ = val; }
+  inline static bool root_solver() { return Get().root_solver_; }
+  inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
 
  protected:
 #ifndef CPU_ONLY
@@ -158,7 +163,8 @@ class Caffe {
   shared_ptr<RNG> random_generator_;
 
   Brew mode_;
-  static shared_ptr<Caffe> singleton_;
+  int solver_count_;
+  bool root_solver_;
 
  private:
   // The private constructor to avoid duplicate instantiation.
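
The hunk above turns the former process-wide singleton into a per-thread context: each worker thread that calls `Caffe::Get()` now carries its own mode, device, solver count and root-solver flag. As a rough, language-neutral illustration of that pattern (names are illustrative, not Caffe's), a Python analogue could look like:

    import threading

    _tls = threading.local()

    class Context(object):
        def __init__(self):
            self.mode = 'CPU'
            self.solver_count = 1
            self.root_solver = True

    def get():
        # Each thread lazily creates and then reuses its own context.
        if not hasattr(_tls, 'ctx'):
            _tls.ctx = Context()
        return _tls.ctx
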
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index d2c0ce6..8e64b3e 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -181,6 +181,44 @@ class EltwiseLayer : public Layer<Dtype> {
 };
 
 /**
+ * @brief A layer for learning "embeddings" of one-hot vector input.
+ *        Equivalent to an InnerProductLayer with one-hot vectors as input, but
+ *        for efficiency the input is the "hot" index of each column itself.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EmbedLayer : public Layer<Dtype> {
+ public:
+  explicit EmbedLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Embed"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+};
+
+/**
  * @brief Takes two+ Blobs, interprets last Blob as a selector and
  *  filter remaining Blobs accordingly with selector data (0 means that
  * the corresponding item has to be filtered, non-zero means that corresponding
@@ -606,6 +644,35 @@ class SliceLayer : public Layer<Dtype> {
   vector<int> slice_point_;
 };
 
+/**
+ * @brief Copy a Blob along specified dimensions.
+ */
+template <typename Dtype>
+class TileLayer : public Layer<Dtype> {
+ public:
+  explicit TileLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Tile"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  unsigned int axis_, tiles_, outer_dim_, inner_dim_;
+};
+
 }  // namespace caffe
 
 #endif  // CAFFE_COMMON_LAYERS_HPP_
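
The new `EmbedLayer` above is documented as equivalent to an `InnerProductLayer` whose input is a one-hot vector, except that only the index of the "hot" entry is passed in. A small NumPy sketch of that equivalence (shapes and names are illustrative):

    import numpy as np

    K, N = 5, 3                   # input (vocabulary) size, embedding dimension
    W = np.random.randn(K, N)     # the layer's weight matrix (embedding table)
    idx = np.array([2, 0, 4])     # "hot" indices, one per input item

    lookup = W[idx]               # embedding lookup (what EmbedLayer computes)
    one_hot = np.eye(K)[idx]      # explicit one-hot encoding of the indices
    assert np.allclose(lookup, one_hot.dot(W))   # same result as an inner product
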
diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp
index 3958cb7..552d814 100644
--- a/include/caffe/data_layers.hpp
+++ b/include/caffe/data_layers.hpp
@@ -5,16 +5,17 @@
 #include <utility>
 #include <vector>
 
-#include "boost/scoped_ptr.hpp"
 #include "hdf5.h"
 
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
+#include "caffe/data_reader.hpp"
 #include "caffe/data_transformer.hpp"
 #include "caffe/filler.hpp"
 #include "caffe/internal_thread.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/proto/caffe.pb.h"
+#include "caffe/util/blocking_queue.hpp"
 #include "caffe/util/db.hpp"
 
 namespace caffe {
@@ -33,6 +34,8 @@ class BaseDataLayer : public Layer<Dtype> {
   // This method may not be overridden except by the BasePrefetchingDataLayer.
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  // Data layers should be shared by multiple solvers in parallel
+  virtual inline bool ShareInParallel() const { return true; }
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
   // Data layers have no bottoms, so reshaping is trivial.
@@ -51,11 +54,16 @@ class BaseDataLayer : public Layer<Dtype> {
 };
 
 template <typename Dtype>
+class Batch {
+ public:
+  Blob<Dtype> data_, label_;
+};
+
+template <typename Dtype>
 class BasePrefetchingDataLayer :
     public BaseDataLayer<Dtype>, public InternalThread {
  public:
-  explicit BasePrefetchingDataLayer(const LayerParameter& param)
-      : BaseDataLayer<Dtype>(param) {}
+  explicit BasePrefetchingDataLayer(const LayerParameter& param);
   // LayerSetUp: implements common data layer setup functionality, and calls
   // DataLayerSetUp to do special data layer setup for individual layer types.
   // This method may not be overridden.
@@ -67,36 +75,38 @@ class BasePrefetchingDataLayer :
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
 
-  virtual void CreatePrefetchThread();
-  virtual void JoinPrefetchThread();
-  // The thread's function
-  virtual void InternalThreadEntry() {}
+  // Prefetches batches (asynchronously if to GPU memory)
+  static const int PREFETCH_COUNT = 3;
 
  protected:
-  Blob<Dtype> prefetch_data_;
-  Blob<Dtype> prefetch_label_;
+  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch) = 0;
+
+  Batch<Dtype> prefetch_[PREFETCH_COUNT];
+  BlockingQueue<Batch<Dtype>*> prefetch_free_;
+  BlockingQueue<Batch<Dtype>*> prefetch_full_;
+
   Blob<Dtype> transformed_data_;
 };
 
 template <typename Dtype>
 class DataLayer : public BasePrefetchingDataLayer<Dtype> {
  public:
-  explicit DataLayer(const LayerParameter& param)
-      : BasePrefetchingDataLayer<Dtype>(param) {}
+  explicit DataLayer(const LayerParameter& param);
   virtual ~DataLayer();
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-
+  // DataLayer uses DataReader instead for sharing for parallelism
+  virtual inline bool ShareInParallel() const { return false; }
   virtual inline const char* type() const { return "Data"; }
   virtual inline int ExactNumBottomBlobs() const { return 0; }
   virtual inline int MinTopBlobs() const { return 1; }
   virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
-  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch);
 
-  shared_ptr<db::DB> db_;
-  shared_ptr<db::Cursor> cursor_;
+  DataReader reader_;
 };
 
 /**
@@ -111,6 +121,8 @@ class DummyDataLayer : public Layer<Dtype> {
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  // Data layers should be shared by multiple solvers in parallel
+  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
@@ -144,6 +156,8 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual ~HDF5DataLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  // Data layers should be shared by multiple solvers in parallel
+  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
@@ -185,6 +199,8 @@ class HDF5OutputLayer : public Layer<Dtype> {
   virtual ~HDF5OutputLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  // Data layers should be shared by multiple solvers in parallel
+  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
@@ -235,7 +251,7 @@ class ImageDataLayer : public BasePrefetchingDataLayer<Dtype> {
  protected:
   shared_ptr<Caffe::RNG> prefetch_rng_;
   virtual void ShuffleImages();
-  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch);
 
   vector<std::pair<std::string, int> > lines_;
   int lines_id_;
@@ -307,7 +323,7 @@ class WindowDataLayer : public BasePrefetchingDataLayer<Dtype> {
 
  protected:
   virtual unsigned int PrefetchRand();
-  virtual void InternalThreadEntry();
+  virtual void load_batch(Batch<Dtype>* batch);
 
   shared_ptr<Caffe::RNG> prefetch_rng_;
   vector<std::pair<std::string, vector<int> > > image_database_;
diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp
new file mode 100644
index 0000000..8ed5542
--- /dev/null
+++ b/include/caffe/data_reader.hpp
@@ -0,0 +1,82 @@
+#ifndef CAFFE_DATA_READER_HPP_
+#define CAFFE_DATA_READER_HPP_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/util/blocking_queue.hpp"
+#include "caffe/util/db.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Reads data from a source to queues available to data layers.
+ * A single reading thread is created per source, even if multiple solvers
+ * are running in parallel, e.g. for multi-GPU training. This makes sure
+ * databases are read sequentially, and that each solver accesses a different
+ * subset of the database. Data is distributed to solvers in a round-robin
+ * way to keep parallel training deterministic.
+ */
+class DataReader {
+ public:
+  explicit DataReader(const LayerParameter& param);
+  ~DataReader();
+
+  inline BlockingQueue<Datum*>& free() const {
+    return queue_pair_->free_;
+  }
+  inline BlockingQueue<Datum*>& full() const {
+    return queue_pair_->full_;
+  }
+
+ protected:
+  // Queue pairs are shared between a body and its readers
+  class QueuePair {
+   public:
+    explicit QueuePair(int size);
+    ~QueuePair();
+
+    BlockingQueue<Datum*> free_;
+    BlockingQueue<Datum*> full_;
+
+  DISABLE_COPY_AND_ASSIGN(QueuePair);
+  };
+
+  // A single body is created per source
+  class Body : public InternalThread {
+   public:
+    explicit Body(const LayerParameter& param);
+    virtual ~Body();
+
+   protected:
+    void InternalThreadEntry();
+    void read_one(db::Cursor* cursor, QueuePair* qp);
+
+    const LayerParameter param_;
+    BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;
+
+    friend class DataReader;
+
+  DISABLE_COPY_AND_ASSIGN(Body);
+  };
+
+  // A source is uniquely identified by its layer name + path, in case
+  // the same database is read from two different locations in the net.
+  static inline string source_key(const LayerParameter& param) {
+    return param.name() + ":" + param.data_param().source();
+  }
+
+  const shared_ptr<QueuePair> queue_pair_;
+  shared_ptr<Body> body_;
+
+  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
+
+DISABLE_COPY_AND_ASSIGN(DataReader);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_DATA_READER_HPP_
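
The `DataReader` above moves data between threads through a pair of blocking queues: a `free_` queue of recycled buffers and a `full_` queue of filled ones, so a single reading thread per source can feed several solvers round-robin. A minimal Python sketch of that free/full queue pattern (illustrative only, not Caffe's code):

    import queue

    free, full = queue.Queue(), queue.Queue()
    for _ in range(4):              # pre-allocate a few reusable slots
        free.put({})

    def reader(records):            # runs in the single reading thread
        for rec in records:
            slot = free.get()       # reuse a slot instead of allocating
            slot['datum'] = rec
            full.put(slot)          # hand the filled slot to a consumer

    def consume_one():              # called from a solver/data-layer thread
        slot = full.get()           # blocks until data is available
        datum = slot.pop('datum')
        free.put(slot)              # return the slot for reuse
        return datum
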
diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
index 815ca54..6a8c5a0 100644
--- a/include/caffe/internal_thread.hpp
+++ b/include/caffe/internal_thread.hpp
@@ -14,18 +14,22 @@ namespace caffe {
 /**
  * Virtual class encapsulate boost::thread for use in base class
  * The child class will acquire the ability to run a single thread,
- * by reimplementing the virutal function InternalThreadEntry.
+ * by reimplementing the virtual function InternalThreadEntry.
  */
 class InternalThread {
  public:
   InternalThread() : thread_() {}
   virtual ~InternalThread();
 
-  /** Returns true if the thread was successfully started. **/
-  bool StartInternalThread();
+  /**
+   * Caffe's thread local state will be initialized using the current
+   * thread values, e.g. device id, solver index etc. The random seed
+   * is initialized using caffe_rng_rand.
+   */
+  void StartInternalThread();
 
   /** Will not return until the internal thread has exited. */
-  bool WaitForInternalThreadToExit();
+  void StopInternalThread();
 
   bool is_started() const;
 
@@ -34,6 +38,13 @@ class InternalThread {
       with the code you want your thread to run. */
   virtual void InternalThreadEntry() {}
 
+  /* Should be tested when running loops to exit when requested. */
+  bool must_stop();
+
+ private:
+  void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
+      bool root_solver);
+
   shared_ptr<boost::thread> thread_;
 };
 
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index e2eba19..a0d1d4e 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -11,6 +11,12 @@
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/device_alternate.hpp"
 
+/**
+ Forward declare boost::thread instead of including boost/thread.hpp
+ to avoid boost/NVCC issues (#1009, #1010) on OSX.
+ */
+namespace boost { class mutex; }
+
 namespace caffe {
 
 /**
@@ -32,7 +38,7 @@ class Layer {
    * layer.
    */
   explicit Layer(const LayerParameter& param)
-    : layer_param_(param) {
+    : layer_param_(param), is_shared_(false) {
       // Set phase and copy blobs (if there are any).
       phase_ = param.phase();
       if (layer_param_.blobs_size() > 0) {
@@ -60,6 +66,7 @@ class Layer {
    */
   void SetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+    InitMutex();
     CheckBlobCounts(bottom, top);
     LayerSetUp(bottom, top);
     Reshape(bottom, top);
@@ -86,7 +93,31 @@ class Layer {
       const vector<Blob<Dtype>*>& top) {}
 
   /**
-   * @brief Adjust the shapes of top blobs and internal buffers to accomodate
+   * @brief Whether a layer should be shared by multiple nets during data
+   *        parallelism. By default, all layers except for data layers should
+   *        not be shared. Data layers should be shared to ensure each worker
+   *        solver access data sequentially during data parallelism.
+   */
+  virtual inline bool ShareInParallel() const { return false; }
+
+  /** @brief Return whether this layer is actually shared by other nets.
+   *         If ShareInParallel() is true and using more than one GPU and the
+   *         net has TRAIN phase, then this function is expected to return true.
+   */
+  inline bool IsShared() const { return is_shared_; }
+
+  /** @brief Set whether this layer is actually shared by other nets
+   *         If ShareInParallel() is true and using more than one GPU and the
+   *         net has TRAIN phase, then is_shared should be set true.
+   */
+  inline void SetShared(bool is_shared) {
+    CHECK(ShareInParallel() || !is_shared)
+        << type() << "Layer does not support sharing.";
+    is_shared_ = is_shared;
+  }
+
+  /**
+   * @brief Adjust the shapes of top blobs and internal buffers to accommodate
    *        the shapes of the bottom blobs.
    *
    * @param bottom the input blobs, with the requested input shapes
@@ -95,7 +126,7 @@ class Layer {
    * This method should reshape top blobs as needed according to the shapes
    * of the bottom (input) blobs, as well as reshaping any internal buffers
    * and making any other necessary adjustments so that the layer can
-   * accomodate the bottom blobs.
+   * accommodate the bottom blobs.
    */
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) = 0;
@@ -396,6 +427,20 @@ class Layer {
     }
   }
 
+ private:
+  /** Whether this layer is actually shared by other nets*/
+  bool is_shared_;
+
+  /** The mutex for sequential forward if this layer is shared */
+  shared_ptr<boost::mutex> forward_mutex_;
+
+  /** Initialize forward_mutex_ */
+  void InitMutex();
+  /** Lock forward_mutex_ if this layer is shared */
+  void Lock();
+  /** Unlock forward_mutex_ if this layer is shared */
+  void Unlock();
+
   DISABLE_COPY_AND_ASSIGN(Layer);
 };  // class Layer
 
@@ -405,6 +450,8 @@ class Layer {
 template <typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
+  // Lock during forward to ensure sequential forward
+  Lock();
   Dtype loss = 0;
   Reshape(bottom, top);
   switch (Caffe::mode()) {
@@ -435,6 +482,7 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
   default:
     LOG(FATAL) << "Unknown caffe mode.";
   }
+  Unlock();
   return loss;
 }
 
diff --git a/include/caffe/layer_factory.hpp b/include/caffe/layer_factory.hpp
index 2fcd938..2c2fde4 100644
--- a/include/caffe/layer_factory.hpp
+++ b/include/caffe/layer_factory.hpp
@@ -41,6 +41,7 @@
 
 #include <map>
 #include <string>
+#include <vector>
 
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
@@ -71,30 +72,42 @@ class LayerRegistry {
 
   // Get a layer using a LayerParameter.
   static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
-    LOG(INFO) << "Creating layer " << param.name();
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Creating layer " << param.name();
+    }
     const string& type = param.type();
     CreatorRegistry& registry = Registry();
     CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
-        << " (known types: " << LayerTypeList() << ")";
+        << " (known types: " << LayerTypeListString() << ")";
     return registry[type](param);
   }
 
+  static vector<string> LayerTypeList() {
+    CreatorRegistry& registry = Registry();
+    vector<string> layer_types;
+    for (typename CreatorRegistry::iterator iter = registry.begin();
+         iter != registry.end(); ++iter) {
+      layer_types.push_back(iter->first);
+    }
+    return layer_types;
+  }
+
  private:
   // Layer registry should never be instantiated - everything is done with its
   // static variables.
   LayerRegistry() {}
 
-  static string LayerTypeList() {
-    CreatorRegistry& registry = Registry();
-    string layer_types;
-    for (typename CreatorRegistry::iterator iter = registry.begin();
-         iter != registry.end(); ++iter) {
-      if (iter != registry.begin()) {
-        layer_types += ", ";
+  static string LayerTypeListString() {
+    vector<string> layer_types = LayerTypeList();
+    string layer_types_str;
+    for (vector<string>::iterator iter = layer_types.begin();
+         iter != layer_types.end(); ++iter) {
+      if (iter != layer_types.begin()) {
+        layer_types_str += ", ";
       }
-      layer_types += iter->first;
+      layer_types_str += *iter;
     }
-    return layer_types;
+    return layer_types_str;
   }
 };
 
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index 5282663..8d41af3 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -39,7 +39,11 @@ class AccuracyLayer : public Layer<Dtype> {
 
   virtual inline const char* type() const { return "Accuracy"; }
   virtual inline int ExactNumBottomBlobs() const { return 2; }
-  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+  // If there are two top blobs, then the second blob will contain
+  // accuracies per class.
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
   /**
@@ -86,6 +90,8 @@ class AccuracyLayer : public Layer<Dtype> {
   bool has_ignore_label_;
   /// The label indicating that an instance should be ignored.
   int ignore_label_;
+  /// Keeps counts of the number of samples per class.
+  Blob<Dtype> nums_buffer_;
 };
 
 /**
@@ -706,7 +712,6 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
   virtual inline int MaxTopBlobs() const { return 2; }
 
  protected:
-  /// @copydoc SoftmaxWithLossLayer
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
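With the AccuracyLayer change above, a second top blob now receives per-class accuracies. A hedged sketch of reading them back through the Net API; the blob name "accuracy_per_class" is a hypothetical name chosen in the prototxt:

    // Assumes an initialized caffe::Net<float> named net.
    const boost::shared_ptr<caffe::Blob<float> > per_class =
        net.blob_by_name("accuracy_per_class");
    for (int c = 0; c < per_class->count(); ++c) {
      LOG(INFO) << "class " << c << " accuracy: " << per_class->cpu_data()[c];
    }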
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 5665df1..1bf07d2 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -23,8 +23,9 @@ namespace caffe {
 template <typename Dtype>
 class Net {
  public:
-  explicit Net(const NetParameter& param);
-  explicit Net(const string& param_file, Phase phase);
+  explicit Net(const NetParameter& param, const Net* root_net = NULL);
+  explicit Net(const string& param_file, Phase phase,
+      const Net* root_net = NULL);
   virtual ~Net() {}
 
   /// @brief Initialize a network with a NetParameter.
@@ -58,6 +59,12 @@ class Net {
   string Forward(const string& input_blob_protos, Dtype* loss = NULL);
 
   /**
+   * @brief Zeroes out the diffs of all net parameters.
+   *        Should be run before Backward.
+   */
+  void ClearParamDiffs();
+
+  /**
    * The network backward should take no input and output, since it solely
    * computes the gradient w.r.t the parameters, and the data has already been
    * provided during the forward pass.
@@ -84,6 +91,13 @@ class Net {
 
   /// @brief Updates the network weights based on the diff values computed.
   void Update();
+  /**
+   * @brief Shares weight data of owner blobs with shared blobs.
+   *
+   * Note: this is called by Net::Init, and thus should normally not be
+   * called manually.
+   */
+  void ShareWeights();
 
   /**
    * @brief For an already initialized net, implicitly copies (i.e., using no
@@ -98,8 +112,12 @@ class Net {
    */
   void CopyTrainedLayersFrom(const NetParameter& param);
   void CopyTrainedLayersFrom(const string trained_filename);
+  void CopyTrainedLayersFromBinaryProto(const string trained_filename);
+  void CopyTrainedLayersFromHDF5(const string trained_filename);
   /// @brief Writes the net to a proto.
   void ToProto(NetParameter* param, bool write_diff = false) const;
+  /// @brief Writes the net to an HDF5 file.
+  void ToHDF5(const string& filename, bool write_diff = false) const;
 
   /// @brief returns the network name.
   inline const string& name() const { return name_; }
@@ -144,11 +162,19 @@ class Net {
   inline const vector<shared_ptr<Blob<Dtype> > >& params() const {
     return params_;
   }
-  /// @brief returns the parameter learning rate multipliers
+  inline const vector<Blob<Dtype>*>& learnable_params() const {
+    return learnable_params_;
+  }
+  /// @brief returns the learnable parameter learning rate multipliers
   inline const vector<float>& params_lr() const { return params_lr_; }
+  inline const vector<bool>& has_params_lr() const { return has_params_lr_; }
+  /// @brief returns the learnable parameter decay multipliers
   inline const vector<float>& params_weight_decay() const {
     return params_weight_decay_;
   }
+  inline const vector<bool>& has_params_decay() const {
+    return has_params_decay_;
+  }
   const map<string, int>& param_names_index() const {
     return param_names_index_;
   }
@@ -209,9 +235,6 @@ class Net {
   /// @brief Helper for displaying debug info in Update.
   void UpdateDebugInfo(const int param_id);
 
-  /// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
-  void GetLearningRateAndWeightDecay();
-
   /// @brief The network name
   string name_;
   /// @brief The phase: TRAIN or TEST
@@ -250,15 +273,27 @@ class Net {
   vector<Blob<Dtype>*> net_output_blobs_;
   /// The parameters in the network.
   vector<shared_ptr<Blob<Dtype> > > params_;
-  /// the learning rate multipliers
+  vector<Blob<Dtype>*> learnable_params_;
+  /**
+   * The mapping from params_ -> learnable_params_: we have
+   * learnable_param_ids_.size() == params_.size(),
+   * and learnable_params_[learnable_param_ids_[i]] == params_[i].get()
+   * if and only if params_[i] is an "owner"; otherwise, params_[i] is a sharer
+   * and learnable_params_[learnable_param_ids_[i]] gives its owner.
+   */
+  vector<int> learnable_param_ids_;
+  /// the learning rate multipliers for learnable_params_
   vector<float> params_lr_;
-  /// the weight decay multipliers
+  vector<bool> has_params_lr_;
+  /// the weight decay multipliers for learnable_params_
   vector<float> params_weight_decay_;
+  vector<bool> has_params_decay_;
   /// The bytes of memory used by this net
   size_t memory_used_;
   /// Whether to compute and display debug info for the net.
   bool debug_info_;
-
+  /// The root net that actually holds the shared layers in data parallelism
+  const Net* const root_net_;
   DISABLE_COPY_AND_ASSIGN(Net);
 };
 
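Besides the binary-proto path, weights can now be snapshotted to and restored from HDF5. A minimal sketch under assumed file names:

    caffe::Net<float> net("deploy.prototxt", caffe::TEST);
    net.CopyTrainedLayersFromHDF5("weights.caffemodel.h5");  // restore weights
    net.ToHDF5("weights_copy.caffemodel.h5");                 // save weights (no diffs)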
diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp
new file mode 100644
index 0000000..85fc2b5
--- /dev/null
+++ b/include/caffe/parallel.hpp
@@ -0,0 +1,118 @@
+#ifndef CAFFE_PARALLEL_HPP_
+#define CAFFE_PARALLEL_HPP_
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/internal_thread.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/solver.hpp"
+#include "caffe/syncedmem.hpp"
+#include "caffe/util/blocking_queue.hpp"
+
+namespace caffe {
+
+// Represents a net's parameters. Once a net is created, its parameter buffers can
+// be replaced by ones from Params, to allow parallelization. Params ensures
+// parameters are allocated in one consecutive array.
+template<typename Dtype>
+class Params {
+ public:
+  explicit Params(shared_ptr<Solver<Dtype> > root_solver);
+  virtual ~Params() {
+  }
+
+  inline size_t size() const {
+    return size_;
+  }
+  inline Dtype* data() const {
+    return data_;
+  }
+  inline Dtype* diff() const {
+    return diff_;
+  }
+
+ protected:
+  const size_t size_;           // Size of buffers
+  Dtype* data_;                 // Network parameters
+  Dtype* diff_;                 // Gradient
+
+DISABLE_COPY_AND_ASSIGN(Params);
+};
+
+// Params stored in GPU memory.
+template<typename Dtype>
+class GPUParams : public Params<Dtype> {
+ public:
+  GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device);
+  virtual ~GPUParams();
+
+  void configure(Solver<Dtype>* solver) const;
+
+ protected:
+  using Params<Dtype>::size_;
+  using Params<Dtype>::data_;
+  using Params<Dtype>::diff_;
+};
+
+class DevicePair {
+ public:
+  DevicePair(int parent, int device)
+      : parent_(parent),
+        device_(device) {
+  }
+  inline int parent() {
+    return parent_;
+  }
+  inline int device() {
+    return device_;
+  }
+
+  // Group GPUs in pairs, by proximity depending on the machine's topology.
+  static void compute(const vector<int> devices, vector<DevicePair>* pairs);
+
+ protected:
+  int parent_;
+  int device_;
+};
+
+// Synchronous data parallelism using map-reduce between local GPUs.
+template<typename Dtype>
+class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
+    public InternalThread {
+ public:
+  explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
+                   P2PSync<Dtype>* parent, const SolverParameter& param);
+  virtual ~P2PSync();
+
+  inline const shared_ptr<Solver<Dtype> >& solver() const {
+    return solver_;
+  }
+
+  void run(const vector<int>& gpus);
+
+ protected:
+  void on_start();
+  void on_gradients_ready();
+
+  void InternalThreadEntry();
+
+  P2PSync<Dtype>* parent_;
+  vector<P2PSync<Dtype>*> children_;
+  BlockingQueue<P2PSync<Dtype>*> queue_;
+  const int initial_iter_;
+  Dtype* parent_grads_;
+  shared_ptr<Solver<Dtype> > solver_;
+
+  using Params<Dtype>::size_;
+  using Params<Dtype>::data_;
+  using Params<Dtype>::diff_;
+};
+
+}  // namespace caffe
+
+#endif
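A rough sketch of how these classes fit together for multi-GPU training, loosely mirroring what the caffe command-line tool does; the GPU ids and the set_solver_count call are illustrative assumptions:

    caffe::SolverParameter solver_param;        // assume parsed from a solver prototxt
    std::vector<int> gpus;
    gpus.push_back(0);
    gpus.push_back(1);
    caffe::Caffe::set_solver_count(gpus.size());

    boost::shared_ptr<caffe::Solver<float> > root(
        caffe::GetSolver<float>(solver_param));
    caffe::P2PSync<float> sync(root, NULL, root->param());
    sync.run(gpus);   // spawns a worker solver per additional GPU and trains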
diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp
index 19cf18c..c43c1e8 100644
--- a/include/caffe/python_layer.hpp
+++ b/include/caffe/python_layer.hpp
@@ -18,22 +18,17 @@ class PythonLayer : public Layer<Dtype> {
 
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("setup")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
+    self_.attr("param_str") = bp::str(
+        this->layer_param_.python_param().param_str());
+    self_.attr("setup")(bottom, top);
   }
-
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("reshape")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
+    self_.attr("reshape")(bottom, top);
+  }
+
+  virtual inline bool ShareInParallel() const {
+    return this->layer_param_.python_param().share_in_parallel();
   }
 
   virtual inline const char* type() const { return "Python"; }
@@ -41,21 +36,11 @@ class PythonLayer : public Layer<Dtype> {
  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-    try {
-      self_.attr("forward")(bottom, top);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
+    self_.attr("forward")(bottom, top);
   }
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-    try {
-      self_.attr("backward")(top, propagate_down, bottom);
-    } catch (bp::error_already_set) {
-      PyErr_Print();
-      throw;
-    }
+    self_.attr("backward")(top, propagate_down, bottom);
   }
 
  private:
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index c2ced48..aba3e03 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -1,6 +1,6 @@
 #ifndef CAFFE_OPTIMIZATION_SOLVER_HPP_
 #define CAFFE_OPTIMIZATION_SOLVER_HPP_
-
+#include <boost/function.hpp>
 #include <string>
 #include <vector>
 
@@ -9,6 +9,28 @@
 namespace caffe {
 
 /**
+  * @brief Enumeration of actions that a client of the Solver may request by
+  * implementing the Solver's action request function, which a
+  * a client may optionally provide in order to request early termination
+  * or saving a snapshot without exiting. In the executable caffe, this
+  * mechanism is used to allow the snapshot to be saved when stopping
+  * execution with a SIGINT (Ctrl-C).
+  */
+  namespace SolverAction {
+    enum Enum {
+      NONE = 0,  // Take no special action.
+      STOP = 1,  // Stop training. snapshot_after_train controls whether a
+                 // snapshot is created.
+      SNAPSHOT = 2  // Take a snapshot, and keep training.
+    };
+  }
+
+/**
+ * @brief Type of a function that returns a Solver Action enumeration.
+ */
+typedef boost::function<SolverAction::Enum()> ActionCallback;
+
+/**
  * @brief An interface for classes that perform optimization on Net%s.
  *
  * Requires implementation of ApplyUpdate to compute a parameter update
@@ -17,27 +39,49 @@ namespace caffe {
 template <typename Dtype>
 class Solver {
  public:
-  explicit Solver(const SolverParameter& param);
-  explicit Solver(const string& param_file);
+  explicit Solver(const SolverParameter& param,
+      const Solver* root_solver = NULL);
+  explicit Solver(const string& param_file, const Solver* root_solver = NULL);
   void Init(const SolverParameter& param);
   void InitTrainNet();
   void InitTestNets();
+
+  // Client of the Solver optionally may call this in order to set the function
+  // that the solver uses to see what action it should take (e.g. snapshot or
+  // exit training early).
+  void SetActionFunction(ActionCallback func);
+  SolverAction::Enum GetRequestedAction();
   // The main entry of the solver function. In default, iter will be zero. Pass
   // in a non-zero iter number to resume training for a pre-trained net.
   virtual void Solve(const char* resume_file = NULL);
   inline void Solve(const string resume_file) { Solve(resume_file.c_str()); }
   void Step(int iters);
-  // The Restore function implements how one should restore the solver to a
-  // previously snapshotted state. You should implement the RestoreSolverState()
-  // function that restores the state from a SolverState protocol buffer.
+  // The Restore method simply dispatches to one of the
+  // RestoreSolverStateFrom___ protected methods. You should implement these
+  // methods to restore the state from the appropriate snapshot type.
   void Restore(const char* resume_file);
   virtual ~Solver() {}
+  inline const SolverParameter& param() const { return param_; }
   inline shared_ptr<Net<Dtype> > net() { return net_; }
   inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
     return test_nets_;
   }
   int iter() { return iter_; }
 
+  // Invoked at specific points during an iteration
+  class Callback {
+   protected:
+    virtual void on_start() = 0;
+    virtual void on_gradients_ready() = 0;
+
+    template <typename T>
+    friend class Solver;
+  };
+  const vector<Callback*>& callbacks() const { return callbacks_; }
+  void add_callback(Callback* value) {
+    callbacks_.push_back(value);
+  }
+
  protected:
   // Make and apply the update value for the current iteration.
   virtual void ApplyUpdate() = 0;
@@ -46,11 +90,15 @@ class Solver {
   // function that produces a SolverState protocol buffer that needs to be
   // written to disk together with the learned net.
   void Snapshot();
+  string SnapshotFilename(const string extension);
+  string SnapshotToBinaryProto();
+  string SnapshotToHDF5();
   // The test routine
   void TestAll();
   void Test(const int test_net_id = 0);
-  virtual void SnapshotSolverState(SolverState* state) = 0;
-  virtual void RestoreSolverState(const SolverState& state) = 0;
+  virtual void SnapshotSolverState(const string& model_filename) = 0;
+  virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0;
+  virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0;
   void DisplayOutputBlobs(const int net_id);
 
   SolverParameter param_;
@@ -58,10 +106,45 @@ class Solver {
   int current_step_;
   shared_ptr<Net<Dtype> > net_;
   vector<shared_ptr<Net<Dtype> > > test_nets_;
+  vector<Callback*> callbacks_;
+
+  // The root solver that holds root nets (actually containing shared layers)
+  // in data parallelism
+  const Solver* const root_solver_;
+
+  // A function that can be set by a client of the Solver to provide indication
+  // that it wants a snapshot saved and/or to exit early.
+  ActionCallback action_request_function_;
+
+  // True iff a request to stop early was received.
+  bool requested_early_exit_;
 
   DISABLE_COPY_AND_ASSIGN(Solver);
 };
 
+/**
+ * @brief Solver that only computes gradients, used as worker
+ *        for multi-GPU training.
+ */
+template <typename Dtype>
+class WorkerSolver : public Solver<Dtype> {
+ public:
+  explicit WorkerSolver(const SolverParameter& param,
+      const Solver<Dtype>* root_solver = NULL)
+      : Solver<Dtype>(param, root_solver) {}
+
+ protected:
+  void ApplyUpdate() {}
+  void SnapshotSolverState(const string& model_filename) {
+    LOG(FATAL) << "Should not be called on worker solver.";
+  }
+  void RestoreSolverStateFromBinaryProto(const string& state_file) {
+    LOG(FATAL) << "Should not be called on worker solver.";
+  }
+  void RestoreSolverStateFromHDF5(const string& state_file) {
+    LOG(FATAL) << "Should not be called on worker solver.";
+  }
+};
 
 /**
  * @brief Optimizes the parameters of a Net using
@@ -85,8 +168,11 @@ class SGDSolver : public Solver<Dtype> {
   virtual void Regularize(int param_id);
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
   virtual void ClipGradients();
-  virtual void SnapshotSolverState(SolverState * state);
-  virtual void RestoreSolverState(const SolverState& state);
+  virtual void SnapshotSolverState(const string& model_filename);
+  virtual void SnapshotSolverStateToBinaryProto(const string& model_filename);
+  virtual void SnapshotSolverStateToHDF5(const string& model_filename);
+  virtual void RestoreSolverStateFromHDF5(const string& state_file);
+  virtual void RestoreSolverStateFromBinaryProto(const string& state_file);
   // history maintains the historical momentum data.
   // update maintains update related data and is not needed in snapshots.
   // temp maintains other information that might be needed in computation
@@ -128,6 +214,67 @@ class AdaGradSolver : public SGDSolver<Dtype> {
   DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
 };
 
+
+template <typename Dtype>
+class RMSPropSolver : public SGDSolver<Dtype> {
+ public:
+  explicit RMSPropSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
+  explicit RMSPropSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
+
+ protected:
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+  void constructor_sanity_check() {
+    CHECK_EQ(0, this->param_.momentum())
+        << "Momentum cannot be used with RMSProp.";
+    CHECK_GE(this->param_.rms_decay(), 0)
+        << "rms_decay should lie between 0 and 1.";
+    CHECK_LT(this->param_.rms_decay(), 1)
+        << "rms_decay should lie between 0 and 1.";
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RMSPropSolver);
+};
+
+template <typename Dtype>
+class AdaDeltaSolver : public SGDSolver<Dtype> {
+ public:
+  explicit AdaDeltaSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
+  explicit AdaDeltaSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
+
+ protected:
+  void AdaDeltaPreSolve();
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+  DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
+};
+
+/**
+ * @brief AdamSolver, an algorithm for first-order gradient-based optimization
+ *        of stochastic objective functions, based on adaptive estimates of
+ *        lower-order moments. Described in [1].
+ *
+ * [1] D. P. Kingma and J. L. Ba, "ADAM: A Method for Stochastic Optimization."
+ *     arXiv preprint arXiv:1412.6980v8 (2014).
+ */
+template <typename Dtype>
+class AdamSolver : public SGDSolver<Dtype> {
+ public:
+  explicit AdamSolver(const SolverParameter& param)
+      : SGDSolver<Dtype>(param) { AdamPreSolve();}
+  explicit AdamSolver(const string& param_file)
+      : SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
+
+ protected:
+  void AdamPreSolve();
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
+
+  DISABLE_COPY_AND_ASSIGN(AdamSolver);
+};
+
 template <typename Dtype>
 Solver<Dtype>* GetSolver(const SolverParameter& param) {
   SolverParameter_SolverType type = param.solver_type();
@@ -139,6 +286,12 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
       return new NesterovSolver<Dtype>(param);
   case SolverParameter_SolverType_ADAGRAD:
       return new AdaGradSolver<Dtype>(param);
+  case SolverParameter_SolverType_RMSPROP:
+      return new RMSPropSolver<Dtype>(param);
+  case SolverParameter_SolverType_ADADELTA:
+      return new AdaDeltaSolver<Dtype>(param);
+  case SolverParameter_SolverType_ADAM:
+      return new AdamSolver<Dtype>(param);
   default:
       LOG(FATAL) << "Unknown SolverType: " << type;
   }
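A short sketch of wiring the new action-callback mechanism to a custom flag; the flag and function names are hypothetical, and the caffe binary instead uses the SignalHandler added later in this patch:

    static bool stop_requested = false;   // hypothetical flag set elsewhere

    caffe::SolverAction::Enum CheckStopFlag() {
      return stop_requested ? caffe::SolverAction::STOP
                            : caffe::SolverAction::NONE;
    }

    // ... after constructing the solver:
    solver->SetActionFunction(CheckStopFlag);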
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 1b726de..62aadef 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -8,26 +8,29 @@
 
 namespace caffe {
 
-// Theoretically, CaffeMallocHost and CaffeFreeHost should simply call the
-// cudaMallocHost and cudaFree functions in order to create pinned memory.
-// However, those codes rely on the existence of a cuda GPU (I don't know
-// why that is a must since allocating memory should not be accessing the
-// GPU resource, but it just creates an error as of Cuda 5.0) and will cause
-// problem when running on a machine without GPU. Thus, we simply define
-// these two functions for safety and possible future change if the problem
-// of calling cuda functions disappears in a future version.
-//
-// In practice, although we are creating unpinned memory here, as long as we
-// are constantly accessing them the memory pages almost always stays in
-// the physical memory (assuming we have large enough memory installed), and
-// does not seem to create a memory bottleneck here.
-
+// If CUDA is available and in GPU mode, host memory will be allocated pinned,
+// using cudaMallocHost. It avoids dynamic pinning for transfers (DMA).
+// The improvement in performance seems negligible in the single GPU case,
+// but might be more significant for parallel training. Most importantly,
+// it improved stability for large models on many GPUs.
 inline void CaffeMallocHost(void** ptr, size_t size) {
+#ifndef CPU_ONLY
+  if (Caffe::mode() == Caffe::GPU) {
+    CUDA_CHECK(cudaMallocHost(ptr, size));
+    return;
+  }
+#endif
   *ptr = malloc(size);
   CHECK(*ptr) << "host allocation of size " << size << " failed";
 }
 
 inline void CaffeFreeHost(void* ptr) {
+#ifndef CPU_ONLY
+  if (Caffe::mode() == Caffe::GPU) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+    return;
+  }
+#endif
   free(ptr);
 }
 
@@ -42,20 +45,25 @@ class SyncedMemory {
  public:
   SyncedMemory()
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
-        own_cpu_data_(false) {}
+        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
   explicit SyncedMemory(size_t size)
       : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
-        own_cpu_data_(false) {}
+        own_cpu_data_(false), own_gpu_data_(false), gpu_device_(-1) {}
   ~SyncedMemory();
   const void* cpu_data();
   void set_cpu_data(void* data);
   const void* gpu_data();
+  void set_gpu_data(void* data);
   void* mutable_cpu_data();
   void* mutable_gpu_data();
   enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
   SyncedHead head() { return head_; }
   size_t size() { return size_; }
 
+#ifndef CPU_ONLY
+  void async_gpu_push(const cudaStream_t& stream);
+#endif
+
  private:
   void to_cpu();
   void to_gpu();
@@ -64,6 +72,8 @@ class SyncedMemory {
   size_t size_;
   SyncedHead head_;
   bool own_cpu_data_;
+  bool own_gpu_data_;
+  int gpu_device_;
 
   DISABLE_COPY_AND_ASSIGN(SyncedMemory);
 };  // class SyncedMemory
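A minimal sketch of the usual SyncedMemory round trip under the new ownership flags; the size is illustrative:

    caffe::SyncedMemory mem(16 * sizeof(float));
    float* cpu = static_cast<float*>(mem.mutable_cpu_data());  // head -> HEAD_AT_CPU
    cpu[0] = 1.0f;
    const void* gpu = mem.gpu_data();   // lazily copies host -> device, head -> SYNCED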
diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index cc5dcba..25f35d1 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -45,6 +45,10 @@ class GradientChecker {
   void CheckGradientEltwise(Layer<Dtype>* layer,
       const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 
+  // Checks the gradient of a single output with respect to particular input
+  // blob(s).  If check_bottom = i >= 0, check only the ith bottom Blob.
+  // If check_bottom == -1, check everything -- all bottom Blobs and all
+  // param Blobs.  Otherwise (if check_bottom < -1), check only param Blobs.
   void CheckGradientSingle(Layer<Dtype>* layer,
       const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
       int check_bottom, int top_id, int top_data_id, bool element_wise = false);
@@ -83,21 +87,22 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
   // First, figure out what blobs we need to check against, and zero init
   // parameter blobs.
   vector<Blob<Dtype>*> blobs_to_check;
-  vector<bool> propagate_down(bottom.size(), check_bottom < 0);
+  vector<bool> propagate_down(bottom.size(), check_bottom == -1);
   for (int i = 0; i < layer->blobs().size(); ++i) {
     Blob<Dtype>* blob = layer->blobs()[i].get();
     caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
     blobs_to_check.push_back(blob);
   }
-  if (check_bottom < 0) {
+  if (check_bottom == -1) {
     for (int i = 0; i < bottom.size(); ++i) {
       blobs_to_check.push_back(bottom[i]);
     }
-  } else {
+  } else if (check_bottom >= 0) {
     CHECK_LT(check_bottom, bottom.size());
     blobs_to_check.push_back(bottom[check_bottom]);
     propagate_down[check_bottom] = true;
   }
+  CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check.";
   // Compute the gradient analytically using Backward
   Caffe::set_random_seed(seed_);
   // Ignore the loss from the layer (it's just the weighted sum of the losses
diff --git a/include/caffe/util/blocking_queue.hpp b/include/caffe/util/blocking_queue.hpp
new file mode 100644
index 0000000..955e12c
--- /dev/null
+++ b/include/caffe/util/blocking_queue.hpp
@@ -0,0 +1,47 @@
+#ifndef CAFFE_UTIL_BLOCKING_QUEUE_HPP_
+#define CAFFE_UTIL_BLOCKING_QUEUE_HPP_
+
+#include <queue>
+#include <string>
+
+#include "caffe/common.hpp"
+
+namespace caffe {
+
+template<typename T>
+class BlockingQueue {
+ public:
+  explicit BlockingQueue();
+
+  void push(const T& t);
+
+  bool try_pop(T* t);
+
+  // This logs a message if the thread needs to block;
+  // useful for detecting e.g. when data feeding is too slow.
+  T pop(const string& log_on_wait = "");
+
+  bool try_peek(T* t);
+
+  // Return element without removing it
+  T peek();
+
+  size_t size() const;
+
+ protected:
+  /**
+   Move synchronization fields out instead of including boost/thread.hpp
+   to avoid boost/NVCC issues (#1009, #1010) on OSX. Also fails on
+   Linux CUDA 7.0.18.
+   */
+  class sync;
+
+  std::queue<T> queue_;
+  shared_ptr<sync> sync_;
+
+DISABLE_COPY_AND_ASSIGN(BlockingQueue);
+};
+
+}  // namespace caffe
+
+#endif
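A small producer/consumer sketch of BlockingQueue, in the spirit of the free/full Datum queues used by the data reader; the element type and message are illustrative:

    caffe::BlockingQueue<int> queue;
    queue.push(42);                             // producer side
    int value = queue.pop("waiting on data");   // blocks; logs the message if it waits
    int next;
    bool available = queue.try_pop(&next);      // non-blocking variant, false here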
diff --git a/include/caffe/util/gpu_util.cuh b/include/caffe/util/gpu_util.cuh
new file mode 100644
index 0000000..994202f
--- /dev/null
+++ b/include/caffe/util/gpu_util.cuh
@@ -0,0 +1,35 @@
+#ifndef CAFFE_UTIL_GPU_UTIL_H_
+#define CAFFE_UTIL_GPU_UTIL_H_
+
+namespace caffe {
+
+template <typename Dtype>
+inline __device__ Dtype caffe_gpu_atomic_add(const Dtype val, Dtype* address);
+
+template <>
+inline __device__
+float caffe_gpu_atomic_add(const float val, float* address) {
+  return atomicAdd(address, val);
+}
+
+// double atomicAdd implementation taken from:
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3PVCpVsEG
+template <>
+inline __device__
+double caffe_gpu_atomic_add(const double val, double* address) {
+  unsigned long long int* address_as_ull =  // NOLINT(runtime/int)
+      // NOLINT_NEXT_LINE(runtime/int)
+      reinterpret_cast<unsigned long long int*>(address);
+  unsigned long long int old = *address_as_ull;  // NOLINT(runtime/int)
+  unsigned long long int assumed;  // NOLINT(runtime/int)
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+        __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_GPU_UTIL_H_
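A hedged CUDA sketch of using caffe_gpu_atomic_add from a kernel to accumulate into a single output cell; the kernel itself is illustrative, and CUDA_KERNEL_LOOP is Caffe's usual grid-stride macro:

    template <typename Dtype>
    __global__ void SumKernel(const int n, const Dtype* in, Dtype* out) {
      CUDA_KERNEL_LOOP(index, n) {
        // Safe concurrent accumulation for both float and double.
        caffe_gpu_atomic_add(in[index], out);
      }
    }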
diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp
new file mode 100644
index 0000000..ce568c5
--- /dev/null
+++ b/include/caffe/util/hdf5.hpp
@@ -0,0 +1,39 @@
+#ifndef CAFFE_UTIL_HDF5_H_
+#define CAFFE_UTIL_HDF5_H_
+
+#include <string>
+
+#include "hdf5.h"
+#include "hdf5_hl.h"
+
+#include "caffe/blob.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void hdf5_load_nd_dataset_helper(
+    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+    Blob<Dtype>* blob);
+
+template <typename Dtype>
+void hdf5_load_nd_dataset(
+    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+    Blob<Dtype>* blob);
+
+template <typename Dtype>
+void hdf5_save_nd_dataset(
+    const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob,
+    bool write_diff = false);
+
+int hdf5_load_int(hid_t loc_id, const string& dataset_name);
+void hdf5_save_int(hid_t loc_id, const string& dataset_name, int i);
+string hdf5_load_string(hid_t loc_id, const string& dataset_name);
+void hdf5_save_string(hid_t loc_id, const string& dataset_name,
+                      const string& s);
+
+int hdf5_get_num_links(hid_t loc_id);
+string hdf5_get_name_by_idx(hid_t loc_id, int idx);
+
+}  // namespace caffe
+
+#endif   // CAFFE_UTIL_HDF5_H_
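A short sketch of the new HDF5 helpers; the file and dataset names are illustrative:

    hid_t file_id = H5Fcreate("blob.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    caffe::Blob<float> blob(1, 1, 2, 2);                  // tiny example blob
    caffe::hdf5_save_nd_dataset(file_id, "data", blob);   // data only, no diffs
    caffe::hdf5_save_int(file_id, "iter", 100);
    H5Fclose(file_id);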
diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp
index 3a62c3c..c0938ad 100644
--- a/include/caffe/util/io.hpp
+++ b/include/caffe/util/io.hpp
@@ -5,15 +5,11 @@
 #include <string>
 
 #include "google/protobuf/message.h"
-#include "hdf5.h"
-#include "hdf5_hl.h"
 
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
 
-#define HDF5_NUM_DIMS 4
-
 namespace caffe {
 
 using ::google::protobuf::Message;
@@ -140,20 +136,6 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);
 
 void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);
 
-template <typename Dtype>
-void hdf5_load_nd_dataset_helper(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
-
-template <typename Dtype>
-void hdf5_load_nd_dataset(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob);
-
-template <typename Dtype>
-void hdf5_save_nd_dataset(
-    const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);
-
 }  // namespace caffe
 
 #endif   // CAFFE_UTIL_IO_H_
diff --git a/include/caffe/util/signal_handler.h b/include/caffe/util/signal_handler.h
new file mode 100644
index 0000000..fb84c65
--- /dev/null
+++ b/include/caffe/util/signal_handler.h
@@ -0,0 +1,24 @@
+#ifndef INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_
+#define INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_
+
+#include "caffe/proto/caffe.pb.h"
+#include "caffe/solver.hpp"
+
+namespace caffe {
+
+class SignalHandler {
+ public:
+  // Constructor. Specify what action to take when a signal is received.
+  SignalHandler(SolverAction::Enum SIGINT_action,
+                SolverAction::Enum SIGHUP_action);
+  ~SignalHandler();
+  ActionCallback GetActionFunction();
+ private:
+  SolverAction::Enum CheckForSignals() const;
+  SolverAction::Enum SIGINT_action_;
+  SolverAction::Enum SIGHUP_action_;
+};
+
+}  // namespace caffe
+
+#endif  // INCLUDE_CAFFE_UTIL_SIGNAL_HANDLER_H_
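A sketch of how SignalHandler is meant to plug into the solver's action callback, roughly as the caffe tool does; the choice of actions is illustrative, and solver is an assumed Solver<float> pointer:

    caffe::SignalHandler handler(caffe::SolverAction::STOP,       // SIGINT: stop training
                                 caffe::SolverAction::SNAPSHOT);  // SIGHUP: snapshot, continue
    solver->SetActionFunction(handler.GetActionFunction());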
diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp
index a6bd86a..211e3d9 100644
--- a/include/caffe/vision_layers.hpp
+++ b/include/caffe/vision_layers.hpp
@@ -471,13 +471,7 @@ class SPPLayer : public Layer<Dtype> {
 
   virtual inline const char* type() const { return "SPP"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 1; }
-  // MAX POOL layers can output an extra top blob for the mask;
-  // others can only output the pooled inputs.
-  virtual inline int MaxTopBlobs() const {
-    return (this->layer_param_.pooling_param().pool() ==
-            PoolingParameter_PoolMethod_MAX) ? 2 : 1;
-  }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
 
  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
@@ -491,9 +485,11 @@ class SPPLayer : public Layer<Dtype> {
 
   int pyramid_height_;
   int bottom_h_, bottom_w_;
+  int num_;
   int channels_;
   int kernel_h_, kernel_w_;
   int pad_h_, pad_w_;
+  bool reshaped_first_time_;
 
   /// the internal Split layer that feeds the pooling layers
   shared_ptr<SplitLayer<Dtype> > split_layer_;
diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt
index 4b0d549..f420df8 100644
--- a/matlab/CMakeLists.txt
+++ b/matlab/CMakeLists.txt
@@ -43,7 +43,7 @@ string(REPLACE ";" ";-L" link_folders "-L${folders}")
 string(REPLACE ";" ":"  rpath_folders   "${folders}")
 
 if(build_using MATCHES "Matlab")
-  set(libflags -lcaffe${CAffe_POSTFIX} ${libflags}) # Matlab R2014a complans for -Wl,--whole-archive
+  set(libflags -lcaffe${Caffe_POSTFIX} ${libflags}) # Matlab R2014a complains about -Wl,--whole-archive
 
   caffe_fetch_and_set_proper_mexext(Matlab_caffe_mex)
   add_custom_command(OUTPUT ${Matlab_caffe_mex} COMMAND ${Matlab_mex}
@@ -56,7 +56,7 @@ elseif(build_using MATCHES "Octave")
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
     set(libflags -Wl,-force_load,$<TARGET_LINKER_FILE:caffe> ${libflags})
   elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-    set(libflags -Wl,--whole-archive -lcaffe${CAffe_POSTFIX} -Wl,--no-whole-archive ${libflags})
+    set(libflags -Wl,--whole-archive -lcaffe${Caffe_POSTFIX} -Wl,--no-whole-archive ${libflags})
   endif()
 
   add_custom_command(OUTPUT ${Matlab_caffe_mex} COMMAND ${Octave_compiler}
diff --git a/models/bvlc_alexnet/deploy.prototxt b/models/bvlc_alexnet/deploy.prototxt
index ced055b..ff10daa 100644
--- a/models/bvlc_alexnet/deploy.prototxt
+++ b/models/bvlc_alexnet/deploy.prototxt
@@ -1,9 +1,11 @@
 name: "AlexNet"
 input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 227
-input_dim: 227
+input_shape {
+  dim: 10
+  dim: 3
+  dim: 227
+  dim: 227
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/models/bvlc_googlenet/deploy.prototxt b/models/bvlc_googlenet/deploy.prototxt
index 4648bf2..1f90ee2 100644
--- a/models/bvlc_googlenet/deploy.prototxt
+++ b/models/bvlc_googlenet/deploy.prototxt
@@ -1,9 +1,11 @@
 name: "GoogleNet"
 input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 224
-input_dim: 224
+input_shape {
+  dim: 10
+  dim: 3
+  dim: 224
+  dim: 224
+}
 layer {
   name: "conv1/7x7_s2"
   type: "Convolution"
diff --git a/models/bvlc_reference_caffenet/deploy.prototxt b/models/bvlc_reference_caffenet/deploy.prototxt
index 29ccf14..127f1e2 100644
--- a/models/bvlc_reference_caffenet/deploy.prototxt
+++ b/models/bvlc_reference_caffenet/deploy.prototxt
@@ -1,9 +1,11 @@
 name: "CaffeNet"
 input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 227
-input_dim: 227
+input_shape {
+  dim: 10
+  dim: 3
+  dim: 227
+  dim: 227
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/models/bvlc_reference_rcnn_ilsvrc13/deploy.prototxt b/models/bvlc_reference_rcnn_ilsvrc13/deploy.prototxt
index ea9cf98..ae1df96 100644
--- a/models/bvlc_reference_rcnn_ilsvrc13/deploy.prototxt
+++ b/models/bvlc_reference_rcnn_ilsvrc13/deploy.prototxt
@@ -1,9 +1,11 @@
 name: "R-CNN-ilsvrc13"
 input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 227
-input_dim: 227
+input_shape {
+  dim: 10
+  dim: 3
+  dim: 227
+  dim: 227
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/models/finetune_flickr_style/deploy.prototxt b/models/finetune_flickr_style/deploy.prototxt
index 4a924f7..0f07e47 100644
--- a/models/finetune_flickr_style/deploy.prototxt
+++ b/models/finetune_flickr_style/deploy.prototxt
@@ -1,9 +1,11 @@
 name: "FlickrStyleCaffeNet"
 input: "data"
-input_dim: 10
-input_dim: 3
-input_dim: 227
-input_dim: 227
+input_shape {
+  dim: 10
+  dim: 3
+  dim: 227
+  dim: 227
+}
 layer {
   name: "conv1"
   type: "Convolution"
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index df0401d..0e2bc7e 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -18,7 +18,7 @@ if(UNIX OR APPLE)
                        COMMAND ${CMAKE_COMMAND} -E make_directory ${PROJECT_SOURCE_DIR}/python/caffe/proto
                        COMMAND touch ${PROJECT_SOURCE_DIR}/python/caffe/proto/__init__.py
                        COMMAND cp ${proto_gen_folder}/*.py ${PROJECT_SOURCE_DIR}/python/caffe/proto/
-                       COMMENT "Creating symlink ${__linkname} -> ${PROJECT_BINARY_DIR}/lib/_caffe${CAffe_POSTFIX}.so")
+                       COMMENT "Creating symlink ${__linkname} -> ${PROJECT_BINARY_DIR}/lib/_caffe${Caffe_POSTFIX}.so")
 endif()
 
 # ---[ Install
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index 1b2da51..6cc44e7 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,5 @@
 from .pycaffe import Net, SGDSolver
-from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver
+from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
 from .detector import Detector
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index dff7f62..cc49f60 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -190,6 +190,21 @@ bp::object Blob_Reshape(bp::tuple args, bp::dict kwargs) {
   return bp::object();
 }
 
+bp::object BlobVec_add_blob(bp::tuple args, bp::dict kwargs) {
+  if (bp::len(kwargs) > 0) {
+    throw std::runtime_error("BlobVec.add_blob takes no kwargs");
+  }
+  typedef vector<shared_ptr<Blob<Dtype> > > BlobVec;
+  BlobVec* self = bp::extract<BlobVec*>(args[0]);
+  vector<int> shape(bp::len(args) - 1);
+  for (int i = 1; i < bp::len(args); ++i) {
+    shape[i - 1] = bp::extract<int>(args[i]);
+  }
+  self->push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  // We need to explicitly return None to use bp::raw_function.
+  return bp::object();
+}
+
 BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
 
 BOOST_PYTHON_MODULE(_caffe) {
@@ -200,6 +215,8 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::def("set_mode_gpu", &set_mode_gpu);
   bp::def("set_device", &Caffe::SetDevice);
 
+  bp::def("layer_type_list", &LayerRegistry<Dtype>::LayerTypeList);
+
   bp::class_<Net<Dtype>, shared_ptr<Net<Dtype> >, boost::noncopyable >("Net",
     bp::no_init)
     .def("__init__", bp::make_constructor(&Net_Init))
@@ -211,6 +228,8 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def("copy_from", static_cast<void (Net<Dtype>::*)(const string)>(
         &Net<Dtype>::CopyTrainedLayersFrom))
     .def("share_with", &Net<Dtype>::ShareTrainedLayersWith)
+    .add_property("_blob_loss_weights", bp::make_function(
+        &Net<Dtype>::blob_loss_weights, bp::return_internal_reference<>()))
     .add_property("_blobs", bp::make_function(&Net<Dtype>::blobs,
         bp::return_internal_reference<>()))
     .add_property("layers", bp::make_function(&Net<Dtype>::layers,
@@ -230,6 +249,11 @@ BOOST_PYTHON_MODULE(_caffe) {
 
   bp::class_<Blob<Dtype>, shared_ptr<Blob<Dtype> >, boost::noncopyable>(
     "Blob", bp::no_init)
+    .add_property("shape",
+        bp::make_function(
+            static_cast<const vector<int>& (Blob<Dtype>::*)() const>(
+                &Blob<Dtype>::shape),
+            bp::return_value_policy<bp::copy_const_reference>()))
     .add_property("num",      &Blob<Dtype>::num)
     .add_property("channels", &Blob<Dtype>::channels)
     .add_property("height",   &Blob<Dtype>::height)
@@ -279,7 +303,8 @@ BOOST_PYTHON_MODULE(_caffe) {
 
   // vector wrappers for all the vector types we use
   bp::class_<vector<shared_ptr<Blob<Dtype> > > >("BlobVec")
-    .def(bp::vector_indexing_suite<vector<shared_ptr<Blob<Dtype> > >, true>());
+    .def(bp::vector_indexing_suite<vector<shared_ptr<Blob<Dtype> > >, true>())
+    .def("add_blob", bp::raw_function(&BlobVec_add_blob));
   bp::class_<vector<Blob<Dtype>*> >("RawBlobVec")
     .def(bp::vector_indexing_suite<vector<Blob<Dtype>*>, true>());
   bp::class_<vector<shared_ptr<Layer<Dtype> > > >("LayerVec")
@@ -288,6 +313,8 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def(bp::vector_indexing_suite<vector<string> >());
   bp::class_<vector<int> >("IntVec")
     .def(bp::vector_indexing_suite<vector<int> >());
+  bp::class_<vector<Dtype> >("DtypeVec")
+    .def(bp::vector_indexing_suite<vector<Dtype> >());
   bp::class_<vector<shared_ptr<Net<Dtype> > > >("NetVec")
     .def(bp::vector_indexing_suite<vector<shared_ptr<Net<Dtype> > >, true>());
   bp::class_<vector<bool> >("BoolVec")
diff --git a/python/caffe/draw.py b/python/caffe/draw.py
index 324929d..a002b60 100644
--- a/python/caffe/draw.py
+++ b/python/caffe/draw.py
@@ -40,7 +40,7 @@ def get_edge_label(layer):
 
     if layer.type == 'Data':
         edge_label = 'Batch ' + str(layer.data_param.batch_size)
-    elif layer.type == 'Convolution':
+    elif layer.type == 'Convolution' or layer.type == 'Deconvolution':
         edge_label = str(layer.convolution_param.num_output)
     elif layer.type == 'InnerProduct':
         edge_label = str(layer.inner_product_param.num_output)
@@ -74,7 +74,7 @@ def get_layer_label(layer, rankdir):
         # horizontal space is not; separate words with newlines
         separator = '\\n'
 
-    if layer.type == 'Convolution':
+    if layer.type == 'Convolution' or layer.type == 'Deconvolution':
         # Outer double quotes needed or else colon characters don't parse
         # properly
         node_label = '"%s%s(%s)%skernel size: %d%sstride: %d%spad: %d"' %\
@@ -109,7 +109,7 @@ def choose_color_by_layertype(layertype):
     """Define colors for nodes based on the layer type.
     """
     color = '#6495ED'  # Default
-    if layertype == 'Convolution':
+    if layertype == 'Convolution' or layertype == 'Deconvolution':
         color = '#FF5050'
     elif layertype == 'Pooling':
         color = '#FF9900'
diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py
index 1b4814a..77a0e00 100644
--- a/python/caffe/net_spec.py
+++ b/python/caffe/net_spec.py
@@ -1,7 +1,7 @@
 """Python net specification.
 
 This module provides a way to write nets directly in Python, using a natural,
-functional style. See examples/python_nets/caffenet.py for an example.
+functional style. See examples/pycaffe/caffenet.py for an example.
 
 Currently this works as a thin wrapper around the Python protobuf interface,
 with layers and parameters automatically generated for the "layers" and
@@ -18,7 +18,7 @@ for specifying nets. In particular, the automatically generated layer names
 are not guaranteed to be forward-compatible.
 """
 
-from collections import OrderedDict
+from collections import OrderedDict, Counter
 
 from .proto import caffe_pb2
 from google import protobuf
@@ -44,10 +44,8 @@ def to_proto(*tops):
     """Generate a NetParameter that contains all layers needed to compute
     all arguments."""
 
-    if not isinstance(tops, tuple):
-        tops = (tops,)
     layers = OrderedDict()
-    autonames = {}
+    autonames = Counter()
     for top in tops:
         top.fn._to_proto(layers, {}, autonames)
     net = caffe_pb2.NetParameter()
@@ -89,6 +87,9 @@ class Top(object):
 
         return to_proto(self)
 
+    def _to_proto(self, layers, names, autonames):
+        return self.fn._to_proto(layers, names, autonames)
+
 
 class Function(object):
     """A Function specifies a layer, its parameters, and its inputs (which
@@ -107,11 +108,18 @@ class Function(object):
             del self.params['in_place']
         self.tops = tuple(Top(self, n) for n in range(self.ntop))
 
-    def _get_name(self, top, names, autonames):
+    def _get_name(self, names, autonames):
+        if self not in names and self.ntop > 0:
+            names[self] = self._get_top_name(self.tops[0], names, autonames)
+        elif self not in names:
+            autonames[self.type_name] += 1
+            names[self] = self.type_name + str(autonames[self.type_name])
+        return names[self]
+
+    def _get_top_name(self, top, names, autonames):
         if top not in names:
-            n = autonames.setdefault(top.fn.type_name, 1)
             autonames[top.fn.type_name] += 1
-            names[top] = top.fn.type_name + str(n)
+            names[top] = top.fn.type_name + str(autonames[top.fn.type_name])
         return names[top]
 
     def _to_proto(self, layers, names, autonames):
@@ -119,7 +127,7 @@ class Function(object):
             return
         bottom_names = []
         for inp in self.inputs:
-            inp.fn._to_proto(layers, names, autonames)
+            inp._to_proto(layers, names, autonames)
             bottom_names.append(layers[inp.fn].top[inp.n])
         layer = caffe_pb2.LayerParameter()
         layer.type = self.type_name
@@ -129,8 +137,8 @@ class Function(object):
             layer.top.extend(layer.bottom)
         else:
             for top in self.tops:
-                layer.top.append(self._get_name(top, names, autonames))
-        layer.name = self._get_name(self.tops[0], names, autonames)
+                layer.top.append(self._get_top_name(top, names, autonames))
+        layer.name = self._get_name(names, autonames)
 
         for k, v in six.iteritems(self.params):
             # special case to handle generic *params
@@ -163,10 +171,10 @@ class NetSpec(object):
 
     def to_proto(self):
         names = {v: k for k, v in six.iteritems(self.tops)}
-        autonames = {}
+        autonames = Counter()
         layers = OrderedDict()
         for name, top in six.iteritems(self.tops):
-            top.fn._to_proto(layers, names, autonames)
+            top._to_proto(layers, names, autonames)
         net = caffe_pb2.NetParameter()
         net.layer.extend(layers.values())
         return net
@@ -180,7 +188,9 @@ class Layers(object):
     def __getattr__(self, name):
         def layer_fn(*args, **kwargs):
             fn = Function(name, args, kwargs)
-            if fn.ntop == 1:
+            if fn.ntop == 0:
+                return fn
+            elif fn.ntop == 1:
                 return fn.tops[0]
             else:
                 return fn.tops
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index e8a676a..4f980a9 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -28,6 +28,15 @@ def _Net_blobs(self):
 
 
 @property
+def _Net_blob_loss_weights(self):
+    """
+    An OrderedDict (bottom to top, i.e., input to output) of network
+    blob loss weights indexed by name
+    """
+    return OrderedDict(zip(self._blob_names, self._blob_loss_weights))
+
+
+@property
 def _Net_params(self):
     """
     An OrderedDict (bottom to top, i.e., input to output) of network
@@ -270,6 +279,7 @@ def _Net_batch(self, blobs):
 
 # Attach methods to Net.
 Net.blobs = _Net_blobs
+Net.blob_loss_weights = _Net_blob_loss_weights
 Net.params = _Net_params
 Net.forward = _Net_forward
 Net.backward = _Net_backward
diff --git a/python/caffe/test/test_layer_type_list.py b/python/caffe/test/test_layer_type_list.py
new file mode 100644
index 0000000..7edc80d
--- /dev/null
+++ b/python/caffe/test/test_layer_type_list.py
@@ -0,0 +1,10 @@
+import unittest
+
+import caffe
+
+class TestLayerTypeList(unittest.TestCase):
+
+    def test_standard_types(self):
+        for type_name in ['Data', 'Convolution', 'InnerProduct']:
+            self.assertIn(type_name, caffe.layer_type_list(),
+                    '%s not in layer_type_list()' % type_name)
diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py
index 909a101..b4595e6 100644
--- a/python/caffe/test/test_net_spec.py
+++ b/python/caffe/test/test_net_spec.py
@@ -41,6 +41,14 @@ def anon_lenet(batch_size):
     loss = L.SoftmaxWithLoss(ip2, label)
     return loss.to_proto()
 
+def silent_net():
+    n = caffe.NetSpec()
+    n.data, n.data2 = L.DummyData(shape=[dict(dim=[3]), dict(dim=[4, 2])],
+                                  ntop=2)
+    n.silence_data = L.Silence(n.data, ntop=0)
+    n.silence_data2 = L.Silence(n.data2, ntop=0)
+    return n.to_proto()
+
 class TestNetSpec(unittest.TestCase):
     def load_net(self, net_proto):
         f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
@@ -65,3 +73,10 @@ class TestNetSpec(unittest.TestCase):
                 net_proto.layer[6].top)
         net = self.load_net(net_proto)
         self.assertEqual(len(net.layers), 9)
+
+    def test_zero_tops(self):
+        """Test net construction for top-less layers."""
+
+        net_proto = silent_net()
+        net = self.load_net(net_proto)
+        self.assertEqual(len(net.forward()), 0)
diff --git a/python/caffe/test/test_python_layer.py b/python/caffe/test/test_python_layer.py
index f41e283..8ed8665 100644
--- a/python/caffe/test/test_python_layer.py
+++ b/python/caffe/test/test_python_layer.py
@@ -22,6 +22,28 @@ class SimpleLayer(caffe.Layer):
         bottom[0].diff[...] = 10 * top[0].diff
 
 
+class ExceptionLayer(caffe.Layer):
+    """A layer for checking exceptions from Python"""
+
+    def setup(self, bottom, top):
+        raise RuntimeError
+
+class ParameterLayer(caffe.Layer):
+    """A layer that just multiplies by ten"""
+
+    def setup(self, bottom, top):
+        self.blobs.add_blob(1)
+        self.blobs[0].data[0] = 0
+
+    def reshape(self, bottom, top):
+        top[0].reshape(*bottom[0].data.shape)
+
+    def forward(self, bottom, top):
+        pass
+
+    def backward(self, top, propagate_down, bottom):
+        self.blobs[0].diff[0] = 1
+
 def python_net_file():
     with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f:
         f.write("""name: 'pythonnet' force_backward: true
@@ -35,6 +57,26 @@ def python_net_file():
         return f.name
 
 
+def exception_net_file():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f:
+        f.write("""name: 'pythonnet' force_backward: true
+        input: 'data' input_shape { dim: 10 dim: 9 dim: 8 }
+        layer { type: 'Python' name: 'layer' bottom: 'data' top: 'top'
+          python_param { module: 'test_python_layer' layer: 'ExceptionLayer' } }
+          """)
+        return f.name
+
+
+def parameter_net_file():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f:
+        f.write("""name: 'pythonnet' force_backward: true
+        input: 'data' input_shape { dim: 10 dim: 9 dim: 8 }
+        layer { type: 'Python' name: 'layer' bottom: 'data' top: 'top'
+          python_param { module: 'test_python_layer' layer: 'ParameterLayer' } }
+          """)
+        return f.name
+
+
 class TestPythonLayer(unittest.TestCase):
     def setUp(self):
         net_file = python_net_file()
@@ -62,3 +104,37 @@ class TestPythonLayer(unittest.TestCase):
         for blob in six.itervalues(self.net.blobs):
             for d in blob.data.shape:
                 self.assertEqual(s, d)
+
+    def test_exception(self):
+        net_file = exception_net_file()
+        self.assertRaises(RuntimeError, caffe.Net, net_file, caffe.TEST)
+        os.remove(net_file)
+
+    def test_parameter(self):
+        net_file = parameter_net_file()
+        net = caffe.Net(net_file, caffe.TRAIN)
+        # Test forward and backward
+        net.forward()
+        net.backward()
+        layer = net.layers[list(net._layer_names).index('layer')]
+        self.assertEqual(layer.blobs[0].data[0], 0)
+        self.assertEqual(layer.blobs[0].diff[0], 1)
+        layer.blobs[0].data[0] += layer.blobs[0].diff[0]
+        self.assertEqual(layer.blobs[0].data[0], 1)
+
+        # Test saving and loading
+        h, caffemodel_file = tempfile.mkstemp()
+        net.save(caffemodel_file)
+        layer.blobs[0].data[0] = -1
+        self.assertEqual(layer.blobs[0].data[0], -1)
+        net.copy_from(caffemodel_file)
+        self.assertEqual(layer.blobs[0].data[0], 1)
+        os.remove(caffemodel_file)
+
+        # Test weight sharing
+        net2 = caffe.Net(net_file, caffe.TRAIN)
+        net2.share_with(net)
+        layer = net2.layers[list(net2._layer_names).index('layer')]
+        self.assertEqual(layer.blobs[0].data[0], 1)
+
+        os.remove(net_file)
diff --git a/python/caffe/test/test_python_layer_with_param_str.py b/python/caffe/test/test_python_layer_with_param_str.py
new file mode 100644
index 0000000..3d0f107
--- /dev/null
+++ b/python/caffe/test/test_python_layer_with_param_str.py
@@ -0,0 +1,59 @@
+import unittest
+import tempfile
+import os
+import six
+
+import caffe
+
+
+class SimpleParamLayer(caffe.Layer):
+    """A layer that just multiplies by the numeric value of its param string"""
+
+    def setup(self, bottom, top):
+        try:
+            self.value = float(self.param_str)
+        except ValueError:
+            raise ValueError("Parameter string must be a legible float")
+
+    def reshape(self, bottom, top):
+        top[0].reshape(*bottom[0].data.shape)
+
+    def forward(self, bottom, top):
+        top[0].data[...] = self.value * bottom[0].data
+
+    def backward(self, top, propagate_down, bottom):
+        bottom[0].diff[...] = self.value * top[0].diff
+
+
+def python_param_net_file():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as f:
+        f.write("""name: 'pythonnet' force_backward: true
+        input: 'data' input_shape { dim: 10 dim: 9 dim: 8 }
+        layer { type: 'Python' name: 'mul10' bottom: 'data' top: 'mul10'
+          python_param { module: 'test_python_layer_with_param_str'
+                layer: 'SimpleParamLayer' param_str: '10' } }
+        layer { type: 'Python' name: 'mul2' bottom: 'mul10' top: 'mul2'
+          python_param { module: 'test_python_layer_with_param_str'
+                layer: 'SimpleParamLayer' param_str: '2' } }""")
+        return f.name
+
+
+class TestLayerWithParam(unittest.TestCase):
+    def setUp(self):
+        net_file = python_param_net_file()
+        self.net = caffe.Net(net_file, caffe.TRAIN)
+        os.remove(net_file)
+
+    def test_forward(self):
+        x = 8
+        self.net.blobs['data'].data[...] = x
+        self.net.forward()
+        for y in self.net.blobs['mul2'].data.flat:
+            self.assertEqual(y, 2 * 10 * x)
+
+    def test_backward(self):
+        x = 7
+        self.net.blobs['mul2'].diff[...] = x
+        self.net.backward()
+        for y in self.net.blobs['data'].diff.flat:
+            self.assertEqual(y, 2 * 10 * x)
diff --git a/scripts/download_model_binary.py b/scripts/download_model_binary.py
index 48e9015..03a50f6 100755
--- a/scripts/download_model_binary.py
+++ b/scripts/download_model_binary.py
@@ -32,7 +32,7 @@ def parse_readme_frontmatter(dirname):
     with open(readme_filename) as f:
         lines = [line.strip() for line in f.readlines()]
     top = lines.index('---')
-    bottom = lines[top + 1:].index('---')
+    bottom = lines.index('---', top + 1)
     frontmatter = yaml.load('\n'.join(lines[top + 1:bottom]))
     assert all(key in frontmatter for key in required_keys)
     return dirname, frontmatter
diff --git a/scripts/download_model_from_gist.sh b/scripts/download_model_from_gist.sh
index a1dccf7..89527b7 100755
--- a/scripts/download_model_from_gist.sh
+++ b/scripts/download_model_from_gist.sh
@@ -18,7 +18,7 @@ fi
 
 echo "Downloading Caffe model info to $MODEL_DIR ..."
 mkdir -p $MODEL_DIR
-wget https://gist.github.com/$GIST/download -O $MODEL_DIR/gist.tar.gz
-tar xzf $MODEL_DIR/gist.tar.gz --directory=$MODEL_DIR --strip-components=1
-rm $MODEL_DIR/gist.tar.gz
+wget https://gist.github.com/$GIST/download -O $MODEL_DIR/gist.zip
+unzip -j $MODEL_DIR/gist.zip -d $MODEL_DIR
+rm $MODEL_DIR/gist.zip
 echo "Done"
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 94fdcc3..8450aa1 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -456,10 +456,25 @@ void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
   }
   // copy data
   Dtype* data_vec = mutable_cpu_data();
-  for (int i = 0; i < count_; ++i) {
-    data_vec[i] = proto.data(i);
+  if (proto.double_data_size() > 0) {
+    CHECK_EQ(count_, proto.double_data_size());
+    for (int i = 0; i < count_; ++i) {
+      data_vec[i] = proto.double_data(i);
+    }
+  } else {
+    CHECK_EQ(count_, proto.data_size());
+    for (int i = 0; i < count_; ++i) {
+      data_vec[i] = proto.data(i);
+    }
   }
-  if (proto.diff_size() > 0) {
+  if (proto.double_diff_size() > 0) {
+    CHECK_EQ(count_, proto.double_diff_size());
+    Dtype* diff_vec = mutable_cpu_diff();
+    for (int i = 0; i < count_; ++i) {
+      diff_vec[i] = proto.double_diff(i);
+    }
+  } else if (proto.diff_size() > 0) {
+    CHECK_EQ(count_, proto.diff_size());
     Dtype* diff_vec = mutable_cpu_diff();
     for (int i = 0; i < count_; ++i) {
       diff_vec[i] = proto.diff(i);
@@ -467,20 +482,40 @@ void Blob<Dtype>::FromProto(const BlobProto& proto, bool reshape) {
   }
 }
 
-template <typename Dtype>
-void Blob<Dtype>::ToProto(BlobProto* proto, bool write_diff) const {
+template <>
+void Blob<double>::ToProto(BlobProto* proto, bool write_diff) const {
+  proto->clear_shape();
+  for (int i = 0; i < shape_.size(); ++i) {
+    proto->mutable_shape()->add_dim(shape_[i]);
+  }
+  proto->clear_double_data();
+  proto->clear_double_diff();
+  const double* data_vec = cpu_data();
+  for (int i = 0; i < count_; ++i) {
+    proto->add_double_data(data_vec[i]);
+  }
+  if (write_diff) {
+    const double* diff_vec = cpu_diff();
+    for (int i = 0; i < count_; ++i) {
+      proto->add_double_diff(diff_vec[i]);
+    }
+  }
+}
+
+template <>
+void Blob<float>::ToProto(BlobProto* proto, bool write_diff) const {
   proto->clear_shape();
   for (int i = 0; i < shape_.size(); ++i) {
     proto->mutable_shape()->add_dim(shape_[i]);
   }
   proto->clear_data();
   proto->clear_diff();
-  const Dtype* data_vec = cpu_data();
+  const float* data_vec = cpu_data();
   for (int i = 0; i < count_; ++i) {
     proto->add_data(data_vec[i]);
   }
   if (write_diff) {
-    const Dtype* diff_vec = cpu_diff();
+    const float* diff_vec = cpu_diff();
     for (int i = 0; i < count_; ++i) {
       proto->add_diff(diff_vec[i]);
     }
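
With the specializations above, float blobs keep serializing to the `data`/`diff` fields while double blobs use the new `double_data`/`double_diff` fields, and `FromProto` prefers the double fields when present. A reader that wants to cope with either encoding can follow the same rule; a rough Python sketch, assuming the generated `caffe_pb2` protobuf module is importable and using an invented file name:

    import numpy as np
    from caffe.proto import caffe_pb2

    blob = caffe_pb2.BlobProto()
    with open('some_blob.binaryproto', 'rb') as f:   # hypothetical file
        blob.ParseFromString(f.read())

    # Prefer the double fields if populated, mirroring FromProto above.
    data = np.array(blob.double_data if len(blob.double_data) else blob.data)
    if blob.HasField('shape'):
        data = data.reshape(tuple(blob.shape.dim))
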
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index af96cac..299d67d 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -1,4 +1,6 @@
+#include <boost/thread.hpp>
 #include <glog/logging.h>
+#include <cmath>
 #include <cstdio>
 #include <ctime>
 
@@ -7,7 +9,15 @@
 
 namespace caffe {
 
-shared_ptr<Caffe> Caffe::singleton_;
+// Make sure each thread can have its own Caffe instance with its own state.
+static boost::thread_specific_ptr<Caffe> thread_instance_;
+
+Caffe& Caffe::Get() {
+  if (!thread_instance_.get()) {
+    thread_instance_.reset(new Caffe());
+  }
+  return *(thread_instance_.get());
+}
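
Replacing the single shared instance with a `boost::thread_specific_ptr` means every thread lazily gets its own Caffe object, which is what later lets prefetch and worker threads carry their own mode, random seed and solver flags. As a loose analogy only (not part of the patch), the same pattern in Python:

    import threading

    _local = threading.local()

    class Context(object):
        def __init__(self):
            self.mode = 'CPU'
            self.solver_count = 1
            self.root_solver = True

    def get():
        # One lazily created Context per thread, like Caffe::Get() above.
        if not hasattr(_local, 'ctx'):
            _local.ctx = Context()
        return _local.ctx
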
 
 // random seeding
 int64_t cluster_seedgen(void) {
@@ -25,7 +35,7 @@ int64_t cluster_seedgen(void) {
 
   pid = getpid();
   s = time(NULL);
-  seed = abs(((s * 181) * ((pid - 83) * 359)) % 104729);
+  seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729);
   return seed;
 }
 
@@ -42,7 +52,8 @@ void GlobalInit(int* pargc, char*** pargv) {
 #ifdef CPU_ONLY  // CPU-only Caffe.
 
 Caffe::Caffe()
-    : random_generator_(), mode_(Caffe::CPU) { }
+    : random_generator_(), mode_(Caffe::CPU),
+      solver_count_(1), root_solver_(true) { }
 
 Caffe::~Caffe() { }
 
@@ -86,7 +97,7 @@ void* Caffe::RNG::generator() {
 
 Caffe::Caffe()
     : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
-    mode_(Caffe::CPU) {
+    mode_(Caffe::CPU), solver_count_(1), root_solver_(true) {
   // Try to create a cublas handler, and report an error if failed (but we will
   // keep the program running as one might just want to run CPU code).
   if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp
new file mode 100644
index 0000000..1637820
--- /dev/null
+++ b/src/caffe/data_reader.cpp
@@ -0,0 +1,119 @@
+#include <boost/thread.hpp>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "caffe/common.hpp"
+#include "caffe/data_layers.hpp"
+#include "caffe/data_reader.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+using boost::weak_ptr;
+
+map<const string, weak_ptr<DataReader::Body> > DataReader::bodies_;
+static boost::mutex bodies_mutex_;
+
+DataReader::DataReader(const LayerParameter& param)
+    : queue_pair_(new QueuePair(  //
+        param.data_param().prefetch() * param.data_param().batch_size())) {
+  // Get or create a body
+  boost::mutex::scoped_lock lock(bodies_mutex_);
+  string key = source_key(param);
+  weak_ptr<Body>& weak = bodies_[key];
+  body_ = weak.lock();
+  if (!body_) {
+    body_.reset(new Body(param));
+    bodies_[key] = weak_ptr<Body>(body_);
+  }
+  body_->new_queue_pairs_.push(queue_pair_);
+}
+
+DataReader::~DataReader() {
+  string key = source_key(body_->param_);
+  body_.reset();
+  boost::mutex::scoped_lock lock(bodies_mutex_);
+  if (bodies_[key].expired()) {
+    bodies_.erase(key);
+  }
+}
+
+//
+
+DataReader::QueuePair::QueuePair(int size) {
+  // Initialize the free queue with the requested number of datums
+  for (int i = 0; i < size; ++i) {
+    free_.push(new Datum());
+  }
+}
+
+DataReader::QueuePair::~QueuePair() {
+  Datum* datum;
+  while (free_.try_pop(&datum)) {
+    delete datum;
+  }
+  while (full_.try_pop(&datum)) {
+    delete datum;
+  }
+}
+
+//
+
+DataReader::Body::Body(const LayerParameter& param)
+    : param_(param),
+      new_queue_pairs_() {
+  StartInternalThread();
+}
+
+DataReader::Body::~Body() {
+  StopInternalThread();
+}
+
+void DataReader::Body::InternalThreadEntry() {
+  shared_ptr<db::DB> db(db::GetDB(param_.data_param().backend()));
+  db->Open(param_.data_param().source(), db::READ);
+  shared_ptr<db::Cursor> cursor(db->NewCursor());
+  vector<shared_ptr<QueuePair> > qps;
+  try {
+    int solver_count = param_.phase() == TRAIN ? Caffe::solver_count() : 1;
+
+    // To ensure deterministic runs, only start running once all solvers
+    // are ready. But solvers need to peek at one item during initialization,
+    // so read one item, then wait for the next solver.
+    for (int i = 0; i < solver_count; ++i) {
+      shared_ptr<QueuePair> qp(new_queue_pairs_.pop());
+      read_one(cursor.get(), qp.get());
+      qps.push_back(qp);
+    }
+    // Main loop
+    while (!must_stop()) {
+      for (int i = 0; i < solver_count; ++i) {
+        read_one(cursor.get(), qps[i].get());
+      }
+      // Check that no additional readers have been created. This can happen
+      // if more than one net is trained at a time per process, whether with
+      // a single solver or multiple solvers. It can also happen if two data
+      // layers have the same name and source.
+      CHECK_EQ(new_queue_pairs_.size(), 0);
+    }
+  } catch (boost::thread_interrupted&) {
+    // Interrupted exception is expected on shutdown
+  }
+}
+
+void DataReader::Body::read_one(db::Cursor* cursor, QueuePair* qp) {
+  Datum* datum = qp->free_.pop();
+  // TODO deserialize in-place instead of copy?
+  datum->ParseFromString(cursor->value());
+  qp->full_.push(datum);
+
+  // go to the next iter
+  cursor->Next();
+  if (!cursor->valid()) {
+    DLOG(INFO) << "Restarting data prefetching from start.";
+    cursor->SeekToFirst();
+  }
+}
+
+}  // namespace caffe
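
DataReader above gives each consumer a QueuePair: a `free_` queue of pre-allocated Datum buffers and a `full_` queue of filled ones, with one background thread per source reading the DB round-robin across solvers. Stripped of the Caffe types, the buffer-recycling pattern looks roughly like this Python sketch (all names and sizes invented):

    import queue
    import threading

    class QueuePair(object):
        def __init__(self, size):
            self.free = queue.Queue()
            self.full = queue.Queue()
            for _ in range(size):
                self.free.put(bytearray(16))      # pre-allocated "datum" buffers

    def reader(qp, records):
        # Producer: take an empty buffer, fill it, hand it to the consumer.
        for rec in records:
            buf = qp.free.get()
            buf[:len(rec)] = rec
            qp.full.put(buf)

    qp = QueuePair(size=4)
    t = threading.Thread(target=reader, args=(qp, [b'a', b'b', b'c']))
    t.start()
    for _ in range(3):
        buf = qp.full.get()    # consumer: block until a filled buffer is ready
        qp.free.put(buf)       # ...then return it to the free pool
    t.join()
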
diff --git a/src/caffe/data_transformer.cpp b/src/caffe/data_transformer.cpp
index 2263392..4666d9b 100644
--- a/src/caffe/data_transformer.cpp
+++ b/src/caffe/data_transformer.cpp
@@ -19,7 +19,9 @@ DataTransformer<Dtype>::DataTransformer(const TransformationParameter& param,
     CHECK_EQ(param_.mean_value_size(), 0) <<
       "Cannot specify mean_file and mean_value at the same time";
     const string& mean_file = param.mean_file();
-    LOG(INFO) << "Loading mean file from: " << mean_file;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Loading mean file from: " << mean_file;
+    }
     BlobProto blob_proto;
     ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
     data_mean_.FromProto(blob_proto);
diff --git a/src/caffe/internal_thread.cpp b/src/caffe/internal_thread.cpp
index c2d19d4..104884e 100644
--- a/src/caffe/internal_thread.cpp
+++ b/src/caffe/internal_thread.cpp
@@ -1,40 +1,66 @@
 #include <boost/thread.hpp>
+#include <exception>
+
 #include "caffe/internal_thread.hpp"
+#include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
 InternalThread::~InternalThread() {
-  WaitForInternalThreadToExit();
+  StopInternalThread();
 }
 
 bool InternalThread::is_started() const {
-  return thread_.get() != NULL && thread_->joinable();
+  return thread_ && thread_->joinable();
+}
+
+bool InternalThread::must_stop() {
+  return thread_ && thread_->interruption_requested();
 }
 
+void InternalThread::StartInternalThread() {
+  CHECK(!is_started()) << "Threads should persist and not be restarted.";
+
+  int device = 0;
+#ifndef CPU_ONLY
+  CUDA_CHECK(cudaGetDevice(&device));
+#endif
+  Caffe::Brew mode = Caffe::mode();
+  int rand_seed = caffe_rng_rand();
+  int solver_count = Caffe::solver_count();
+  bool root_solver = Caffe::root_solver();
 
-bool InternalThread::StartInternalThread() {
-  if (!WaitForInternalThreadToExit()) {
-    return false;
-  }
   try {
-    thread_.reset(
-        new boost::thread(&InternalThread::InternalThreadEntry, this));
-  } catch (...) {
-    return false;
+    thread_.reset(new boost::thread(&InternalThread::entry, this, device, mode,
+          rand_seed, solver_count, root_solver));
+  } catch (std::exception& e) {
+    LOG(FATAL) << "Thread exception: " << e.what();
   }
-  return true;
 }
 
-/** Will not return until the internal thread has exited. */
-bool InternalThread::WaitForInternalThreadToExit() {
+void InternalThread::entry(int device, Caffe::Brew mode, int rand_seed,
+    int solver_count, bool root_solver) {
+#ifndef CPU_ONLY
+  CUDA_CHECK(cudaSetDevice(device));
+#endif
+  Caffe::set_mode(mode);
+  Caffe::set_random_seed(rand_seed);
+  Caffe::set_solver_count(solver_count);
+  Caffe::set_root_solver(root_solver);
+
+  InternalThreadEntry();
+}
+
+void InternalThread::StopInternalThread() {
   if (is_started()) {
+    thread_->interrupt();
     try {
       thread_->join();
-    } catch (...) {
-      return false;
+    } catch (boost::thread_interrupted&) {
+    } catch (std::exception& e) {
+      LOG(FATAL) << "Thread exception: " << e.what();
     }
   }
-  return true;
 }
 
 }  // namespace caffe
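
The new StopInternalThread relies on boost thread interruption: `interrupt()` makes the next interruption point inside the worker throw `boost::thread_interrupted`, which the worker loops above catch and treat as a normal shutdown. Python threads cannot be interrupted, so a rough equivalent uses an explicit stop flag checked by `must_stop()` (sketch only):

    import threading
    import time

    class InternalThread(object):
        def __init__(self):
            self._stop = threading.Event()
            self._thread = None

        def start(self):
            self._thread = threading.Thread(target=self._entry)
            self._thread.start()

        def must_stop(self):
            return self._stop.is_set()

        def stop(self):
            if self._thread is not None:
                self._stop.set()       # like thread_->interrupt()
                self._thread.join()    # like thread_->join()

        def _entry(self):
            while not self.must_stop():
                time.sleep(0.01)       # one unit of work per iteration
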
diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp
new file mode 100644
index 0000000..3b91289
--- /dev/null
+++ b/src/caffe/layer.cpp
@@ -0,0 +1,27 @@
+#include <boost/thread.hpp>
+#include "caffe/layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void Layer<Dtype>::InitMutex() {
+  forward_mutex_.reset(new boost::mutex());
+}
+
+template <typename Dtype>
+void Layer<Dtype>::Lock() {
+  if (IsShared()) {
+    forward_mutex_->lock();
+  }
+}
+
+template <typename Dtype>
+void Layer<Dtype>::Unlock() {
+  if (IsShared()) {
+    forward_mutex_->unlock();
+  }
+}
+
+INSTANTIATE_CLASS(Layer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index 90aad67..e2d8d9f 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -38,6 +38,13 @@ void AccuracyLayer<Dtype>::Reshape(
       << "with integer values in {0, 1, ..., C-1}.";
   vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
   top[0]->Reshape(top_shape);
+  if (top.size() > 1) {
+    // Per-class accuracy is a vector; 1 axis.
+    vector<int> top_shape_per_class(1);
+    top_shape_per_class[0] = bottom[0]->shape(label_axis_);
+    top[1]->Reshape(top_shape_per_class);
+    nums_buffer_.Reshape(top_shape_per_class);
+  }
 }
 
 template <typename Dtype>
@@ -50,6 +57,10 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int num_labels = bottom[0]->shape(label_axis_);
   vector<Dtype> maxval(top_k_+1);
   vector<int> max_id(top_k_+1);
+  if (top.size() > 1) {
+    caffe_set(nums_buffer_.count(), Dtype(0), nums_buffer_.mutable_cpu_data());
+    caffe_set(top[1]->count(), Dtype(0), top[1]->mutable_cpu_data());
+  }
   int count = 0;
   for (int i = 0; i < outer_num_; ++i) {
     for (int j = 0; j < inner_num_; ++j) {
@@ -58,6 +69,7 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       if (has_ignore_label_ && label_value == ignore_label_) {
         continue;
       }
+      if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
       DCHECK_GE(label_value, 0);
       DCHECK_LT(label_value, num_labels);
       // Top-k accuracy
@@ -73,6 +85,7 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       for (int k = 0; k < top_k_; k++) {
         if (bottom_data_vector[k].second == label_value) {
           ++accuracy;
+          if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
           break;
         }
       }
@@ -82,6 +95,13 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   // LOG(INFO) << "Accuracy: " << accuracy;
   top[0]->mutable_cpu_data()[0] = accuracy / count;
+  if (top.size() > 1) {
+    for (int i = 0; i < top[1]->count(); ++i) {
+      top[1]->mutable_cpu_data()[i] =
+          nums_buffer_.cpu_data()[i] == 0 ? 0
+          : top[1]->cpu_data()[i] / nums_buffer_.cpu_data()[i];
+    }
+  }
   // Accuracy layer should not be used as a loss function.
 }
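
With the optional second top added above, `nums_buffer_` counts how many examples of each label were seen, top[1] counts how many of those landed in the top k, and the two are divided with a guard against empty classes. The same computation for top-1 in NumPy, for reference (the scores and labels are made up):

    import numpy as np

    scores = np.array([[0.6, 0.4], [0.2, 0.8], [0.3, 0.7]])   # 3 examples, 2 classes
    labels = np.array([0, 1, 0])
    pred = scores.argmax(axis=1)

    num_classes = scores.shape[1]
    seen = np.bincount(labels, minlength=num_classes).astype(float)
    hits = np.bincount(labels[pred == labels], minlength=num_classes).astype(float)

    overall = hits.sum() / seen.sum()
    per_class = np.where(seen > 0, hits / np.maximum(seen, 1), 0.0)
    print(overall, per_class)    # 0.666..., [0.5 1.0]
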
 
diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp
index 26a1118..b90bd4e 100644
--- a/src/caffe/layers/base_data_layer.cpp
+++ b/src/caffe/layers/base_data_layer.cpp
@@ -1,7 +1,9 @@
+#include <boost/thread.hpp>
 #include <string>
 #include <vector>
 
 #include "caffe/data_layers.hpp"
+#include "caffe/net.hpp"
 #include "caffe/util/io.hpp"
 
 namespace caffe {
@@ -28,55 +30,95 @@ void BaseDataLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
 }
 
 template <typename Dtype>
+BasePrefetchingDataLayer<Dtype>::BasePrefetchingDataLayer(
+    const LayerParameter& param)
+    : BaseDataLayer<Dtype>(param),
+      prefetch_free_(), prefetch_full_() {
+  for (int i = 0; i < PREFETCH_COUNT; ++i) {
+    prefetch_free_.push(&prefetch_[i]);
+  }
+}
+
+template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   BaseDataLayer<Dtype>::LayerSetUp(bottom, top);
-  // Now, start the prefetch thread. Before calling prefetch, we make two
-  // cpu_data calls so that the prefetch thread does not accidentally make
-  // simultaneous cudaMalloc calls when the main thread is running. In some
-  // GPUs this seems to cause failures if we do not so.
-  this->prefetch_data_.mutable_cpu_data();
-  if (this->output_labels_) {
-    this->prefetch_label_.mutable_cpu_data();
+  // Before starting the prefetch thread, we make cpu_data and gpu_data
+  // calls so that the prefetch thread does not accidentally make simultaneous
+  // cudaMalloc calls when the main thread is running. In some GPUs this
+  // seems to cause failures if we do not do so.
+  for (int i = 0; i < PREFETCH_COUNT; ++i) {
+    prefetch_[i].data_.mutable_cpu_data();
+    if (this->output_labels_) {
+      prefetch_[i].label_.mutable_cpu_data();
+    }
   }
+#ifndef CPU_ONLY
+  if (Caffe::mode() == Caffe::GPU) {
+    for (int i = 0; i < PREFETCH_COUNT; ++i) {
+      prefetch_[i].data_.mutable_gpu_data();
+      if (this->output_labels_) {
+        prefetch_[i].label_.mutable_gpu_data();
+      }
+    }
+  }
+#endif
   DLOG(INFO) << "Initializing prefetch";
-  this->CreatePrefetchThread();
+  this->data_transformer_->InitRand();
+  StartInternalThread();
   DLOG(INFO) << "Prefetch initialized.";
 }
 
 template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::CreatePrefetchThread() {
-  this->data_transformer_->InitRand();
-  CHECK(StartInternalThread()) << "Thread execution failed";
-}
+void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
+#ifndef CPU_ONLY
+  cudaStream_t stream;
+  if (Caffe::mode() == Caffe::GPU) {
+    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  }
+#endif
 
-template <typename Dtype>
-void BasePrefetchingDataLayer<Dtype>::JoinPrefetchThread() {
-  CHECK(WaitForInternalThreadToExit()) << "Thread joining failed";
+  try {
+    while (!must_stop()) {
+      Batch<Dtype>* batch = prefetch_free_.pop();
+      load_batch(batch);
+#ifndef CPU_ONLY
+      if (Caffe::mode() == Caffe::GPU) {
+        batch->data_.data().get()->async_gpu_push(stream);
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+      }
+#endif
+      prefetch_full_.push(batch);
+    }
+  } catch (boost::thread_interrupted&) {
+    // Interrupted exception is expected on shutdown
+  }
+#ifndef CPU_ONLY
+  if (Caffe::mode() == Caffe::GPU) {
+    CUDA_CHECK(cudaStreamDestroy(stream));
+  }
+#endif
 }
 
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_cpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, join the thread
-  JoinPrefetchThread();
-  DLOG(INFO) << "Thread joined";
+  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
   // Reshape to loaded data.
-  top[0]->ReshapeLike(prefetch_data_);
+  top[0]->ReshapeLike(batch->data_);
   // Copy the data
-  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
+  caffe_copy(batch->data_.count(), batch->data_.cpu_data(),
              top[0]->mutable_cpu_data());
   DLOG(INFO) << "Prefetch copied";
   if (this->output_labels_) {
     // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
+    top[1]->ReshapeLike(batch->label_);
     // Copy the labels.
-    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
-               top[1]->mutable_cpu_data());
+    caffe_copy(batch->label_.count(), batch->label_.cpu_data(),
+        top[1]->mutable_cpu_data());
   }
-  // Start a new prefetch thread
-  DLOG(INFO) << "CreatePrefetchThread";
-  CreatePrefetchThread();
+
+  prefetch_free_.push(batch);
 }
 
 #ifdef CPU_ONLY
diff --git a/src/caffe/layers/base_data_layer.cu b/src/caffe/layers/base_data_layer.cu
index 9335a5b..ff6e412 100644
--- a/src/caffe/layers/base_data_layer.cu
+++ b/src/caffe/layers/base_data_layer.cu
@@ -7,22 +7,23 @@ namespace caffe {
 template <typename Dtype>
 void BasePrefetchingDataLayer<Dtype>::Forward_gpu(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  // First, join the thread
-  JoinPrefetchThread();
+  Batch<Dtype>* batch = prefetch_full_.pop("Data layer prefetch queue empty");
   // Reshape to loaded data.
-  top[0]->ReshapeLike(this->prefetch_data_);
+  top[0]->ReshapeLike(batch->data_);
   // Copy the data
-  caffe_copy(prefetch_data_.count(), prefetch_data_.cpu_data(),
+  caffe_copy(batch->data_.count(), batch->data_.gpu_data(),
       top[0]->mutable_gpu_data());
   if (this->output_labels_) {
     // Reshape to loaded labels.
-    top[1]->ReshapeLike(prefetch_label_);
+    top[1]->ReshapeLike(batch->label_);
     // Copy the labels.
-    caffe_copy(prefetch_label_.count(), prefetch_label_.cpu_data(),
+    caffe_copy(batch->label_.count(), batch->label_.gpu_data(),
         top[1]->mutable_gpu_data());
   }
-  // Start a new prefetch thread
-  CreatePrefetchThread();
+  // Ensure the copy is synchronous wrt the host, so that the next batch isn't
+  // copied in the meantime.
+  CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
+  prefetch_free_.push(batch);
 }
 
 INSTANTIATE_LAYER_GPU_FORWARD(BasePrefetchingDataLayer);
diff --git a/src/caffe/layers/concat_layer.cpp b/src/caffe/layers/concat_layer.cpp
index 1cac8fc..95fba10 100644
--- a/src/caffe/layers/concat_layer.cpp
+++ b/src/caffe/layers/concat_layer.cpp
@@ -76,13 +76,14 @@ void ConcatLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   int offset_concat_axis = 0;
   const int top_concat_axis = top[0]->shape(concat_axis_);
   for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
     const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    for (int n = 0; n < num_concats_; ++n) {
-      caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
-          (n * top_concat_axis + offset_concat_axis) * concat_input_size_,
-          bottom_diff + n * bottom_concat_axis * concat_input_size_);
+    if (propagate_down[i]) {
+      Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
+      for (int n = 0; n < num_concats_; ++n) {
+        caffe_copy(bottom_concat_axis * concat_input_size_, top_diff +
+            (n * top_concat_axis + offset_concat_axis) * concat_input_size_,
+            bottom_diff + n * bottom_concat_axis * concat_input_size_);
+      }
     }
     offset_concat_axis += bottom_concat_axis;
   }
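
The restructuring above fixes a real bug: the old `continue` also skipped the trailing `offset_concat_axis += bottom_concat_axis;`, so once any bottom had `propagate_down` disabled, every later bottom read its gradient slice from the wrong offset. The offset bookkeeping has to advance unconditionally, as in this small NumPy sketch of the split (invented sizes):

    import numpy as np

    top_diff = np.arange(12.0).reshape(1, 12)    # gradient of the concatenated top
    sizes = [3, 4, 5]                            # per-bottom extents on the concat axis
    propagate_down = [True, False, True]

    offset = 0
    bottom_diffs = [None] * len(sizes)
    for i, size in enumerate(sizes):
        if propagate_down[i]:
            bottom_diffs[i] = top_diff[:, offset:offset + size]
        offset += size    # must advance even when not propagating
    print(bottom_diffs[2])    # [[ 7.  8.  9. 10. 11.]]
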
diff --git a/src/caffe/layers/concat_layer.cu b/src/caffe/layers/concat_layer.cu
index 8f2e85d..3c64c7e 100644
--- a/src/caffe/layers/concat_layer.cu
+++ b/src/caffe/layers/concat_layer.cu
@@ -53,15 +53,16 @@ void ConcatLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   const int top_concat_axis = top[0]->shape(concat_axis_);
   const bool kForward = false;
   for (int i = 0; i < bottom.size(); ++i) {
-    if (!propagate_down[i]) { continue; }
-    Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
     const int bottom_concat_axis = bottom[i]->shape(concat_axis_);
-    const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
-    const int nthreads = bottom_concat_size * num_concats_;
-    Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
-        nthreads, top_diff, kForward, num_concats_, concat_input_size_,
-        top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    if (propagate_down[i]) {
+      Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+      const int bottom_concat_size = bottom_concat_axis * concat_input_size_;
+      const int nthreads = bottom_concat_size * num_concats_;
+      Concat<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+          nthreads, top_diff, kForward, num_concats_, concat_input_size_,
+          top_concat_axis, bottom_concat_axis, offset_concat_axis, bottom_diff);
+    }
     offset_concat_axis += bottom_concat_axis;
   }
 }
diff --git a/src/caffe/layers/data_layer.cpp b/src/caffe/layers/data_layer.cpp
index 161a75e..0932d9f 100644
--- a/src/caffe/layers/data_layer.cpp
+++ b/src/caffe/layers/data_layer.cpp
@@ -11,93 +11,85 @@
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/util/benchmark.hpp"
 #include "caffe/util/io.hpp"
-#include "caffe/util/math_functions.hpp"
-#include "caffe/util/rng.hpp"
 
 namespace caffe {
 
 template <typename Dtype>
-DataLayer<Dtype>::~DataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+DataLayer<Dtype>::DataLayer(const LayerParameter& param)
+  : BasePrefetchingDataLayer<Dtype>(param),
+    reader_(param) {
+}
+
+template <typename Dtype>
+DataLayer<Dtype>::~DataLayer() {
+  this->StopInternalThread();
 }
 
 template <typename Dtype>
 void DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  // Initialize DB
-  db_.reset(db::GetDB(this->layer_param_.data_param().backend()));
-  db_->Open(this->layer_param_.data_param().source(), db::READ);
-  cursor_.reset(db_->NewCursor());
+  const int batch_size = this->layer_param_.data_param().batch_size();
+  // Read a data point, and use it to initialize the top blob.
+  Datum& datum = *(reader_.full().peek());
 
-  // Check if we should randomly skip a few data points
-  if (this->layer_param_.data_param().rand_skip()) {
-    unsigned int skip = caffe_rng_rand() %
-                        this->layer_param_.data_param().rand_skip();
-    LOG(INFO) << "Skipping first " << skip << " data points.";
-    while (skip-- > 0) {
-      cursor_->Next();
-    }
-  }
-  // Read a data point, to initialize the prefetch and top blobs.
-  Datum datum;
-  datum.ParseFromString(cursor_->value());
   // Use data_transformer to infer the expected blob shape from datum.
   vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
   this->transformed_data_.Reshape(top_shape);
   // Reshape top[0] and prefetch_data according to the batch_size.
-  top_shape[0] = this->layer_param_.data_param().batch_size();
-  this->prefetch_data_.Reshape(top_shape);
-  top[0]->ReshapeLike(this->prefetch_data_);
-
+  top_shape[0] = batch_size;
+  top[0]->Reshape(top_shape);
+  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
+    this->prefetch_[i].data_.Reshape(top_shape);
+  }
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
       << top[0]->width();
   // label
   if (this->output_labels_) {
-    vector<int> label_shape(1, this->layer_param_.data_param().batch_size());
+    vector<int> label_shape(1, batch_size);
     top[1]->Reshape(label_shape);
-    this->prefetch_label_.Reshape(label_shape);
+    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
+      this->prefetch_[i].label_.Reshape(label_shape);
+    }
   }
 }
 
-// This function is used to create a thread that prefetches the data.
-template <typename Dtype>
-void DataLayer<Dtype>::InternalThreadEntry() {
+// This function is called on the prefetch thread
+template<typename Dtype>
+void DataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   CPUTimer batch_timer;
   batch_timer.Start();
   double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  CHECK(this->prefetch_data_.count());
+  CHECK(batch->data_.count());
   CHECK(this->transformed_data_.count());
 
   // Reshape according to the first datum of each batch
   // on single input batches allows for inputs of varying dimension.
   const int batch_size = this->layer_param_.data_param().batch_size();
-  Datum datum;
-  datum.ParseFromString(cursor_->value());
+  Datum& datum = *(reader_.full().peek());
   // Use data_transformer to infer the expected blob shape from datum.
   vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
   this->transformed_data_.Reshape(top_shape);
-  // Reshape prefetch_data according to the batch_size.
+  // Reshape batch according to the batch_size.
   top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
+  batch->data_.Reshape(top_shape);
 
-  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
+  Dtype* top_data = batch->data_.mutable_cpu_data();
   Dtype* top_label = NULL;  // suppress warnings about uninitialized variables
 
   if (this->output_labels_) {
-    top_label = this->prefetch_label_.mutable_cpu_data();
+    top_label = batch->label_.mutable_cpu_data();
   }
-  timer.Start();
   for (int item_id = 0; item_id < batch_size; ++item_id) {
+    timer.Start();
     // get a datum
-    Datum datum;
-    datum.ParseFromString(cursor_->value());
+    Datum& datum = *(reader_.full().pop("Waiting for data"));
     read_time += timer.MicroSeconds();
     timer.Start();
     // Apply data transformations (mirror, scale, crop...)
-    int offset = this->prefetch_data_.offset(item_id);
+    int offset = batch->data_.offset(item_id);
     this->transformed_data_.set_cpu_data(top_data + offset);
     this->data_transformer_->Transform(datum, &(this->transformed_data_));
     // Copy label.
@@ -105,13 +97,8 @@ void DataLayer<Dtype>::InternalThreadEntry() {
       top_label[item_id] = datum.label();
     }
     trans_time += timer.MicroSeconds();
-    timer.Start();
-    // go to the next item.
-    cursor_->Next();
-    if (!cursor_->valid()) {
-      DLOG(INFO) << "Restarting data prefetching from start.";
-      cursor_->SeekToFirst();
-    }
+
+    reader_.free().push(const_cast<Datum*>(&datum));
   }
   timer.Stop();
   batch_timer.Stop();
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu
index 39bc4de..8a1eed8 100644
--- a/src/caffe/layers/deconv_layer.cu
+++ b/src/caffe/layers/deconv_layer.cu
@@ -52,7 +52,8 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         // gradient w.r.t. bottom data, if necessary.
         if (propagate_down[i]) {
           this->forward_gpu_gemm(top_diff + top[i]->offset(n), weight,
-              bottom_diff + bottom[i]->offset(n));
+              bottom_diff + bottom[i]->offset(n),
+              this->param_propagate_down_[0]);
         }
       }
     }
diff --git a/src/caffe/layers/embed_layer.cpp b/src/caffe/layers/embed_layer.cpp
new file mode 100644
index 0000000..be6b2cd
--- /dev/null
+++ b/src/caffe/layers/embed_layer.cpp
@@ -0,0 +1,122 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/common_layers.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  N_ = this->layer_param_.embed_param().num_output();
+  CHECK_GT(N_, 0) << "EmbedLayer num_output must be positive.";
+  K_ = this->layer_param_.embed_param().input_dim();
+  CHECK_GT(K_, 0) << "EmbedLayer input_dim must be positive.";
+  bias_term_ = this->layer_param_.embed_param().bias_term();
+  // Check if we need to set up the weights
+  if (this->blobs_.size() > 0) {
+    LOG(INFO) << "Skipping parameter initialization";
+  } else {
+    if (bias_term_) {
+      this->blobs_.resize(2);
+    } else {
+      this->blobs_.resize(1);
+    }
+    // Initialize the weights --
+    // transposed from InnerProductLayer for spatial locality.
+    vector<int> weight_shape(2);
+    weight_shape[0] = K_;
+    weight_shape[1] = N_;
+    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
+    // fill the weights
+    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
+        this->layer_param_.embed_param().weight_filler()));
+    weight_filler->Fill(this->blobs_[0].get());
+    // If necessary, initialize and fill the bias term
+    if (bias_term_) {
+      vector<int> bias_shape(1, N_);
+      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
+      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
+          this->layer_param_.embed_param().bias_filler()));
+      bias_filler->Fill(this->blobs_[1].get());
+    }
+  }  // parameter initialization
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  // Figure out the dimensions
+  M_ = bottom[0]->count();
+  vector<int> top_shape = bottom[0]->shape();
+  top_shape.push_back(N_);
+  top[0]->Reshape(top_shape);
+  // Set up the bias multiplier
+  if (bias_term_) {
+    vector<int> bias_shape(1, M_);
+    bias_multiplier_.Reshape(bias_shape);
+    caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* weight = this->blobs_[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  int index;
+  for (int n = 0; n < M_; ++n) {
+    index = static_cast<int>(bottom_data[n]);
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, K_);
+    DCHECK_EQ(static_cast<Dtype>(index), bottom_data[n]) << "non-integer input";
+    caffe_copy(N_, weight + index * N_, top_data + n * N_);
+  }
+  if (bias_term_) {
+    const Dtype* bias = this->blobs_[1]->cpu_data();
+    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1),
+        bias_multiplier_.cpu_data(), bias, Dtype(1), top_data);
+  }
+}
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input.";
+  if (this->param_propagate_down_[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    // Gradient with respect to weight
+    Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
+    int index;
+    for (int n = 0; n < M_; ++n) {
+      index = static_cast<int>(bottom_data[n]);
+      DCHECK_GE(index, 0);
+      DCHECK_LT(index, K_);
+      DCHECK_EQ(static_cast<Dtype>(index), bottom_data[n])
+          << "non-integer input";
+      caffe_axpy(N_, Dtype(1), top_diff + n * N_, weight_diff + index * N_);
+    }
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
+    caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, Dtype(1), top_diff,
+        bias_multiplier_.cpu_data(), Dtype(1), bias_diff);
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(EmbedLayer);
+#endif
+
+INSTANTIATE_CLASS(EmbedLayer);
+REGISTER_LAYER_CLASS(Embed);
+
+}  // namespace caffe
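
EmbedLayer above is a lookup table: the forward pass copies row `index` of the K-by-N weight matrix for each integer input and optionally adds a bias, and the backward pass accumulates the corresponding rows of `top_diff` into the weight gradient (gradients w.r.t. the input are explicitly unsupported). The same computation in NumPy, with invented sizes:

    import numpy as np

    K, N = 5, 3                        # input_dim, num_output
    W = np.random.randn(K, N)
    b = np.random.randn(N)
    idx = np.array([4, 0, 4])          # integer inputs, shape (M,)

    top = W[idx] + b                   # forward: row lookup plus bias

    top_diff = np.ones((idx.size, N))
    W_diff = np.zeros_like(W)
    np.add.at(W_diff, idx, top_diff)   # backward: accumulate rows, handling repeats
    b_diff = top_diff.sum(axis=0)
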
diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu
new file mode 100644
index 0000000..62a4db8
--- /dev/null
+++ b/src/caffe/layers/embed_layer.cu
@@ -0,0 +1,84 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/common_layers.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/gpu_util.cuh"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void EmbedForward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* weight, const int M, const int N, const int K,
+    Dtype* top_data) {
+  CUDA_KERNEL_LOOP(top_index, nthreads) {
+    const int n = top_index / N;
+    const int d = top_index % N;
+    const int index = static_cast<int>(bottom_data[n]);
+    const int weight_index = index * N + d;
+    top_data[top_index] = weight[weight_index];
+  }
+}
+
+template <typename Dtype>
+__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_diff, const int M, const int N, const int K,
+    Dtype* weight_diff);
+
+template <typename Dtype>
+__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_diff, const int M, const int N, const int K,
+    Dtype* weight_diff) {
+  CUDA_KERNEL_LOOP(top_index, nthreads) {
+    const int n = top_index / N;
+    const int d = top_index % N;
+    const int index = static_cast<int>(bottom_data[n]);
+    const int weight_index = index * N + d;
+    caffe_gpu_atomic_add(top_diff[top_index], weight_diff + weight_index);
+  }
+}
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  const int count = top[0]->count();
+  EmbedForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, weight, M_, N_, K_, top_data);
+  if (bias_term_) {
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1),
+        bias_multiplier_.gpu_data(),
+        this->blobs_[1]->gpu_data(), Dtype(1), top_data);
+  }
+}
+
+template <typename Dtype>
+void EmbedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input.";
+  if (this->param_propagate_down_[0]) {
+    const int top_count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+    EmbedBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(top_count), CAFFE_CUDA_NUM_THREADS>>>(
+        top_count, bottom_data, top_diff, M_, N_, K_, weight_diff);
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, Dtype(1), top_diff,
+        bias_multiplier_.gpu_data(), Dtype(1), bias_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(EmbedLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 8a782f7..8ced510 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -16,7 +16,7 @@ TODO:
 
 #include "caffe/data_layers.hpp"
 #include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
+#include "caffe/util/hdf5.hpp"
 
 namespace caffe {
 
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index f63375c..56788c2 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -6,7 +6,7 @@
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
+#include "caffe/util/hdf5.hpp"
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu
index ae497c3..eb6d0e4 100644
--- a/src/caffe/layers/hdf5_output_layer.cu
+++ b/src/caffe/layers/hdf5_output_layer.cu
@@ -6,7 +6,6 @@
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
-#include "caffe/util/io.hpp"
 #include "caffe/vision_layers.hpp"
 
 namespace caffe {
diff --git a/src/caffe/layers/image_data_layer.cpp b/src/caffe/layers/image_data_layer.cpp
index 18c035c..223ba3a 100644
--- a/src/caffe/layers/image_data_layer.cpp
+++ b/src/caffe/layers/image_data_layer.cpp
@@ -17,7 +17,7 @@ namespace caffe {
 
 template <typename Dtype>
 ImageDataLayer<Dtype>::~ImageDataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+  this->StopInternalThread();
 }
 
 template <typename Dtype>
@@ -62,14 +62,18 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // Read an image, and use it to initialize the top blob.
   cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
                                     new_height, new_width, is_color);
+  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
   // Use data_transformer to infer the expected blob shape from a cv_image.
   vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
   this->transformed_data_.Reshape(top_shape);
   // Reshape prefetch_data and top[0] according to the batch_size.
   const int batch_size = this->layer_param_.image_data_param().batch_size();
+  CHECK_GT(batch_size, 0) << "Positive batch size required";
   top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
-  top[0]->ReshapeLike(this->prefetch_data_);
+  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
+    this->prefetch_[i].data_.Reshape(top_shape);
+  }
+  top[0]->Reshape(top_shape);
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
@@ -77,7 +81,9 @@ void ImageDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  this->prefetch_label_.Reshape(label_shape);
+  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
+    this->prefetch_[i].label_.Reshape(label_shape);
+  }
 }
 
 template <typename Dtype>
@@ -87,15 +93,15 @@ void ImageDataLayer<Dtype>::ShuffleImages() {
   shuffle(lines_.begin(), lines_.end(), prefetch_rng);
 }
 
-// This function is used to create a thread that prefetches the data.
+// This function is called on the prefetch thread
 template <typename Dtype>
-void ImageDataLayer<Dtype>::InternalThreadEntry() {
+void ImageDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   CPUTimer batch_timer;
   batch_timer.Start();
   double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  CHECK(this->prefetch_data_.count());
+  CHECK(batch->data_.count());
   CHECK(this->transformed_data_.count());
   ImageDataParameter image_data_param = this->layer_param_.image_data_param();
   const int batch_size = image_data_param.batch_size();
@@ -108,15 +114,16 @@ void ImageDataLayer<Dtype>::InternalThreadEntry() {
   // on single input batches allows for inputs of varying dimension.
   cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
       new_height, new_width, is_color);
+  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
   // Use data_transformer to infer the expected blob shape from a cv_img.
   vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
   this->transformed_data_.Reshape(top_shape);
-  // Reshape prefetch_data according to the batch_size.
+  // Reshape batch according to the batch_size.
   top_shape[0] = batch_size;
-  this->prefetch_data_.Reshape(top_shape);
+  batch->data_.Reshape(top_shape);
 
-  Dtype* prefetch_data = this->prefetch_data_.mutable_cpu_data();
-  Dtype* prefetch_label = this->prefetch_label_.mutable_cpu_data();
+  Dtype* prefetch_data = batch->data_.mutable_cpu_data();
+  Dtype* prefetch_label = batch->label_.mutable_cpu_data();
 
   // datum scales
   const int lines_size = lines_.size();
@@ -130,7 +137,7 @@ void ImageDataLayer<Dtype>::InternalThreadEntry() {
     read_time += timer.MicroSeconds();
     timer.Start();
     // Apply transformations (mirror, crop...) to the image
-    int offset = this->prefetch_data_.offset(item_id);
+    int offset = batch->data_.offset(item_id);
     this->transformed_data_.set_cpu_data(prefetch_data + offset);
     this->data_transformer_->Transform(cv_img, &(this->transformed_data_));
     trans_time += timer.MicroSeconds();
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index dd90cac..c0ebd2c 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -15,12 +15,19 @@ void InnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
   const Dtype* weight = this->blobs_[0]->gpu_data();
-  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
-      bottom_data, weight, (Dtype)0., top_data);
-  if (bias_term_) {
-    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
-        bias_multiplier_.gpu_data(),
-        this->blobs_[1]->gpu_data(), (Dtype)1., top_data);
+  if (M_ == 1) {
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, N_, K_, (Dtype)1.,
+                         weight, bottom_data, (Dtype)0., top_data);
+    if (bias_term_)
+      caffe_gpu_axpy<Dtype>(N_, bias_multiplier_.cpu_data()[0],
+                            this->blobs_[1]->gpu_data(), top_data);
+  } else {
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
+                          bottom_data, weight, (Dtype)0., top_data);
+    if (bias_term_)
+      caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
+                            bias_multiplier_.gpu_data(),
+                            this->blobs_[1]->gpu_data(), (Dtype)1., top_data);
   }
 }
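
The new branch above only routes the single-example case through a matrix-vector product instead of a degenerate matrix-matrix product; the result is the same, as a quick NumPy check shows (shapes invented, weights stored num_output-by-input as in InnerProductLayer):

    import numpy as np

    K, N = 4, 3
    W = np.random.randn(N, K)          # weights, num_output x input dim
    x = np.random.randn(1, K)          # a single input row (M == 1)

    y_gemm = x.dot(W.T)                # general path: (M x K) * (K x N)
    y_gemv = W.dot(x.ravel())          # M == 1 path: (N x K) * (K,)
    assert np.allclose(y_gemm.ravel(), y_gemv)
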
 
diff --git a/src/caffe/layers/mvn_layer.cpp b/src/caffe/layers/mvn_layer.cpp
index 3e79bdd..325691b 100644
--- a/src/caffe/layers/mvn_layer.cpp
+++ b/src/caffe/layers/mvn_layer.cpp
@@ -18,8 +18,12 @@ void MVNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       1, 1);
   temp_.Reshape(bottom[0]->num(), bottom[0]->channels(),
       bottom[0]->height(), bottom[0]->width());
-  sum_multiplier_.Reshape(1, 1,
-      bottom[0]->height(), bottom[0]->width());
+  if (this->layer_param_.mvn_param().across_channels()) {
+    sum_multiplier_.Reshape(1, bottom[0]->channels(), bottom[0]->height(),
+                            bottom[0]->width());
+  } else {
+    sum_multiplier_.Reshape(1, 1, bottom[0]->height(), bottom[0]->width());
+  }
   Dtype* multiplier_data = sum_multiplier_.mutable_cpu_data();
   caffe_set(sum_multiplier_.count(), Dtype(1), multiplier_data);
   eps_ = this->layer_param_.mvn_param().eps();
@@ -130,7 +134,12 @@ void MVNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 
     caffe_div(temp_.count(), bottom_diff, temp_.cpu_data(), bottom_diff);
   } else {
-    caffe_copy(temp_.count(), top_diff, bottom_diff);
+    caffe_cpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, top_diff,
+      sum_multiplier_.cpu_data(), 0., mean_.mutable_cpu_data());
+    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+      mean_.cpu_data(), sum_multiplier_.cpu_data(), 0.,
+      temp_.mutable_cpu_data());
+    caffe_add(temp_.count(), top_diff, temp_.cpu_data(), bottom_diff);
   }
 }
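
The replaced `caffe_copy` was only correct if the layer did nothing in the forward pass; with mean subtraction `y = x - mean(x)` per row, the chain rule gives `dL/dx = dL/dy - mean(dL/dy)`, which is exactly what the gemv/gemm/add sequence above now computes. A quick NumPy finite-difference check of that identity (random data):

    import numpy as np

    x = np.random.randn(2, 6)
    w = np.random.randn(2, 6)                    # stands in for top_diff (dL/dy)
    analytic = w - w.mean(axis=1, keepdims=True)

    # Numerically differentiate L = (w * y).sum(), y = x - x.mean(axis=1, keepdims=True)
    eps, numeric = 1e-6, np.zeros_like(x)
    base = (w * (x - x.mean(axis=1, keepdims=True))).sum()
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            xp = x.copy()
            xp[i, j] += eps
            yp = xp - xp.mean(axis=1, keepdims=True)
            numeric[i, j] = ((w * yp).sum() - base) / eps
    assert np.allclose(numeric, analytic, atol=1e-4)
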
 
diff --git a/src/caffe/layers/mvn_layer.cu b/src/caffe/layers/mvn_layer.cu
index 3888a0c..d86a2e7 100644
--- a/src/caffe/layers/mvn_layer.cu
+++ b/src/caffe/layers/mvn_layer.cu
@@ -113,7 +113,12 @@ void MVNLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 
     caffe_gpu_div(temp_.count(), bottom_diff, temp_.gpu_data(), bottom_diff);
   } else {
-    caffe_copy(temp_.count(), top_diff, bottom_diff);
+    caffe_gpu_gemv<Dtype>(CblasNoTrans, num, dim, 1. / dim, top_diff,
+            sum_multiplier_.gpu_data(), 0., mean_.mutable_gpu_data());
+    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, dim, 1, -1.,
+            mean_.gpu_data(), sum_multiplier_.gpu_data(), 0.,
+            temp_.mutable_gpu_data());
+    caffe_gpu_add(temp_.count(), top_diff, temp_.gpu_data(), bottom_diff);
   }
 }
 
diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp
index 795dd71..d762291 100644
--- a/src/caffe/layers/spp_layer.cpp
+++ b/src/caffe/layers/spp_layer.cpp
@@ -66,8 +66,11 @@ void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   SPPParameter spp_param = this->layer_param_.spp_param();
 
+  num_ = bottom[0]->num();
+  channels_ = bottom[0]->channels();
   bottom_h_ = bottom[0]->height();
   bottom_w_ = bottom[0]->width();
+  reshaped_first_time_ = false;
   CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
   CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";
 
@@ -82,6 +85,15 @@ void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   flatten_outputs_.clear();
   concat_bottom_vec_.clear();
 
+  if (pyramid_height_ == 1) {
+    // pooling layer setup
+    LayerParameter pooling_param = GetPoolingParam(0, bottom_h_, bottom_w_,
+        spp_param);
+    pooling_layers_.push_back(shared_ptr<PoolingLayer<Dtype> > (
+        new PoolingLayer<Dtype>(pooling_param)));
+    pooling_layers_[0]->SetUp(bottom, top);
+    return;
+  }
   // split layer output holders setup
   for (int i = 0; i < pyramid_height_; i++) {
     split_top_vec_.push_back(new Blob<Dtype>());
@@ -135,10 +147,26 @@ void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
       << "corresponding to (num, channels, height, width)";
+  // Do nothing if bottom shape is unchanged since last Reshape
+  if (num_ == bottom[0]->num() && channels_ == bottom[0]->channels() &&
+      bottom_h_ == bottom[0]->height() && bottom_w_ == bottom[0]->width() &&
+      reshaped_first_time_) {
+    return;
+  }
+  num_ = bottom[0]->num();
   channels_ = bottom[0]->channels();
   bottom_h_ = bottom[0]->height();
   bottom_w_ = bottom[0]->width();
+  reshaped_first_time_ = true;
   SPPParameter spp_param = this->layer_param_.spp_param();
+  if (pyramid_height_ == 1) {
+    LayerParameter pooling_param = GetPoolingParam(0, bottom_h_, bottom_w_,
+        spp_param);
+    pooling_layers_[0].reset(new PoolingLayer<Dtype>(pooling_param));
+    pooling_layers_[0]->SetUp(bottom, top);
+    pooling_layers_[0]->Reshape(bottom, top);
+    return;
+  }
   split_layer_->Reshape(bottom, split_top_vec_);
   for (int i = 0; i < pyramid_height_; i++) {
     LayerParameter pooling_param = GetPoolingParam(
@@ -159,6 +187,10 @@ void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
 template <typename Dtype>
 void SPPLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
+  if (pyramid_height_ == 1) {
+    pooling_layers_[0]->Forward(bottom, top);
+    return;
+  }
   split_layer_->Forward(bottom, split_top_vec_);
   for (int i = 0; i < pyramid_height_; i++) {
     pooling_layers_[i]->Forward(
@@ -175,6 +207,10 @@ void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   if (!propagate_down[0]) {
     return;
   }
+  if (pyramid_height_ == 1) {
+    pooling_layers_[0]->Backward(top, propagate_down, bottom);
+    return;
+  }
   vector<bool> concat_propagate_down(pyramid_height_, true);
   concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
   for (int i = 0; i < pyramid_height_; i++) {
diff --git a/src/caffe/layers/tile_layer.cpp b/src/caffe/layers/tile_layer.cpp
new file mode 100644
index 0000000..f55008c
--- /dev/null
+++ b/src/caffe/layers/tile_layer.cpp
@@ -0,0 +1,62 @@
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void TileLayer<Dtype>::Reshape(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const TileParameter& tile_param = this->layer_param_.tile_param();
+  axis_ = bottom[0]->CanonicalAxisIndex(tile_param.axis());
+  CHECK(tile_param.has_tiles()) << "Number of tiles must be specified";
+  tiles_ = tile_param.tiles();
+  CHECK_GT(tiles_, 0) << "Number of tiles must be positive.";
+  vector<int> top_shape = bottom[0]->shape();
+  top_shape[axis_] = bottom[0]->shape(axis_) * tiles_;
+  top[0]->Reshape(top_shape);
+  outer_dim_ = bottom[0]->count(0, axis_);
+  inner_dim_ = bottom[0]->count(axis_);
+}
+
+template <typename Dtype>
+void TileLayer<Dtype>::Forward_cpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int i = 0; i < outer_dim_; ++i) {
+    for (int t = 0; t < tiles_; ++t) {
+      caffe_copy(inner_dim_, bottom_data, top_data);
+      top_data += inner_dim_;
+    }
+    bottom_data += inner_dim_;
+  }
+}
+
+template <typename Dtype>
+void TileLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) { return; }
+  const Dtype* top_diff = top[0]->cpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+  for (int i = 0; i < outer_dim_; ++i) {
+    caffe_copy(inner_dim_, top_diff, bottom_diff);
+    top_diff += inner_dim_;
+    for (int t = 1; t < tiles_; ++t) {
+      caffe_axpy(inner_dim_, Dtype(1), top_diff, bottom_diff);
+      top_diff += inner_dim_;
+    }
+    bottom_diff += inner_dim_;
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(TileLayer);
+#endif
+
+INSTANTIATE_CLASS(TileLayer);
+REGISTER_LAYER_CLASS(Tile);
+
+}  // namespace caffe
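
TileLayer's backward above is the natural counterpart of its forward: every input element is copied `tiles_` times, so its gradient is the sum over those copies. Both directions in NumPy (sketch, shapes invented):

    import numpy as np

    x = np.arange(6.0).reshape(2, 3)
    axis, tiles = 1, 2

    top = np.concatenate([x] * tiles, axis=axis)             # forward: repeat along axis
    top_diff = np.ones_like(top)
    bottom_diff = sum(np.split(top_diff, tiles, axis=axis))  # backward: sum over copies
    assert bottom_diff.shape == x.shape and (bottom_diff == tiles).all()
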
diff --git a/src/caffe/layers/tile_layer.cu b/src/caffe/layers/tile_layer.cu
new file mode 100644
index 0000000..7fd3bc4
--- /dev/null
+++ b/src/caffe/layers/tile_layer.cu
@@ -0,0 +1,67 @@
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void Tile(const int nthreads, const Dtype* bottom_data,
+    const int tile_size, const int num_tiles, const int bottom_tile_axis,
+    Dtype* top_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int d = index % tile_size;
+    const int b = (index / tile_size / num_tiles) % bottom_tile_axis;
+    const int n = index / tile_size / num_tiles / bottom_tile_axis;
+    const int bottom_index = (n * bottom_tile_axis + b) * tile_size + d;
+    top_data[index] = bottom_data[bottom_index];
+  }
+}
+
+template <typename Dtype>
+void TileLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int bottom_tile_axis = bottom[0]->shape(axis_);
+  const int nthreads = top[0]->count();
+  Tile<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+      nthreads, bottom_data, inner_dim_, tiles_, bottom_tile_axis, top_data);
+}
+
+template <typename Dtype>
+__global__ void TileBackward(const int nthreads, const Dtype* top_diff,
+    const int tile_size, const int num_tiles, const int bottom_tile_axis,
+    Dtype* bottom_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int d = index % tile_size;
+    const int b = (index / tile_size) % bottom_tile_axis;
+    const int n = index / tile_size / bottom_tile_axis;
+    bottom_diff[index] = 0;
+    int top_index = (n * num_tiles * bottom_tile_axis + b) * tile_size + d;
+    for (int t = 0; t < num_tiles; ++t) {
+      bottom_diff[index] += top_diff[top_index];
+      top_index += bottom_tile_axis * tile_size;
+    }
+  }
+}
+
+template <typename Dtype>
+void TileLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (!propagate_down[0]) { return; }
+  const Dtype* top_diff = top[0]->gpu_diff();
+  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  const int bottom_tile_axis = bottom[0]->shape(axis_);
+  const int tile_size = inner_dim_ / bottom_tile_axis;
+  const int nthreads = bottom[0]->count();
+  TileBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
+      nthreads, top_diff, tile_size, tiles_, bottom_tile_axis, bottom_diff);
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(TileLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/window_data_layer.cpp b/src/caffe/layers/window_data_layer.cpp
index c127d56..f637f2e 100644
--- a/src/caffe/layers/window_data_layer.cpp
+++ b/src/caffe/layers/window_data_layer.cpp
@@ -27,7 +27,7 @@ namespace caffe {
 
 template <typename Dtype>
 WindowDataLayer<Dtype>::~WindowDataLayer<Dtype>() {
-  this->JoinPrefetchThread();
+  this->StopInternalThread();
 }
 
 template <typename Dtype>
@@ -171,7 +171,9 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   CHECK_GT(crop_size, 0);
   const int batch_size = this->layer_param_.window_data_param().batch_size();
   top[0]->Reshape(batch_size, channels, crop_size, crop_size);
-  this->prefetch_data_.Reshape(batch_size, channels, crop_size, crop_size);
+  for (int i = 0; i < this->PREFETCH_COUNT; ++i)
+    this->prefetch_[i].data_.Reshape(
+        batch_size, channels, crop_size, crop_size);
 
   LOG(INFO) << "output data size: " << top[0]->num() << ","
       << top[0]->channels() << "," << top[0]->height() << ","
@@ -179,7 +181,9 @@ void WindowDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
   // label
   vector<int> label_shape(1, batch_size);
   top[1]->Reshape(label_shape);
-  this->prefetch_label_.Reshape(label_shape);
+  for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
+    this->prefetch_[i].label_.Reshape(label_shape);
+  }
 
   // data mean
   has_mean_file_ = this->transform_param_.has_mean_file();
@@ -217,9 +221,9 @@ unsigned int WindowDataLayer<Dtype>::PrefetchRand() {
   return (*prefetch_rng)();
 }
 
-// Thread fetching the data
+// This function is called on the prefetch thread
 template <typename Dtype>
-void WindowDataLayer<Dtype>::InternalThreadEntry() {
+void WindowDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
   // At each iteration, sample N windows where N*p are foreground (object)
   // windows and N*(1-p) are background (non-object) windows
   CPUTimer batch_timer;
@@ -227,8 +231,8 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
   double read_time = 0;
   double trans_time = 0;
   CPUTimer timer;
-  Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
-  Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
+  Dtype* top_data = batch->data_.mutable_cpu_data();
+  Dtype* top_label = batch->label_.mutable_cpu_data();
   const Dtype scale = this->layer_param_.window_data_param().scale();
   const int batch_size = this->layer_param_.window_data_param().batch_size();
   const int context_pad = this->layer_param_.window_data_param().context_pad();
@@ -252,7 +256,7 @@ void WindowDataLayer<Dtype>::InternalThreadEntry() {
   bool use_square = (crop_mode == "square") ? true : false;
 
   // zero out batch
-  caffe_set(this->prefetch_data_.count(), Dtype(0), top_data);
+  caffe_set(batch->data_.count(), Dtype(0), top_data);
 
   const int num_fg = static_cast<int>(static_cast<float>(batch_size)
       * fg_fraction);
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index a18ee63..89d1401 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -5,12 +5,15 @@
 #include <utility>
 #include <vector>
 
+#include "hdf5.h"
+
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/net.hpp"
+#include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
+#include "caffe/util/hdf5.hpp"
 #include "caffe/util/insert_splits.hpp"
-#include "caffe/util/io.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
@@ -19,12 +22,14 @@
 namespace caffe {
 
 template <typename Dtype>
-Net<Dtype>::Net(const NetParameter& param) {
+Net<Dtype>::Net(const NetParameter& param, const Net* root_net)
+    : root_net_(root_net) {
   Init(param);
 }
 
 template <typename Dtype>
-Net<Dtype>::Net(const string& param_file, Phase phase) {
+Net<Dtype>::Net(const string& param_file, Phase phase, const Net* root_net)
+    : root_net_(root_net) {
   NetParameter param;
   ReadNetParamsFromTextFileOrDie(param_file, &param);
   param.mutable_state()->set_phase(phase);
@@ -33,14 +38,18 @@ Net<Dtype>::Net(const string& param_file, Phase phase) {
 
 template <typename Dtype>
 void Net<Dtype>::Init(const NetParameter& in_param) {
+  CHECK(Caffe::root_solver() || root_net_)
+      << "root_net_ needs to be set for all non-root solvers";
   // Set phase from the state.
   phase_ = in_param.state().phase();
   // Filter layers based on their include/exclude rules and
   // the current NetState.
   NetParameter filtered_param;
   FilterNet(in_param, &filtered_param);
-  LOG(INFO) << "Initializing net from parameters: " << std::endl
-            << filtered_param.DebugString();
+  if (Caffe::root_solver()) {
+    LOG(INFO) << "Initializing net from parameters: " << std::endl
+              << filtered_param.DebugString();
+  }
   // Create a copy of filtered_param with splits added where necessary.
   NetParameter param;
   InsertSplits(filtered_param, &param);
@@ -64,7 +73,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     const int layer_id = -1;  // inputs have fake layer ID -1
     AppendTop(param, layer_id, input_id, &available_blobs, &blob_name_to_idx);
   }
-  DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  DLOG_IF(INFO, Caffe::root_solver())
+      << "Memory required for data: " << memory_used_ * sizeof(Dtype);
   // For each layer, set up its input and output
   bottom_vecs_.resize(param.layer_size());
   top_vecs_.resize(param.layer_size());
@@ -73,6 +83,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   top_id_vecs_.resize(param.layer_size());
   bottom_need_backward_.resize(param.layer_size());
   for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
+    // For non-root solvers, whether this layer is shared from root_net_.
+    bool share_from_root = !Caffe::root_solver()
+        && root_net_->layers_[layer_id]->ShareInParallel();
     // Inherit phase from net if unset.
     if (!param.layer(layer_id).has_phase()) {
       param.mutable_layer(layer_id)->set_phase(phase_);
@@ -85,9 +98,17 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
           << "propagate_down param must be specified "
           << "either 0 or bottom_size times ";
     }
-    layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
+    if (share_from_root) {
+      LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net";
+      layers_.push_back(root_net_->layers_[layer_id]);
+      layers_[layer_id]->SetShared(true);
+    } else {
+      layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
+    }
     layer_names_.push_back(layer_param.name());
-    LOG(INFO) << "Creating Layer " << layer_param.name();
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Creating Layer " << layer_param.name();
+    }
     bool need_backward = false;
 
     // Figure out this layer's input and output
@@ -117,20 +138,42 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
       }
     }
     // After this layer is connected, set it up.
-    LOG(INFO) << "Setting up " << layer_names_[layer_id];
-    layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
+    if (share_from_root) {
+      // Set up size of top blobs using root_net_
+      const vector<Blob<Dtype>*>& base_top = root_net_->top_vecs_[layer_id];
+      const vector<Blob<Dtype>*>& this_top = this->top_vecs_[layer_id];
+      for (int top_id = 0; top_id < base_top.size(); ++top_id) {
+        this_top[top_id]->ReshapeLike(*base_top[top_id]);
+        LOG(INFO) << "Created top blob " << top_id << " (shape: "
+            << this_top[top_id]->shape_string() <<  ") for shared layer "
+            << layer_param.name();
+      }
+    } else {
+      layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
+    }
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "Setting up " << layer_names_[layer_id];
+    }
     for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
       if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
         blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
       }
       blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
-      LOG(INFO) << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
+      if (Caffe::root_solver()) {
+        LOG(INFO) << "Top shape: "
+                  << top_vecs_[layer_id][top_id]->shape_string();
+      }
       if (layer->loss(top_id)) {
-        LOG(INFO) << "    with loss weight " << layer->loss(top_id);
+        if (Caffe::root_solver()) {
+          LOG(INFO) << "    with loss weight " << layer->loss(top_id);
+        }
       }
       memory_used_ += top_vecs_[layer_id][top_id]->count();
     }
-    DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+    if (Caffe::root_solver()) {
+      DLOG(INFO) << "Memory required for data: "
+                 << memory_used_ * sizeof(Dtype);
+    }
     const int param_size = layer_param.param_size();
     const int num_param_blobs = layers_[layer_id]->blobs().size();
     CHECK_LE(param_size, num_param_blobs)
@@ -139,7 +182,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
       const ParamSpec* param_spec = (param_id < param_size) ?
           &layer_param.param(param_id) : &default_param_spec;
-      const bool param_need_backward = param_spec->lr_mult() > 0;
+      const bool param_need_backward = param_spec->lr_mult() != 0;
       need_backward |= param_need_backward;
       layers_[layer_id]->set_param_propagate_down(param_id,
                                                   param_need_backward);
@@ -189,10 +232,14 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     }
     if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
     if (layer_need_backward_[layer_id]) {
-      LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
+      if (Caffe::root_solver()) {
+        LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
+      }
     } else {
-      LOG(INFO) << layer_names_[layer_id]
-                << " does not need backward computation.";
+      if (Caffe::root_solver()) {
+        LOG(INFO) << layer_names_[layer_id]
+                  << " does not need backward computation.";
+      }
     }
     for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
          ++bottom_id) {
@@ -232,7 +279,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // In the end, all remaining blobs are considered output blobs.
   for (set<string>::iterator it = available_blobs.begin();
       it != available_blobs.end(); ++it) {
-    LOG(INFO) << "This network produces output " << *it;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "This network produces output " << *it;
+    }
     net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
     net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
   }
@@ -242,10 +291,12 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
     layer_names_index_[layer_names_[layer_id]] = layer_id;
   }
-  GetLearningRateAndWeightDecay();
+  ShareWeights();
   debug_info_ = param.debug_info();
-  LOG(INFO) << "Network initialization done.";
-  LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  if (Caffe::root_solver()) {
+    LOG(INFO) << "Network initialization done.";
+    LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  }
 }
 
 template <typename Dtype>
@@ -284,27 +335,33 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
   // Check whether the rule is broken due to phase.
   if (rule.has_phase()) {
       if (rule.phase() != state.phase()) {
-        LOG(INFO) << "The NetState phase (" << state.phase()
-          << ") differed from the phase (" << rule.phase()
-          << ") specified by a rule in layer " << layer_name;
+        if (Caffe::root_solver()) {
+          LOG(INFO) << "The NetState phase (" << state.phase()
+                    << ") differed from the phase (" << rule.phase()
+                    << ") specified by a rule in layer " << layer_name;
+        }
         return false;
       }
   }
   // Check whether the rule is broken due to min level.
   if (rule.has_min_level()) {
     if (state.level() < rule.min_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the min_level (" << rule.min_level()
-          << ") specified by a rule in layer " << layer_name;
+      if (Caffe::root_solver()) {
+        LOG(INFO) << "The NetState level (" << state.level()
+                  << ") is above the min_level (" << rule.min_level()
+                  << ") specified by a rule in layer " << layer_name;
+      }
       return false;
     }
   }
   // Check whether the rule is broken due to max level.
   if (rule.has_max_level()) {
     if (state.level() > rule.max_level()) {
-      LOG(INFO) << "The NetState level (" << state.level()
-          << ") is above the max_level (" << rule.max_level()
-          << ") specified by a rule in layer " << layer_name;
+      if (Caffe::root_solver()) {
+        LOG(INFO) << "The NetState level (" << state.level()
+                  << ") is above the max_level (" << rule.max_level()
+                  << ") specified by a rule in layer " << layer_name;
+      }
       return false;
     }
   }
@@ -317,8 +374,10 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (!has_stage) {
-      LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
-                << "' specified by a rule in layer " << layer_name;
+      if (Caffe::root_solver()) {
+        LOG(INFO) << "The NetState did not contain stage '" << rule.stage(i)
+                  << "' specified by a rule in layer " << layer_name;
+      }
       return false;
     }
   }
@@ -331,8 +390,10 @@ bool Net<Dtype>::StateMeetsRule(const NetState& state,
       if (rule.not_stage(i) == state.stage(j)) { has_stage = true; }
     }
     if (has_stage) {
-      LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
-                << "' specified by a rule in layer " << layer_name;
+      if (Caffe::root_solver()) {
+        LOG(INFO) << "The NetState contained a not_stage '" << rule.not_stage(i)
+                  << "' specified by a rule in layer " << layer_name;
+      }
       return false;
     }
   }
@@ -354,20 +415,25 @@ void Net<Dtype>::AppendTop(const NetParameter& param, const int layer_id,
   if (blob_name_to_idx && layer_param && layer_param->bottom_size() > top_id &&
       blob_name == layer_param->bottom(top_id)) {
     // In-place computation
-    LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
+    if (Caffe::root_solver()) {
+      LOG(INFO) << layer_param->name() << " -> " << blob_name << " (in-place)";
+    }
     top_vecs_[layer_id].push_back(blobs_[(*blob_name_to_idx)[blob_name]].get());
     top_id_vecs_[layer_id].push_back((*blob_name_to_idx)[blob_name]);
   } else if (blob_name_to_idx &&
              blob_name_to_idx->find(blob_name) != blob_name_to_idx->end()) {
     // If we are not doing in-place computation but have duplicated blobs,
     // raise an error.
-    LOG(FATAL) << "Duplicate blobs produced by multiple sources.";
+    LOG(FATAL) << "Top blob '" << blob_name
+               << "' produced by multiple sources.";
   } else {
     // Normal output.
-    if (layer_param) {
-      LOG(INFO) << layer_param->name() << " -> " << blob_name;
-    } else {
-      LOG(INFO) << "Input " << top_id << " -> " << blob_name;
+    if (Caffe::root_solver()) {
+      if (layer_param) {
+        LOG(INFO) << layer_param->name() << " -> " << blob_name;
+      } else {
+        LOG(INFO) << "Input " << top_id << " -> " << blob_name;
+      }
     }
     shared_ptr<Blob<Dtype> > blob_pointer(new Blob<Dtype>());
     const int blob_id = blobs_.size();
@@ -403,11 +469,13 @@ int Net<Dtype>::AppendBottom(const NetParameter& param, const int layer_id,
   const LayerParameter& layer_param = param.layer(layer_id);
   const string& blob_name = layer_param.bottom(bottom_id);
   if (available_blobs->find(blob_name) == available_blobs->end()) {
-    LOG(FATAL) << "Unknown blob input " << blob_name
-               << " (at index " << bottom_id << ") to layer " << layer_id;
+    LOG(FATAL) << "Unknown bottom blob '" << blob_name << "' (layer '"
+               << layer_param.name() << "', bottom index " << bottom_id << ")";
   }
   const int blob_id = (*blob_name_to_idx)[blob_name];
-  LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
+  if (Caffe::root_solver()) {
+    LOG(INFO) << layer_names_[layer_id] << " <- " << blob_name;
+  }
   bottom_vecs_[layer_id].push_back(blobs_[blob_id].get());
   bottom_id_vecs_[layer_id].push_back(blob_id);
   available_blobs->erase(blob_name);
@@ -439,6 +507,9 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
   params_.push_back(layers_[layer_id]->blobs()[param_id]);
   param_id_vecs_[layer_id].push_back(net_param_id);
   param_layer_indices_.push_back(make_pair(layer_id, param_id));
+  ParamSpec default_param_spec;
+  const ParamSpec* param_spec = (layer_param.param_size() > param_id) ?
+      &layer_param.param(param_id) : &default_param_spec;
   if (!param_size || !param_name.size() || (param_name.size() &&
       param_names_index_.find(param_name) == param_names_index_.end())) {
     // This layer "owns" this parameter blob -- it is either anonymous
@@ -448,6 +519,13 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
     if (param_name.size()) {
       param_names_index_[param_name] = net_param_id;
     }
+    const int learnable_param_id = learnable_params_.size();
+    learnable_params_.push_back(params_[net_param_id].get());
+    learnable_param_ids_.push_back(learnable_param_id);
+    has_params_lr_.push_back(param_spec->has_lr_mult());
+    has_params_decay_.push_back(param_spec->has_decay_mult());
+    params_lr_.push_back(param_spec->lr_mult());
+    params_weight_decay_.push_back(param_spec->decay_mult());
   } else {
     // Named param blob with name we've seen before: share params
     const int owner_net_param_id = param_names_index_[param_name];
@@ -456,9 +534,10 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
         param_layer_indices_[owner_net_param_id];
     const int owner_layer_id = owner_index.first;
     const int owner_param_id = owner_index.second;
-    LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
-              << "layer '" << layer_names_[owner_layer_id] << "', param "
-              << "index " << owner_param_id;
+    LOG_IF(INFO, Caffe::root_solver()) << "Sharing parameters '" << param_name
+        << "' owned by "
+        << "layer '" << layer_names_[owner_layer_id] << "', param "
+        << "index " << owner_param_id;
     Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
     Blob<Dtype>* owner_blob =
         layers_[owner_layer_id]->blobs()[owner_param_id].get();
@@ -467,28 +546,40 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
                                   ParamSpec_DimCheckMode_PERMISSIVE)) {
       // Permissive dimension checking -- only check counts are the same.
       CHECK_EQ(this_blob->count(), owner_blob->count())
-          << "Shared parameter blobs must have the same count.";
+          << "Cannot share param '" << param_name << "' owned by layer '"
+          << layer_names_[owner_layer_id] << "' with layer '"
+          << layer_names_[layer_id] << "'; count mismatch.  Owner layer param "
+          << "shape is " << owner_blob->shape_string() << "; sharing layer "
+          << "shape is " << this_blob->shape_string();
     } else {
       // Strict dimension checking -- all dims must be the same.
-      CHECK(this_blob->shape() == owner_blob->shape());
+      CHECK(this_blob->shape() == owner_blob->shape())
+          << "Cannot share param '" << param_name << "' owned by layer '"
+          << layer_names_[owner_layer_id] << "' with layer '"
+          << layer_names_[layer_id] << "'; shape mismatch.  Owner layer param "
+          << "shape is " << owner_blob->shape_string() << "; sharing layer "
+          << "expects shape " << this_blob->shape_string();
     }
-    layers_[layer_id]->blobs()[param_id]->ShareData(
-        *layers_[owner_layer_id]->blobs()[owner_param_id]);
-  }
-}
-
-template <typename Dtype>
-void Net<Dtype>::GetLearningRateAndWeightDecay() {
-  LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
-  ParamSpec default_param_spec;
-  for (int i = 0; i < layers_.size(); ++i) {
-    vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
-    for (int j = 0; j < layer_blobs.size(); ++j) {
-      const ParamSpec* param_spec =
-          (layers_[i]->layer_param().param_size() > j) ?
-          &layers_[i]->layer_param().param(j) : &default_param_spec;
-      params_lr_.push_back(param_spec->lr_mult());
-      params_weight_decay_.push_back(param_spec->decay_mult());
+    const int learnable_param_id = learnable_param_ids_[owner_net_param_id];
+    learnable_param_ids_.push_back(learnable_param_id);
+    if (param_spec->has_lr_mult()) {
+      if (has_params_lr_[learnable_param_id]) {
+        CHECK_EQ(param_spec->lr_mult(), params_lr_[learnable_param_id])
+            << "Shared param '" << param_name << "' has mismatched lr_mult.";
+      } else {
+        has_params_lr_[learnable_param_id] = true;
+        params_lr_[learnable_param_id] = param_spec->lr_mult();
+      }
+    }
+    if (param_spec->has_decay_mult()) {
+      if (has_params_decay_[learnable_param_id]) {
+        CHECK_EQ(param_spec->decay_mult(),
+                 params_weight_decay_[learnable_param_id])
+            << "Shared param '" << param_name << "' has mismatched decay_mult.";
+      } else {
+        has_params_decay_[learnable_param_id] = true;
+        params_weight_decay_[learnable_param_id] = param_spec->decay_mult();
+      }
     }
   }
 }
@@ -581,8 +672,10 @@ void Net<Dtype>::InputDebugInfo(const int input_id) {
   const Blob<Dtype>& blob = *net_input_blobs_[input_id];
   const string& blob_name = blob_names_[net_input_blob_indices_[input_id]];
   const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-  LOG(INFO) << "    [Forward] "
-     << "Input " << blob_name << " data: " << data_abs_val_mean;
+  if (Caffe::root_solver()) {
+    LOG(INFO) << "    [Forward] "
+              << "Input " << blob_name << " data: " << data_abs_val_mean;
+  }
 }
 
 template <typename Dtype>
@@ -591,9 +684,12 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
     const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-       << " data: " << data_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Forward] "
+                << "Layer " << layer_names_[layer_id]
+                << ", top blob " << blob_name
+                << " data: " << data_abs_val_mean;
+    }
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
@@ -601,9 +697,12 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
     const int net_param_id = param_id_vecs_[layer_id][param_id];
     const string& blob_name = param_display_names_[net_param_id];
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Forward] "
-       << "Layer " << layer_names_[layer_id] << ", param blob " << blob_name
-       << " data: " << data_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Forward] "
+                << "Layer " << layer_names_[layer_id]
+                << ", param blob " << blob_name
+                << " data: " << data_abs_val_mean;
+    }
   }
 }
 
@@ -615,18 +714,24 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
     const Blob<Dtype>& blob = *bottom_vec[bottom_id];
     const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-        << " diff: " << diff_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Backward] "
+                << "Layer " << layer_names_[layer_id]
+                << ", bottom blob " << blob_name
+                << " diff: " << diff_abs_val_mean;
+    }
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
     if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
     const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
     const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
-    LOG(INFO) << "    [Backward] "
-        << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-        << " diff: " << diff_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Backward] "
+                << "Layer " << layer_names_[layer_id]
+                << ", param blob " << param_id
+                << " diff: " << diff_abs_val_mean;
+    }
   }
 }
 
@@ -639,17 +744,22 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
   const Dtype diff_abs_val_mean = blob.asum_diff() / blob.count();
   if (param_owner < 0) {
     const Dtype data_abs_val_mean = blob.asum_data() / blob.count();
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param " << param_display_name
-        << " data: " << data_abs_val_mean << "; diff: " << diff_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Update] Layer " << layer_name
+                << ", param " << param_display_name
+                << " data: " << data_abs_val_mean
+                << "; diff: " << diff_abs_val_mean;
+    }
   } else {
     const string& owner_layer_name =
         layer_names_[param_layer_indices_[param_owner].first];
-    LOG(INFO) << "    [Update] Layer " << layer_name
-        << ", param blob " << param_display_name
-        << " (owned by layer " << owner_layer_name << ", "
-        << "param " << param_display_names_[param_owners_[param_id]] << ")"
-        << " diff: " << diff_abs_val_mean;
+    if (Caffe::root_solver()) {
+      LOG(INFO) << "    [Update] Layer " << layer_name
+                << ", param blob " << param_display_name
+                << " (owned by layer " << owner_layer_name << ", " << "param "
+                << param_display_names_[param_owners_[param_id]] << ")"
+                << " diff: " << diff_abs_val_mean;
+    }
   }
 }
 
@@ -675,7 +785,11 @@ void Net<Dtype>::ShareTrainedLayersWith(const Net* other) {
         << "Incompatible number of blobs for layer " << source_layer_name;
     for (int j = 0; j < target_blobs.size(); ++j) {
       Blob<Dtype>* source_blob = source_layer->blobs()[j].get();
-      CHECK(target_blobs[j]->shape() == source_blob->shape());
+      CHECK(target_blobs[j]->shape() == source_blob->shape())
+          << "Cannot share param " << j << " weights from layer '"
+          << source_layer_name << "'; shape mismatch.  Source param shape is "
+          << source_blob->shape_string() << "; target param shape is "
+          << target_blobs[j]->shape_string();
       target_blobs[j]->ShareData(*source_blob);
     }
   }
@@ -706,8 +820,8 @@ void Net<Dtype>::Backward() {
     const Dtype l2norm_data = std::sqrt(sumsq_data);
     const Dtype l2norm_diff = std::sqrt(sumsq_diff);
     LOG(ERROR) << "    [Backward] All net params (data, diff): "
-        << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
-        << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
+               << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
+               << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
   }
 }
 
@@ -739,6 +853,17 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
     CHECK_EQ(target_blobs.size(), source_layer.blobs_size())
         << "Incompatible number of blobs for layer " << source_layer_name;
     for (int j = 0; j < target_blobs.size(); ++j) {
+      if (!target_blobs[j]->ShapeEquals(source_layer.blobs(j))) {
+        Blob<Dtype> source_blob;
+        const bool kReshape = true;
+        source_blob.FromProto(source_layer.blobs(j), kReshape);
+        LOG(FATAL) << "Cannot copy param " << j << " weights from layer '"
+            << source_layer_name << "'; shape mismatch.  Source param shape is "
+            << source_blob.shape_string() << "; target param shape is "
+            << target_blobs[j]->shape_string() << ". "
+            << "To learn this layer's parameters from scratch rather than "
+            << "copying from a saved net, rename the layer.";
+      }
       const bool kReshape = false;
       target_blobs[j]->FromProto(source_layer.blobs(j), kReshape);
     }
@@ -747,12 +872,73 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 
 template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
+  if (trained_filename.size() >= 3 &&
+      trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) {
+    CopyTrainedLayersFromHDF5(trained_filename);
+  } else {
+    CopyTrainedLayersFromBinaryProto(trained_filename);
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::CopyTrainedLayersFromBinaryProto(
+    const string trained_filename) {
   NetParameter param;
   ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
   CopyTrainedLayersFrom(param);
 }
 
 template <typename Dtype>
+void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) {
+  hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY,
+                           H5P_DEFAULT);
+  CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename;
+  hid_t data_hid = H5Gopen2(file_hid, "data", H5P_DEFAULT);
+  CHECK_GE(data_hid, 0) << "Error reading weights from " << trained_filename;
+  int num_layers = hdf5_get_num_links(data_hid);
+  for (int i = 0; i < num_layers; ++i) {
+    string source_layer_name = hdf5_get_name_by_idx(data_hid, i);
+    if (!layer_names_index_.count(source_layer_name)) {
+      DLOG(INFO) << "Ignoring source layer " << source_layer_name;
+      continue;
+    }
+    int target_layer_id = layer_names_index_[source_layer_name];
+    DLOG(INFO) << "Copying source layer " << source_layer_name;
+    vector<shared_ptr<Blob<Dtype> > >& target_blobs =
+        layers_[target_layer_id]->blobs();
+    hid_t layer_hid = H5Gopen2(data_hid, source_layer_name.c_str(),
+        H5P_DEFAULT);
+    CHECK_GE(layer_hid, 0)
+        << "Error reading weights from " << trained_filename;
+    // Check that source layer doesn't have more params than target layer
+    int num_source_params = hdf5_get_num_links(layer_hid);
+    CHECK_LE(num_source_params, target_blobs.size())
+        << "Incompatible number of blobs for layer " << source_layer_name;
+    for (int j = 0; j < target_blobs.size(); ++j) {
+      ostringstream oss;
+      oss << j;
+      string dataset_name = oss.str();
+      int target_net_param_id = param_id_vecs_[target_layer_id][j];
+      if (!H5Lexists(layer_hid, dataset_name.c_str(), H5P_DEFAULT)) {
+        // Target param doesn't exist in source weights...
+        if (param_owners_[target_net_param_id] != -1) {
+          // ...but it's weight-shared in target, so that's fine.
+          continue;
+        } else {
+          LOG(FATAL) << "Incompatible number of blobs for layer "
+              << source_layer_name;
+        }
+      }
+      hdf5_load_nd_dataset(layer_hid, dataset_name.c_str(), 0, kMaxBlobAxes,
+          target_blobs[j].get());
+    }
+    H5Gclose(layer_hid);
+  }
+  H5Gclose(data_hid);
+  H5Fclose(file_hid);
+}
+
+template <typename Dtype>
 void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
   param->Clear();
   param->set_name(name_);
@@ -763,51 +949,101 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
   DLOG(INFO) << "Serializing " << layers_.size() << " layers";
   for (int i = 0; i < layers_.size(); ++i) {
     LayerParameter* layer_param = param->add_layer();
-    for (int j = 0; j < bottom_id_vecs_[i].size(); ++j) {
-      layer_param->add_bottom(blob_names_[bottom_id_vecs_[i][j]]);
+    layers_[i]->ToProto(layer_param, write_diff);
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const {
+  hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
+      H5P_DEFAULT);
+  CHECK_GE(file_hid, 0)
+      << "Couldn't open " << filename << " to save weights.";
+  hid_t data_hid = H5Gcreate2(file_hid, "data", H5P_DEFAULT, H5P_DEFAULT,
+      H5P_DEFAULT);
+  CHECK_GE(data_hid, 0) << "Error saving weights to " << filename << ".";
+  hid_t diff_hid = -1;
+  if (write_diff) {
+    diff_hid = H5Gcreate2(file_hid, "diff", H5P_DEFAULT, H5P_DEFAULT,
+        H5P_DEFAULT);
+    CHECK_GE(diff_hid, 0) << "Error saving weights to " << filename << ".";
+  }
+  for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
+    const LayerParameter& layer_param = layers_[layer_id]->layer_param();
+    string layer_name = layer_param.name();
+    hid_t layer_data_hid = H5Gcreate2(data_hid, layer_name.c_str(),
+        H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    CHECK_GE(layer_data_hid, 0)
+        << "Error saving weights to " << filename << ".";
+    hid_t layer_diff_hid = -1;
+    if (write_diff) {
+      layer_diff_hid = H5Gcreate2(diff_hid, layer_name.c_str(),
+          H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+      CHECK_GE(layer_diff_hid, 0)
+          << "Error saving weights to " << filename << ".";
     }
-    for (int j = 0; j < top_id_vecs_[i].size(); ++j) {
-      layer_param->add_top(blob_names_[top_id_vecs_[i][j]]);
+    int num_params = layers_[layer_id]->blobs().size();
+    for (int param_id = 0; param_id < num_params; ++param_id) {
+      ostringstream dataset_name;
+      dataset_name << param_id;
+      const int net_param_id = param_id_vecs_[layer_id][param_id];
+      if (param_owners_[net_param_id] == -1) {
+        // Only save params that own themselves
+        hdf5_save_nd_dataset<Dtype>(layer_data_hid, dataset_name.str(),
+            *params_[net_param_id]);
+      }
+      if (write_diff) {
+        // Write diffs regardless of weight-sharing
+        hdf5_save_nd_dataset<Dtype>(layer_diff_hid, dataset_name.str(),
+            *params_[net_param_id], true);
+      }
     }
-    layers_[i]->ToProto(layer_param, write_diff);
+    H5Gclose(layer_data_hid);
+    if (write_diff) {
+      H5Gclose(layer_diff_hid);
+    }
+  }
+  H5Gclose(data_hid);
+  if (write_diff) {
+    H5Gclose(diff_hid);
   }
+  H5Fclose(file_hid);
 }
 
 template <typename Dtype>
 void Net<Dtype>::Update() {
-  // First, accumulate the diffs of any shared parameters into their owner's
-  // diff. (Assumes that the learning rate, weight decay, etc. have already been
-  // accounted for in the current diff.)
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    const int count = params_[i]->count();
-    const Dtype* this_diff;
-    Dtype* owner_diff;
+  for (int i = 0; i < learnable_params_.size(); ++i) {
+    learnable_params_[i]->Update();
+  }
+}
+
+template <typename Dtype>
+void Net<Dtype>::ClearParamDiffs() {
+  for (int i = 0; i < learnable_params_.size(); ++i) {
+    Blob<Dtype>* blob = learnable_params_[i];
     switch (Caffe::mode()) {
     case Caffe::CPU:
-      this_diff = params_[i]->cpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-      caffe_add(count, this_diff, owner_diff, owner_diff);
+      caffe_set(blob->count(), static_cast<Dtype>(0),
+                blob->mutable_cpu_diff());
       break;
     case Caffe::GPU:
 #ifndef CPU_ONLY
-      this_diff = params_[i]->gpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
-      caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
+      caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+                    blob->mutable_gpu_diff());
 #else
       NO_GPU;
 #endif
       break;
-    default:
-      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
     }
   }
-  // Now, update the owned parameters.
+}
+
+template <typename Dtype>
+void Net<Dtype>::ShareWeights() {
   for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] >= 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    params_[i]->Update();
+    if (param_owners_[i] < 0) { continue; }
+    params_[i]->ShareData(*params_[param_owners_[i]]);
+    params_[i]->ShareDiff(*params_[param_owners_[i]]);
   }
 }
 
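With GetLearningRateAndWeightDecay() gone, shared parameters are wired up once in ShareWeights(): every non-owner parameter blob aliases its owner's data and diff, so gradient contributions from all sharing layers accumulate in one buffer and Update() only steps the deduplicated learnable_params_. Below is a minimal standalone sketch of that aliasing; it is not Caffe code, and MiniBlob is an illustrative stand-in for Blob<Dtype>.

    // Standalone sketch (not Caffe code): a non-owner parameter blob aliases
    // the owner's data and diff storage, so gradients from both layers land in
    // one buffer and a single Update() step applies to both views.
    #include <iostream>
    #include <memory>
    #include <vector>

    struct MiniBlob {
      std::shared_ptr<std::vector<float> > data, diff;
      explicit MiniBlob(int n)
          : data(new std::vector<float>(n, 1.0f)),
            diff(new std::vector<float>(n, 0.0f)) {}
      void ShareData(const MiniBlob& owner) { data = owner.data; }
      void ShareDiff(const MiniBlob& owner) { diff = owner.diff; }
      void Update() {                            // data -= diff
        for (size_t i = 0; i < data->size(); ++i) (*data)[i] -= (*diff)[i];
      }
    };

    int main() {
      MiniBlob owner(2), sharer(2);
      sharer.ShareData(owner);   // what ShareWeights() does for every param
      sharer.ShareDiff(owner);   // blob that has an owner
      (*owner.diff)[0] += 0.5f;  // gradient contribution from the owning layer
      (*sharer.diff)[0] += 0.5f; // gradient contribution from the sharing layer
      owner.Update();            // one step updates both views
      std::cout << (*sharer.data)[0] << std::endl;  // prints 0 (= 1 - 0.5 - 0.5)
      return 0;
    }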
diff --git a/src/caffe/parallel.cpp b/src/caffe/parallel.cpp
new file mode 100644
index 0000000..a6d154e
--- /dev/null
+++ b/src/caffe/parallel.cpp
@@ -0,0 +1,441 @@
+#ifndef CPU_ONLY
+#include <cuda_runtime.h>
+#endif
+#include <glog/logging.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <cstdlib>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "boost/thread.hpp"
+#include "caffe/caffe.hpp"
+#include "caffe/parallel.hpp"
+
+namespace caffe {
+
+enum Op {
+  copy,
+  replace_cpu,
+  replace_gpu,
+  replace_cpu_diff,
+  replace_gpu_diff
+};
+
+template<typename Dtype>
+static void apply_buffers(const vector<Blob<Dtype>*>& blobs,
+                          Dtype* buffer, size_t total_size, Op op) {
+  Dtype* ptr = buffer;
+  for (int i = 0; i < blobs.size(); ++i) {
+    int size = blobs[i]->count();
+    switch (op) {
+      case copy: {
+        // Init buffer to current values of blobs
+        caffe_copy(size,
+                   reinterpret_cast<const Dtype*>(blobs[i]->data()->cpu_data()),
+                   ptr);
+        break;
+      }
+      case replace_cpu:
+        blobs[i]->data()->set_cpu_data(ptr);
+        break;
+      case replace_gpu:
+        blobs[i]->data()->set_gpu_data(ptr);
+        break;
+      case replace_cpu_diff:
+        blobs[i]->diff()->set_cpu_data(ptr);
+        break;
+      case replace_gpu_diff:
+        blobs[i]->diff()->set_gpu_data(ptr);
+        break;
+    }
+    ptr += size;
+  }
+  // total_size is at least one byte
+  CHECK_EQ(total_size, (ptr == buffer ? 1 : ptr - buffer));
+}
+
+// Buffer size necessary to store given blobs
+template<typename Dtype>
+static size_t total_size(const vector<Blob<Dtype>*>& params) {
+  size_t size = 0;
+  for (int i = 0; i < params.size(); ++i)
+    size += params[i]->count();
+  // Size must be at least one byte, otherwise cudaMalloc fails if the net
+  // has no learnable parameters.
+  return (size > 0) ? size : 1;
+}
+
+template<typename Dtype>
+Params<Dtype>::Params(shared_ptr<Solver<Dtype> > root_solver)
+    : size_(total_size<Dtype>(root_solver->net()->learnable_params())),
+      data_(),
+      diff_() {
+}
+
+template<typename Dtype>
+GPUParams<Dtype>::GPUParams(shared_ptr<Solver<Dtype> > root_solver, int device)
+    : Params<Dtype>(root_solver) {
+#ifndef CPU_ONLY
+  int initial_device;
+  CUDA_CHECK(cudaGetDevice(&initial_device));
+
+  // Allocate device buffers
+  CUDA_CHECK(cudaSetDevice(device));
+  CUDA_CHECK(cudaMalloc(&data_, size_ * sizeof(Dtype)));
+
+  // Copy blob values
+  const vector<Blob<Dtype>*>& net =
+      root_solver->net()->learnable_params();
+  apply_buffers(net, data_, size_, copy);
+
+  CUDA_CHECK(cudaMalloc(&diff_, size_ * sizeof(Dtype)));
+  caffe_gpu_set(size_, Dtype(0), diff_);
+
+  CUDA_CHECK(cudaSetDevice(initial_device));
+#else
+  NO_GPU;
+#endif
+}
+
+template<typename Dtype>
+GPUParams<Dtype>::~GPUParams() {
+#ifndef CPU_ONLY
+  CUDA_CHECK(cudaFree(data_));
+  CUDA_CHECK(cudaFree(diff_));
+#endif
+}
+
+template<typename Dtype>
+void GPUParams<Dtype>::configure(Solver<Dtype>* solver) const {
+  const vector<Blob<Dtype>*>& net =
+      solver->net()->learnable_params();
+  apply_buffers(net, data_, size_, replace_gpu);
+  apply_buffers(net, diff_, size_, replace_gpu_diff);
+}
+
+void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
+#ifndef CPU_ONLY
+  vector<int> remaining(devices);
+
+  // Depth for reduction tree
+  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));
+
+  // Group GPUs by board
+  for (int d = 0; d < remaining_depth; ++d) {
+    for (int i = 0; i < remaining.size(); ++i) {
+      for (int j = i + 1; j < remaining.size(); ++j) {
+        cudaDeviceProp a, b;
+        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
+        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
+        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
+          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
+            pairs->push_back(DevicePair(remaining[i], remaining[j]));
+            DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
+            remaining.erase(remaining.begin() + j);
+            break;
+          }
+        }
+      }
+    }
+  }
+  ostringstream s;
+  for (int i = 0; i < remaining.size(); ++i) {
+    s << (i ? ", " : "") << remaining[i];
+  }
+  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();
+
+  // Group by P2P accessibility
+  remaining_depth = ceil(log2(remaining.size()));
+  for (int d = 0; d < remaining_depth; ++d) {
+    for (int i = 0; i < remaining.size(); ++i) {
+      for (int j = i + 1; j < remaining.size(); ++j) {
+        int access;
+        CUDA_CHECK(
+            cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
+        if (access) {
+          pairs->push_back(DevicePair(remaining[i], remaining[j]));
+          DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
+          remaining.erase(remaining.begin() + j);
+          break;
+        }
+      }
+    }
+  }
+  s.str("");
+  for (int i = 0; i < remaining.size(); ++i) {
+    s << (i ? ", " : "") << remaining[i];
+  }
+  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();
+
+  // Group remaining
+  remaining_depth = ceil(log2(remaining.size()));
+  for (int d = 0; d < remaining_depth; ++d) {
+    for (int i = 0; i < remaining.size(); ++i) {
+      pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
+      DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
+                 << remaining[i + 1];
+      remaining.erase(remaining.begin() + i + 1);
+    }
+  }
+
+  // Should only be the parent node remaining
+  CHECK_EQ(remaining.size(), 1);
+
+  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));
+
+  CHECK(pairs->size() == devices.size());
+  for (int i = 0; i < pairs->size(); ++i) {
+    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
+    for (int j = i + 1; j < pairs->size(); ++j) {
+      CHECK((*pairs)[i].device() != (*pairs)[j].device());
+    }
+  }
+#else
+  NO_GPU;
+#endif
+}
+
+//
+
+template<typename Dtype>
+P2PSync<Dtype>::P2PSync(shared_ptr<Solver<Dtype> > root_solver,
+                        P2PSync<Dtype>* parent, const SolverParameter& param)
+    : GPUParams<Dtype>(root_solver, param.device_id()),
+      parent_(parent),
+      children_(),
+      queue_(),
+      initial_iter_(root_solver->iter()),
+      solver_() {
+#ifndef CPU_ONLY
+  int initial_device;
+  CUDA_CHECK(cudaGetDevice(&initial_device));
+  const int self = param.device_id();
+  CUDA_CHECK(cudaSetDevice(self));
+
+  if (parent == NULL) {
+    solver_ = root_solver;
+  } else {
+    Caffe::set_root_solver(false);
+    solver_.reset(new WorkerSolver<Dtype>(param, root_solver.get()));
+    Caffe::set_root_solver(true);
+  }
+  this->configure(solver_.get());
+  solver_->add_callback(this);
+
+  if (parent) {
+    // Enable p2p access between devices
+    const int peer = parent->solver_->param().device_id();
+    int access;
+    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
+    if (access) {
+      CUDA_CHECK(cudaDeviceEnablePeerAccess(peer, 0));
+    } else {
+      LOG(INFO)<< "GPU " << self << " does not have p2p access to GPU " << peer;
+    }
+    // Allocate receiving buffer on parent
+    CUDA_CHECK(cudaSetDevice(peer));
+    CUDA_CHECK(cudaMalloc(&parent_grads_, size_ * sizeof(Dtype)));
+    CUDA_CHECK(cudaSetDevice(self));
+  }
+
+  CUDA_CHECK(cudaSetDevice(initial_device));
+#else
+  NO_GPU;
+#endif
+}
+
+template<typename Dtype>
+P2PSync<Dtype>::~P2PSync() {
+#ifndef CPU_ONLY
+  int initial_device;
+  CUDA_CHECK(cudaGetDevice(&initial_device));
+  const int self = solver_->param().device_id();
+  CUDA_CHECK(cudaSetDevice(self));
+
+  if (parent_) {
+    CUDA_CHECK(cudaFree(parent_grads_));
+    const int peer = parent_->solver_->param().device_id();
+    int access;
+    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
+    if (access) {
+      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
+    }
+  }
+
+  CUDA_CHECK(cudaSetDevice(initial_device));
+#endif
+}
+
+template<typename Dtype>
+void P2PSync<Dtype>::InternalThreadEntry() {
+  Caffe::SetDevice(solver_->param().device_id());
+  CHECK(Caffe::root_solver());
+  Caffe::set_root_solver(false);
+  // See if there is a defined seed and reset random state if so
+  if (solver_->param().random_seed() >= 0) {
+    // Fetch the random seed and modulate it by device ID so that not every
+    // solver ends up with the same seed; we have seen some solver
+    // instability when all devices use the same seed.
+    Caffe::set_random_seed(
+        solver_->param().random_seed() + solver_->param().device_id());
+  }
+  solver_->Step(solver_->param().max_iter() - initial_iter_);
+}
+
+template<typename Dtype>
+void P2PSync<Dtype>::on_start() {
+#ifndef CPU_ONLY
+#ifdef DEBUG
+  int device;
+  CUDA_CHECK(cudaGetDevice(&device));
+  CHECK(device == solver_->param().device_id());
+#else
+//  CHECK(false);
+#endif
+
+  // Wait for update from parent
+  if (parent_) {
+    P2PSync<Dtype> *parent = queue_.pop();
+    CHECK(parent == parent_);
+  }
+
+  // Update children
+  for (int i = children_.size() - 1; i >= 0; i--) {
+    Dtype* src = data_;
+    Dtype* dst = children_[i]->data_;
+
+#ifdef DEBUG
+    cudaPointerAttributes attributes;
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
+    CHECK(attributes.device == device);
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
+    CHECK(attributes.device == children_[i]->solver_->param().device_id());
+#endif
+
+    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),
+        cudaMemcpyDeviceToDevice, cudaStreamDefault));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
+    children_[i]->queue_.push(this);
+  }
+#endif
+}
+
+template<typename Dtype>
+void P2PSync<Dtype>::on_gradients_ready() {
+#ifndef CPU_ONLY
+#ifdef DEBUG
+  int device;
+  CUDA_CHECK(cudaGetDevice(&device));
+  CHECK(device == solver_->param().device_id());
+#endif
+
+  // Sum children gradients as they appear in the queue
+  for (int i = 0; i < children_.size(); ++i) {
+    P2PSync<Dtype> *child = queue_.pop();
+    Dtype* src = child->parent_grads_;
+    Dtype* dst = diff_;
+
+#ifdef DEBUG
+    bool ok = false;
+    for (int j = 0; j < children_.size(); ++j) {
+      if (child == children_[j]) {
+        ok = true;
+      }
+    }
+    CHECK(ok);
+    cudaPointerAttributes attributes;
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
+    CHECK(attributes.device == device);
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
+    CHECK(attributes.device == device);
+#endif
+
+    caffe_gpu_add(size_, src, dst, dst);
+  }
+
+  // Send gradients to parent
+  if (parent_) {
+    Dtype* src = diff_;
+    Dtype* dst = parent_grads_;
+
+#ifdef DEBUG
+    cudaPointerAttributes attributes;
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
+    CHECK(attributes.device == device);
+    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
+    CHECK(attributes.device == parent_->solver_->param().device_id());
+#endif
+
+    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
+        cudaMemcpyDeviceToDevice, cudaStreamDefault));
+    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
+    parent_->queue_.push(this);
+  } else {
+    // Loss functions divide gradients by the batch size, so to compensate
+    // for the split batch, the root solver divides by the number of solvers.
+    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
+  }
+#endif
+}
+
+template<typename Dtype>
+void P2PSync<Dtype>::run(const vector<int>& gpus) {
+  // Pair devices for map-reduce synchronization
+  vector<DevicePair> pairs;
+  DevicePair::compute(gpus, &pairs);
+  ostringstream s;
+  for (int i = 1; i < pairs.size(); ++i) {
+    s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device();
+  }
+  LOG(INFO)<< "GPU pairs " << s.str();
+
+  SolverParameter param(solver_->param());
+  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
+
+  // Build the GPU tree by finding the parent for each solver
+  for (int attempts = 0; attempts < pairs.size(); ++attempts) {
+    for (int i = 1; i < pairs.size(); ++i) {
+      if (!syncs[i].get()) {
+        P2PSync<Dtype>* parent = NULL;
+        for (int j = 0; j < syncs.size(); ++j) {
+          P2PSync<Dtype>* sync = j == 0 ? this : syncs[j].get();
+          if (sync) {
+            const SolverParameter& p = sync->solver()->param();
+            if (p.device_id() == pairs[i].parent()) {
+              parent = sync;
+            }
+          }
+        }
+        if (parent) {
+          param.set_device_id(pairs[i].device());
+          syncs[i].reset(new P2PSync<Dtype>(solver_, parent, param));
+          parent->children_.push_back((P2PSync<Dtype>*) syncs[i].get());
+        }
+      }
+    }
+  }
+
+  LOG(INFO)<< "Starting Optimization";
+
+  for (int i = 1; i < syncs.size(); ++i) {
+    syncs[i]->StartInternalThread();
+  }
+
+  // Run root solver on current thread
+  solver_->Solve();
+
+  for (int i = 1; i < syncs.size(); ++i) {
+    syncs[i]->StopInternalThread();
+  }
+}
+
+INSTANTIATE_CLASS(Params);
+INSTANTIATE_CLASS(GPUParams);
+INSTANTIATE_CLASS(P2PSync);
+
+}  // namespace caffe
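parallel.cpp implements a tree-shaped map-reduce over GPUs: DevicePair::compute() pairs devices (same board first, then P2P-reachable devices, then the rest), on_gradients_ready() sums each child's gradients into its parent while the root scales the total by 1/solver_count, and on_start() pushes the updated weights back down the tree. The following standalone sketch of that reduce/broadcast flow is not Caffe code; the Node struct, tree shape and values are illustrative.

    // Standalone sketch (not Caffe code) of the reduce/broadcast flow: gradients
    // are summed up the tree, the root scales by 1/solver_count (each solver's
    // loss is already divided by its own batch size), and the result is pushed
    // back down.
    #include <iostream>
    #include <vector>

    struct Node {
      double grad;                   // this solver's local gradient
      std::vector<Node*> children;
    };

    double Reduce(Node* n) {         // like on_gradients_ready(): sum children
      double sum = n->grad;
      for (size_t i = 0; i < n->children.size(); ++i) sum += Reduce(n->children[i]);
      return sum;
    }

    void Broadcast(Node* n, double value) {  // like on_start(): push values down
      n->grad = value;
      for (size_t i = 0; i < n->children.size(); ++i) Broadcast(n->children[i], value);
    }

    int main() {
      Node root = {4.0}, a = {2.0}, b = {6.0}, c = {8.0};
      root.children.push_back(&a);
      root.children.push_back(&b);
      a.children.push_back(&c);
      const int solver_count = 4;
      double avg = Reduce(&root) / solver_count;   // (4 + 2 + 6 + 8) / 4 = 5
      Broadcast(&root, avg);
      std::cout << "averaged gradient: " << c.grad << std::endl;  // prints 5
      return 0;
    }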
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 81a8c69..aa299f8 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -11,6 +11,8 @@ message BlobProto {
   optional BlobShape shape = 7;
   repeated float data = 5 [packed = true];
   repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
 
   // 4D dimensions -- deprecated.  Use "shape" instead.
   optional int32 num = 1 [default = 0];
@@ -96,7 +98,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 37 (last added: iter_size)
+// SolverParameter next available ID: 40 (last added: momentum2)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -151,7 +153,23 @@ message SolverParameter {
   optional int32 max_iter = 7; // the maximum number of iterations
   // accumulate gradients over `iter_size` x `batch_size` instances
   optional int32 iter_size = 36 [default = 1];
-  optional string lr_policy = 8; // The learning rate decay policy.
+
+  // The learning rate decay policy. The currently implemented learning rate
+  // policies are as follows:
+  //    - fixed: always return base_lr.
+  //    - step: return base_lr * gamma ^ (floor(iter / step))
+  //    - exp: return base_lr * gamma ^ iter
+  //    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
+  //    - multistep: similar to step but it allows non-uniform steps defined by
+  //      stepvalue
+  //    - poly: the effective learning rate follows a polynomial decay, to be
+  //      zero by the max_iter. return base_lr * (1 - iter/max_iter) ^ (power)
+  //    - sigmoid: the effective learning rate follows a sigmoid decay
+  //      return base_lr * ( 1/(1 + exp(-gamma * (iter - stepsize))))
+  //
+  // where base_lr, max_iter, gamma, step, stepvalue and power are defined
+  // in the solver parameter protocol buffer, and iter is the current iteration.
+  optional string lr_policy = 8;
   optional float gamma = 9; // The parameter to compute the learning rate.
   optional float power = 10; // The parameter to compute the learning rate.
   optional float momentum = 11; // The momentum value.
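The lr_policy comment above documents the decay schedules only as formulas; the small standalone helper below makes them concrete. It mirrors the documented formulas rather than the solver's actual GetLearningRate() code, and omits multistep for brevity.

    // Standalone helper mirroring the lr_policy formulas documented above.
    #include <cmath>
    #include <iostream>
    #include <string>

    double LearningRate(const std::string& policy, double base_lr, double gamma,
                        double power, int stepsize, int iter, int max_iter) {
      if (policy == "fixed")   return base_lr;
      if (policy == "step")    return base_lr * std::pow(gamma, iter / stepsize);
      if (policy == "exp")     return base_lr * std::pow(gamma, iter);
      if (policy == "inv")     return base_lr * std::pow(1.0 + gamma * iter, -power);
      if (policy == "poly")
        return base_lr * std::pow(1.0 - static_cast<double>(iter) / max_iter, power);
      if (policy == "sigmoid")
        return base_lr * (1.0 / (1.0 + std::exp(-gamma * (iter - stepsize))));
      return base_lr;
    }

    int main() {
      // step policy, base_lr = 0.01, gamma = 0.1, stepsize = 10000, iter = 25000:
      // the rate has dropped twice, so this prints 0.0001.
      std::cout << LearningRate("step", 0.01, 0.1, 0.0, 10000, 25000, 100000)
                << std::endl;
      return 0;
    }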
@@ -173,6 +191,11 @@ message SolverParameter {
   // whether to snapshot diff in the results or not. Snapshotting diff will help
   // debugging but the final protocol buffer size will be much larger.
   optional bool snapshot_diff = 16 [default = false];
+  enum SnapshotFormat {
+    HDF5 = 0;
+    BINARYPROTO = 1;
+  }
+  optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
   // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default.
   enum SolverMode {
     CPU = 0;
@@ -191,10 +214,19 @@ message SolverParameter {
     SGD = 0;
     NESTEROV = 1;
     ADAGRAD = 2;
+    RMSPROP = 3;
+    ADADELTA = 4;
+    ADAM = 5;
   }
   optional SolverType solver_type = 30 [default = SGD];
-  // numerical stability for AdaGrad
+  // numerical stability for RMSProp, AdaGrad, AdaDelta and Adam
   optional float delta = 31 [default = 1e-8];
+  // parameters for the Adam solver
+  optional float momentum2 = 39 [default = 0.999];
+
+  // RMSProp decay value
+  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
+  optional float rms_decay = 38;
 
   // If true, print information about the state of the net that may help with
   // debugging learning problems.
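The new delta, momentum2 and rms_decay fields back the added RMSPROP, ADADELTA and ADAM solver types. The sketch below spells out the RMSProp step implied by the MeanSquare comment above; it is a standalone illustration, not the solver implementation, and the sample hyperparameters are arbitrary.

    // Standalone sketch of the RMSProp update implied by the rms_decay comment:
    // a decayed running mean of squared gradients scales the step, with delta
    // as the numerical-stability term.
    #include <cmath>
    #include <iostream>

    int main() {
      double lr = 0.01, rms_decay = 0.98, delta = 1e-8;
      double w = 1.0, mean_square = 0.0;
      for (int t = 0; t < 3; ++t) {
        double grad = 2.0 * w;   // gradient of the toy objective w^2
        mean_square = rms_decay * mean_square + (1.0 - rms_decay) * grad * grad;
        w -= lr * grad / (std::sqrt(mean_square) + delta);
        std::cout << "iter " << t << ": w = " << w << std::endl;
      }
      return 0;
    }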
@@ -269,7 +301,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 137 (last added: reduction_param)
+// LayerParameter next available layer-specific ID: 139 (last added: tile_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -290,7 +322,7 @@ message LayerParameter {
 
   // The blobs containing the numeric parameters of the layer.
   repeated BlobProto blobs = 7;
-  
+
   // Specifies on which bottoms the backpropagation should be skipped.
   // The size must be either 0 or equal to the number of bottoms.
   repeated bool propagate_down = 11;
@@ -325,6 +357,7 @@ message LayerParameter {
   optional DropoutParameter dropout_param = 108;
   optional DummyDataParameter dummy_data_param = 109;
   optional EltwiseParameter eltwise_param = 110;
+  optional EmbedParameter embed_param = 137;
   optional ExpParameter exp_param = 111;
   optional FlattenParameter flatten_param = 135;
   optional HDF5DataParameter hdf5_data_param = 112;
@@ -350,6 +383,7 @@ message LayerParameter {
   optional SliceParameter slice_param = 126;
   optional TanHParameter tanh_param = 127;
   optional ThresholdParameter threshold_param = 128;
+  optional TileParameter tile_param = 138;
   optional WindowDataParameter window_data_param = 129;
 }
 
@@ -431,7 +465,7 @@ message ContrastiveLossParameter {
   // Hadsell paper. New models should probably use this version.
   // legacy_version = true uses (margin - d^2). This is kept to support /
   // reproduce existing models and results
-  optional bool legacy_version = 2 [default = false]; 
+  optional bool legacy_version = 2 [default = false];
 }
 
 message ConvolutionParameter {
@@ -472,6 +506,7 @@ message DataParameter {
   // to avoid all asynchronous sgd clients to start at the same point. The skip
   // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
   // be larger than the number of keys in the database.
+  // DEPRECATED. Each solver accesses a different subset of the database.
   optional uint32 rand_skip = 7 [default = 0];
   optional DB backend = 8 [default = LEVELDB];
   // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
@@ -487,6 +522,9 @@ message DataParameter {
   optional bool mirror = 6 [default = false];
   // Force the encoded image to have 3 color channels
   optional bool force_encoded_color = 9 [default = false];
+  // Prefetch queue (number of batches to prefetch to host memory; increase if
+  // data access bandwidth varies).
+  optional uint32 prefetch = 10 [default = 4];
 }
 
 message DropoutParameter {
@@ -526,6 +564,21 @@ message EltwiseParameter {
   optional bool stable_prod_grad = 3 [default = true];
 }
 
+// Message that stores parameters used by EmbedLayer
+message EmbedParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension input_dim.  Hence input_dim should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerParameter weight_filler = 4; // The filler for the weight
+  optional FillerParameter bias_filler = 5; // The filler for the bias
+
+}
+
+// Message that stores parameters used by ExpLayer
 message ExpParameter {
   // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
   // Or if base is set to the default (-1), base is set to e,
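EmbedParameter above describes a layer whose integer inputs are treated as one-hot indices, so the forward pass reduces to looking up a row of an input_dim x num_output weight table (plus an optional bias). A standalone sketch of that lookup follows; it is not Caffe code and the sizes and values are illustrative.

    // Standalone sketch of the lookup EmbedParameter describes: an integer input
    // selects one row of an input_dim x num_output weight table (bias omitted).
    #include <iostream>
    #include <vector>

    int main() {
      const int input_dim = 4, num_output = 3;
      std::vector<std::vector<float> > weight(input_dim,
                                              std::vector<float>(num_output));
      for (int i = 0; i < input_dim; ++i)
        for (int o = 0; o < num_output; ++o)
          weight[i][o] = static_cast<float>(i * 10 + o);

      int input = 2;  // must be strictly less than input_dim
      const std::vector<float>& output = weight[input];  // the embedding lookup
      for (size_t o = 0; o < output.size(); ++o) std::cout << output[o] << " ";
      std::cout << std::endl;  // prints "20 21 22"
      return 0;
    }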
@@ -579,7 +632,7 @@ message ImageDataParameter {
   // Specify the data source.
   optional string source = 1;
   // Specify the batch size.
-  optional uint32 batch_size = 4;
+  optional uint32 batch_size = 4 [default = 1];
   // The rand_skip variable is for the data layer to skip a few data points
   // to avoid all asynchronous sgd clients to start at the same point. The skip
   // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
@@ -703,6 +756,15 @@ message PowerParameter {
 message PythonParameter {
   optional string module = 1;
   optional string layer = 2;
+  // This value is set to the attribute `param_str` of the `PythonLayer` object
+  // in Python before calling the `setup()` method. This could be a number,
+  // string, dictionary in Python dict format, JSON, etc. You may parse this
+  // string in the `setup` method and use it in `forward` and `backward`.
+  optional string param_str = 3 [default = ''];
+  // Whether this PythonLayer is shared among worker solvers during data
+  // parallelism. If true, each worker solver runs forward from this layer
+  // sequentially. This value should be set to true if you are using it as a
+  // data layer.
+  optional bool share_in_parallel = 4 [default = false];
 }
 
 // Message that stores parameters used by ReductionLayer
@@ -858,6 +920,16 @@ message TanHParameter {
   optional Engine engine = 1 [default = DEFAULT];
 }
 
+// Message that stores parameters used by TileLayer
+message TileParameter {
+  // The index of the axis to tile.
+  optional int32 axis = 1 [default = 1];
+
+  // The number of copies (tiles) of the blob to output.
+  optional int32 tiles = 2;
+}
+
+// Message that stores parameters used by ThresholdLayer
 message ThresholdParameter {
   optional float threshold = 1 [default = 0]; // Strictly positive values
 }
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index aabe0ed..394ec3b 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -4,24 +4,44 @@
 #include <string>
 #include <vector>
 
+#include "hdf5.h"
+#include "hdf5_hl.h"
+
 #include "caffe/net.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
+#include "caffe/util/hdf5.hpp"
 #include "caffe/util/io.hpp"
 #include "caffe/util/math_functions.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
 namespace caffe {
 
+template<typename Dtype>
+void Solver<Dtype>::SetActionFunction(ActionCallback func) {
+  action_request_function_ = func;
+}
+
+template<typename Dtype>
+SolverAction::Enum Solver<Dtype>::GetRequestedAction() {
+  if (action_request_function_) {
+    // If the external request function has been set, call it.
+    return action_request_function_();
+  }
+  return SolverAction::NONE;
+}
+
 template <typename Dtype>
-Solver<Dtype>::Solver(const SolverParameter& param)
-    : net_() {
+Solver<Dtype>::Solver(const SolverParameter& param, const Solver* root_solver)
+    : net_(), callbacks_(), root_solver_(root_solver),
+      requested_early_exit_(false) {
   Init(param);
 }
 
 template <typename Dtype>
-Solver<Dtype>::Solver(const string& param_file)
-    : net_() {
+Solver<Dtype>::Solver(const string& param_file, const Solver* root_solver)
+    : net_(), callbacks_(), root_solver_(root_solver),
+      requested_early_exit_(false) {
   SolverParameter param;
   ReadProtoFromTextFileOrDie(param_file, &param);
   Init(param);
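The action callback added above gives callers, such as a signal handler in the caffe command-line tool, a hook that Step() and Test() poll for SNAPSHOT or STOP requests. A hedged usage sketch; the `stop_requested` flag and `handle_sigint` handler are illustrative, not part of Caffe:

    #include <csignal>

    #include "caffe/solver.hpp"

    // Illustrative flag flipped by a SIGINT handler.
    static volatile std::sig_atomic_t stop_requested = 0;
    static void handle_sigint(int) { stop_requested = 1; }

    // Polled by the solver once per training iteration and per test iteration.
    static caffe::SolverAction::Enum SignalAction() {
      return stop_requested ? caffe::SolverAction::STOP
                            : caffe::SolverAction::NONE;
    }

    // Wiring, assuming `solver` is an already constructed caffe::Solver<float>:
    //   std::signal(SIGINT, handle_sigint);
    //   solver.SetActionFunction(SignalAction);
    //   solver.Solve();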
@@ -29,17 +49,21 @@ Solver<Dtype>::Solver(const string& param_file)
 
 template <typename Dtype>
 void Solver<Dtype>::Init(const SolverParameter& param) {
-  LOG(INFO) << "Initializing solver from parameters: " << std::endl
-            << param.DebugString();
+  CHECK(Caffe::root_solver() || root_solver_)
+      << "root_solver_ needs to be set for all non-root solvers";
+  LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: "
+    << std::endl << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
-  if (param_.random_seed() >= 0) {
+  if (Caffe::root_solver() && param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
   // Scaffolding code
   InitTrainNet();
-  InitTestNets();
-  LOG(INFO) << "Solver scaffolding done.";
+  if (Caffe::root_solver()) {
+    InitTestNets();
+    LOG(INFO) << "Solver scaffolding done.";
+  }
   iter_ = 0;
   current_step_ = 0;
 }
@@ -55,19 +79,22 @@ void Solver<Dtype>::InitTrainNet() {
       << "one of these fields specifying a train_net: " << field_names;
   NetParameter net_param;
   if (param_.has_train_net_param()) {
-    LOG(INFO) << "Creating training net specified in train_net_param.";
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Creating training net specified in train_net_param.";
     net_param.CopyFrom(param_.train_net_param());
   } else if (param_.has_train_net()) {
-    LOG(INFO) << "Creating training net from train_net file: "
-              << param_.train_net();
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Creating training net from train_net file: " << param_.train_net();
     ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
   }
   if (param_.has_net_param()) {
-    LOG(INFO) << "Creating training net specified in net_param.";
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Creating training net specified in net_param.";
     net_param.CopyFrom(param_.net_param());
   }
   if (param_.has_net()) {
-    LOG(INFO) << "Creating training net from net file: " << param_.net();
+    LOG_IF(INFO, Caffe::root_solver())
+        << "Creating training net from net file: " << param_.net();
     ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
   }
   // Set the correct NetState.  We start with the solver defaults (lowest
@@ -79,11 +106,16 @@ void Solver<Dtype>::InitTrainNet() {
   net_state.MergeFrom(net_param.state());
   net_state.MergeFrom(param_.train_state());
   net_param.mutable_state()->CopyFrom(net_state);
-  net_.reset(new Net<Dtype>(net_param));
+  if (Caffe::root_solver()) {
+    net_.reset(new Net<Dtype>(net_param));
+  } else {
+    net_.reset(new Net<Dtype>(net_param, root_solver_->net_.get()));
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
+  CHECK(Caffe::root_solver());
   const bool has_net_param = param_.has_net_param();
   const bool has_net_file = param_.has_net();
   const int num_generic_nets = has_net_param + has_net_file;
@@ -153,7 +185,12 @@ void Solver<Dtype>::InitTestNets() {
     net_params[i].mutable_state()->CopyFrom(net_state);
     LOG(INFO)
         << "Creating test net (#" << i << ") specified by " << sources[i];
-    test_nets_[i].reset(new Net<Dtype>(net_params[i]));
+    if (Caffe::root_solver()) {
+      test_nets_[i].reset(new Net<Dtype>(net_params[i]));
+    } else {
+      test_nets_[i].reset(new Net<Dtype>(net_params[i],
+          root_solver_->test_nets_[i].get()));
+    }
     test_nets_[i]->set_debug_info(param_.debug_info());
   }
 }
@@ -169,29 +206,20 @@ void Solver<Dtype>::Step(int iters) {
 
   while (iter_ < stop_iter) {
     // zero-init the params
-    for (int i = 0; i < net_->params().size(); ++i) {
-      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
-      switch (Caffe::mode()) {
-      case Caffe::CPU:
-        caffe_set(blob->count(), static_cast<Dtype>(0),
-            blob->mutable_cpu_diff());
-        break;
-      case Caffe::GPU:
-#ifndef CPU_ONLY
-        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
-            blob->mutable_gpu_diff());
-#else
-        NO_GPU;
-#endif
+    net_->ClearParamDiffs();
+    if (param_.test_interval() && iter_ % param_.test_interval() == 0
+        && (iter_ > 0 || param_.test_initialization())
+        && Caffe::root_solver()) {
+      TestAll();
+      if (requested_early_exit_) {
+        // Break out of the while loop because stop was requested while testing.
         break;
       }
     }
 
-    if (param_.test_interval() && iter_ % param_.test_interval() == 0
-        && (iter_ > 0 || param_.test_initialization())) {
-      TestAll();
+    for (int i = 0; i < callbacks_.size(); ++i) {
+      callbacks_[i]->on_start();
     }
-
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
     // accumulate the loss and gradient
@@ -211,7 +239,8 @@ void Solver<Dtype>::Step(int iters) {
       losses[idx] = loss;
     }
     if (display) {
-      LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
+      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
+          << ", loss = " << smoothed_loss;
       const vector<Blob<Dtype>*>& result = net_->output_blobs();
       int score_index = 0;
       for (int j = 0; j < result.size(); ++j) {
@@ -226,30 +255,47 @@ void Solver<Dtype>::Step(int iters) {
             loss_msg_stream << " (* " << loss_weight
                             << " = " << loss_weight * result_vec[k] << " loss)";
           }
-          LOG(INFO) << "    Train net output #"
+          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
               << score_index++ << ": " << output_name << " = "
               << result_vec[k] << loss_msg_stream.str();
         }
       }
     }
+    for (int i = 0; i < callbacks_.size(); ++i) {
+      callbacks_[i]->on_gradients_ready();
+    }
     ApplyUpdate();
 
     // Increment the internal iter_ counter -- its value should always indicate
     // the number of times the weights have been updated.
     ++iter_;
 
+    SolverAction::Enum request = GetRequestedAction();
+
     // Save a snapshot if needed.
-    if (param_.snapshot() && iter_ % param_.snapshot() == 0) {
+    if ((param_.snapshot()
+         && iter_ % param_.snapshot() == 0
+         && Caffe::root_solver()) ||
+         (request == SolverAction::SNAPSHOT)) {
       Snapshot();
     }
+    if (SolverAction::STOP == request) {
+      requested_early_exit_ = true;
+      // Break out of training loop.
+      break;
+    }
   }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Solve(const char* resume_file) {
+  CHECK(Caffe::root_solver());
   LOG(INFO) << "Solving " << net_->name();
   LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
 
+  // Initialize to false every time we start solving.
+  requested_early_exit_ = false;
+
   if (resume_file) {
     LOG(INFO) << "Restoring previous solver status from " << resume_file;
     Restore(resume_file);
@@ -264,6 +310,10 @@ void Solver<Dtype>::Solve(const char* resume_file) {
       && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
     Snapshot();
   }
+  if (requested_early_exit_) {
+    LOG(INFO) << "Optimization stopped early.";
+    return;
+  }
   // After the optimization is done, run an additional train and test pass to
   // display the train and test loss/outputs if appropriate (based on the
   // display and test_interval settings, respectively).  Unlike in the rest of
@@ -281,16 +331,18 @@ void Solver<Dtype>::Solve(const char* resume_file) {
   LOG(INFO) << "Optimization Done.";
 }
 
-
 template <typename Dtype>
 void Solver<Dtype>::TestAll() {
-  for (int test_net_id = 0; test_net_id < test_nets_.size(); ++test_net_id) {
+  for (int test_net_id = 0;
+       test_net_id < test_nets_.size() && !requested_early_exit_;
+       ++test_net_id) {
     Test(test_net_id);
   }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::Test(const int test_net_id) {
+  CHECK(Caffe::root_solver());
   LOG(INFO) << "Iteration " << iter_
             << ", Testing net (#" << test_net_id << ")";
   CHECK_NOTNULL(test_nets_[test_net_id].get())->
@@ -301,6 +353,21 @@ void Solver<Dtype>::Test(const int test_net_id) {
   const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
   Dtype loss = 0;
   for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
+    SolverAction::Enum request = GetRequestedAction();
+    // Check whether a snapshot or stop of testing/training has been requested.
+    while (request != SolverAction::NONE) {
+      if (SolverAction::SNAPSHOT == request) {
+        Snapshot();
+      } else if (SolverAction::STOP == request) {
+        requested_early_exit_ = true;
+      }
+      request = GetRequestedAction();
+    }
+    if (requested_early_exit_) {
+      // break out of test loop.
+      break;
+    }
+
     Dtype iter_loss;
     const vector<Blob<Dtype>*>& result =
         test_net->Forward(bottom_vec, &iter_loss);
@@ -325,6 +392,10 @@ void Solver<Dtype>::Test(const int test_net_id) {
       }
     }
   }
+  if (requested_early_exit_) {
+    LOG(INFO) << "Test interrupted.";
+    return;
+  }
   if (param_.test_compute_loss()) {
     loss /= param_.test_iter(test_net_id);
     LOG(INFO) << "Test loss: " << loss;
@@ -341,49 +412,66 @@ void Solver<Dtype>::Test(const int test_net_id) {
                       << " = " << loss_weight * mean_score << " loss)";
     }
     LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
-        << mean_score << loss_msg_stream.str();
+              << mean_score << loss_msg_stream.str();
   }
 }
 
-
 template <typename Dtype>
 void Solver<Dtype>::Snapshot() {
-  NetParameter net_param;
-  // For intermediate results, we will also dump the gradient values.
-  net_->ToProto(&net_param, param_.snapshot_diff());
+  CHECK(Caffe::root_solver());
+  string model_filename;
+  switch (param_.snapshot_format()) {
+    case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
+      model_filename = SnapshotToBinaryProto();
+      break;
+    case caffe::SolverParameter_SnapshotFormat_HDF5:
+      model_filename = SnapshotToHDF5();
+      break;
+    default:
+      LOG(FATAL) << "Unsupported snapshot format.";
+  }
+
+  SnapshotSolverState(model_filename);
+}
+
+template <typename Dtype>
+string Solver<Dtype>::SnapshotFilename(const string extension) {
   string filename(param_.snapshot_prefix());
-  string model_filename, snapshot_filename;
   const int kBufferSize = 20;
   char iter_str_buffer[kBufferSize];
   snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_);
-  filename += iter_str_buffer;
-  model_filename = filename + ".caffemodel";
-  LOG(INFO) << "Snapshotting to " << model_filename;
-  WriteProtoToBinaryFile(net_param, model_filename.c_str());
-  SolverState state;
-  SnapshotSolverState(&state);
-  state.set_iter(iter_);
-  state.set_learned_net(model_filename);
-  state.set_current_step(current_step_);
-  snapshot_filename = filename + ".solverstate";
-  LOG(INFO) << "Snapshotting solver state to " << snapshot_filename;
-  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
+  return filename + iter_str_buffer + extension;
 }
 
 template <typename Dtype>
-void Solver<Dtype>::Restore(const char* state_file) {
-  SolverState state;
+string Solver<Dtype>::SnapshotToBinaryProto() {
+  string model_filename = SnapshotFilename(".caffemodel");
+  LOG(INFO) << "Snapshotting to binary proto file " << model_filename;
   NetParameter net_param;
-  ReadProtoFromBinaryFile(state_file, &state);
-  if (state.has_learned_net()) {
-    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
-    net_->CopyTrainedLayersFrom(net_param);
-  }
-  iter_ = state.iter();
-  current_step_ = state.current_step();
-  RestoreSolverState(state);
+  net_->ToProto(&net_param, param_.snapshot_diff());
+  WriteProtoToBinaryFile(net_param, model_filename);
+  return model_filename;
 }
 
+template <typename Dtype>
+string Solver<Dtype>::SnapshotToHDF5() {
+  string model_filename = SnapshotFilename(".caffemodel.h5");
+  LOG(INFO) << "Snapshotting to HDF5 file " << model_filename;
+  net_->ToHDF5(model_filename, param_.snapshot_diff());
+  return model_filename;
+}
+
+template <typename Dtype>
+void Solver<Dtype>::Restore(const char* state_file) {
+  CHECK(Caffe::root_solver());
+  string state_filename(state_file);
+  if (state_filename.size() >= 3 &&
+      state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) {
+    RestoreSolverStateFromHDF5(state_filename);
+  } else {
+    RestoreSolverStateFromBinaryProto(state_filename);
+  }
+}
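In summary, Snapshot() now writes either a .caffemodel (binary proto) or a .caffemodel.h5 (HDF5) model file depending on snapshot_format, records the solver state alongside it, and Restore() picks the loader from the state file's .h5 suffix. A hedged usage sketch (the paths are placeholders):

    #include "caffe/proto/caffe.pb.h"
    #include "caffe/solver.hpp"

    // Sketch: request HDF5 snapshots, then resume; Restore() dispatches on the
    // file extension of the solver state it is given.
    void ConfigureAndResume(caffe::SolverParameter* param,
                            caffe::Solver<float>* solver) {
      param->set_snapshot_format(caffe::SolverParameter_SnapshotFormat_HDF5);
      // ... build `solver` from `param`, train, let it snapshot ...
      solver->Restore("snap/_iter_1000.solverstate.h5");   // HDF5 branch
      // solver->Restore("snap/_iter_1000.solverstate");   // binary proto branch
    }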
 
 // Return the current learning rate. The currently implemented learning rate
 // policies are as follows:
@@ -442,7 +530,7 @@ Dtype SGDSolver<Dtype>::GetLearningRate() {
 template <typename Dtype>
 void SGDSolver<Dtype>::PreSolve() {
   // Initialize the history
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   history_.clear();
   update_.clear();
   temp_.clear();
@@ -458,12 +546,10 @@ template <typename Dtype>
 void SGDSolver<Dtype>::ClipGradients() {
   const Dtype clip_gradients = this->param_.clip_gradients();
   if (clip_gradients < 0) { return; }
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   Dtype sumsq_diff = 0;
   for (int i = 0; i < net_params.size(); ++i) {
-    if (this->net_->param_owners()[i] < 0) {
-      sumsq_diff += net_params[i]->sumsq_diff();
-    }
+    sumsq_diff += net_params[i]->sumsq_diff();
   }
   const Dtype l2norm_diff = std::sqrt(sumsq_diff);
   if (l2norm_diff > clip_gradients) {
@@ -472,21 +558,21 @@ void SGDSolver<Dtype>::ClipGradients() {
         << l2norm_diff << " > " << clip_gradients << ") "
         << "by scale factor " << scale_factor;
     for (int i = 0; i < net_params.size(); ++i) {
-      if (this->net_->param_owners()[i] < 0) {
-        net_params[i]->scale_diff(scale_factor);
-      }
+      net_params[i]->scale_diff(scale_factor);
     }
   }
 }
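ClipGradients() above computes the global L2 norm over all learnable parameter gradients (ownership filtering is now handled by learnable_params() itself) and, when that norm exceeds clip_gradients, rescales every gradient so the clipped norm equals the threshold. The same math in a standalone sketch:

    #include <cmath>
    #include <vector>

    // Global-norm gradient clipping: if ||g||_2 > clip, scale all gradients by
    // clip / ||g||_2; a negative clip disables clipping, as in the code above.
    void ClipByGlobalNorm(std::vector<std::vector<float> >* diffs, float clip) {
      if (clip < 0) return;
      double sumsq = 0;
      for (size_t i = 0; i < diffs->size(); ++i)
        for (size_t j = 0; j < (*diffs)[i].size(); ++j)
          sumsq += (*diffs)[i][j] * (*diffs)[i][j];
      const double l2norm = std::sqrt(sumsq);
      if (l2norm <= clip) return;
      const double scale = clip / l2norm;
      for (size_t i = 0; i < diffs->size(); ++i)
        for (size_t j = 0; j < (*diffs)[i].size(); ++j)
          (*diffs)[i][j] *= scale;
    }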
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ApplyUpdate() {
+  CHECK(Caffe::root_solver());
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
     LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
   }
   ClipGradients();
-  for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
+  for (int param_id = 0; param_id < this->net_->learnable_params().size();
+       ++param_id) {
     Normalize(param_id);
     Regularize(param_id);
     ComputeUpdateValue(param_id, rate);
@@ -498,7 +584,7 @@ template <typename Dtype>
 void SGDSolver<Dtype>::Normalize(int param_id) {
   if (this->param_.iter_size() == 1) { return; }
   // Scale gradient to counterbalance accumulation.
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
   switch (Caffe::mode()) {
   case Caffe::CPU: {
@@ -522,7 +608,7 @@ void SGDSolver<Dtype>::Normalize(int param_id) {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
   Dtype weight_decay = this->param_.weight_decay();
@@ -584,7 +670,7 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
 
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype momentum = this->param_.momentum();
   Dtype local_rate = rate * net_params_lr[param_id];
@@ -618,17 +704,76 @@ void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 }
 
 template <typename Dtype>
-void SGDSolver<Dtype>::SnapshotSolverState(SolverState* state) {
-  state->clear_history();
+void SGDSolver<Dtype>::SnapshotSolverState(const string& model_filename) {
+  switch (this->param_.snapshot_format()) {
+    case caffe::SolverParameter_SnapshotFormat_BINARYPROTO:
+      SnapshotSolverStateToBinaryProto(model_filename);
+      break;
+    case caffe::SolverParameter_SnapshotFormat_HDF5:
+      SnapshotSolverStateToHDF5(model_filename);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported snapshot format.";
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
+    const string& model_filename) {
+  SolverState state;
+  state.set_iter(this->iter_);
+  state.set_learned_net(model_filename);
+  state.set_current_step(this->current_step_);
+  state.clear_history();
   for (int i = 0; i < history_.size(); ++i) {
     // Add history
-    BlobProto* history_blob = state->add_history();
+    BlobProto* history_blob = state.add_history();
     history_[i]->ToProto(history_blob);
   }
+  string snapshot_filename = Solver<Dtype>::SnapshotFilename(".solverstate");
+  LOG(INFO)
+    << "Snapshotting solver state to binary proto file" << snapshot_filename;
+  WriteProtoToBinaryFile(state, snapshot_filename.c_str());
 }
 
 template <typename Dtype>
-void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
+void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
+    const string& model_filename) {
+  string snapshot_filename =
+      Solver<Dtype>::SnapshotFilename(".solverstate.h5");
+  LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename;
+  hid_t file_hid = H5Fcreate(snapshot_filename.c_str(), H5F_ACC_TRUNC,
+      H5P_DEFAULT, H5P_DEFAULT);
+  CHECK_GE(file_hid, 0)
+      << "Couldn't open " << snapshot_filename << " to save solver state.";
+  hdf5_save_int(file_hid, "iter", this->iter_);
+  hdf5_save_string(file_hid, "learned_net", model_filename);
+  hdf5_save_int(file_hid, "current_step", this->current_step_);
+  hid_t history_hid = H5Gcreate2(file_hid, "history", H5P_DEFAULT, H5P_DEFAULT,
+      H5P_DEFAULT);
+  CHECK_GE(history_hid, 0)
+      << "Error saving solver state to " << snapshot_filename << ".";
+  for (int i = 0; i < history_.size(); ++i) {
+    ostringstream oss;
+    oss << i;
+    hdf5_save_nd_dataset<Dtype>(history_hid, oss.str(), *history_[i]);
+  }
+  H5Gclose(history_hid);
+  H5Fclose(file_hid);
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto(
+    const string& state_file) {
+  SolverState state;
+  ReadProtoFromBinaryFile(state_file, &state);
+  this->iter_ = state.iter();
+  if (state.has_learned_net()) {
+    NetParameter net_param;
+    ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param);
+    this->net_->CopyTrainedLayersFrom(net_param);
+  }
+  this->current_step_ = state.current_step();
   CHECK_EQ(state.history_size(), history_.size())
       << "Incorrect length of history blobs.";
   LOG(INFO) << "SGDSolver: restoring history";
@@ -638,8 +783,34 @@ void SGDSolver<Dtype>::RestoreSolverState(const SolverState& state) {
 }
 
 template <typename Dtype>
+void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
+  hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
+  CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file;
+  this->iter_ = hdf5_load_int(file_hid, "iter");
+  if (H5LTfind_dataset(file_hid, "learned_net")) {
+    string learned_net = hdf5_load_string(file_hid, "learned_net");
+    this->net_->CopyTrainedLayersFrom(learned_net);
+  }
+  this->current_step_ = hdf5_load_int(file_hid, "current_step");
+  hid_t history_hid = H5Gopen2(file_hid, "history", H5P_DEFAULT);
+  CHECK_GE(history_hid, 0) << "Error reading history from " << state_file;
+  int state_history_size = hdf5_get_num_links(history_hid);
+  CHECK_EQ(state_history_size, history_.size())
+      << "Incorrect length of history blobs.";
+  for (int i = 0; i < history_.size(); ++i) {
+    ostringstream oss;
+    oss << i;
+    hdf5_load_nd_dataset<Dtype>(history_hid, oss.str().c_str(), 0,
+                                kMaxBlobAxes, history_[i].get());
+  }
+  H5Gclose(history_hid);
+  H5Fclose(file_hid);
+}
+
+template <typename Dtype>
 void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  CHECK(Caffe::root_solver());
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype momentum = this->param_.momentum();
   Dtype local_rate = rate * net_params_lr[param_id];
@@ -699,7 +870,8 @@ void NesterovSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 
 template <typename Dtype>
 void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  CHECK(Caffe::root_solver());
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype delta = this->param_.delta();
   Dtype local_rate = rate * net_params_lr[param_id];
@@ -775,9 +947,336 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
   }
 }
 
+template <typename Dtype>
+void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+
+  // get the learning rate
+  Dtype delta = this->param_.delta();
+  Dtype rms_decay = this->param_.rms_decay();
+  Dtype local_rate = rate * net_params_lr[param_id];
+
+  switch (Caffe::mode()) {
+  case Caffe::CPU:
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history
+    caffe_cpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->cpu_data(),
+        rms_decay, this->history_[param_id]->mutable_cpu_data());
+
+    // prepare update
+    caffe_powx(net_params[param_id]->count(),
+        this->history_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_cpu_data());
+
+    caffe_div(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // scale and copy
+    caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->cpu_data(), Dtype(0),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  case Caffe::GPU:
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history
+    caffe_gpu_axpby(net_params[param_id]->count(),
+        Dtype(1-rms_decay), this->update_[param_id]->gpu_data(),
+        rms_decay, this->history_[param_id]->mutable_gpu_data());
+
+    // prepare update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->history_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add_scalar(net_params[param_id]->count(),
+        delta, this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_div(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_axpby(net_params[param_id]->count(), local_rate,
+        this->update_[param_id]->gpu_data(), Dtype(0),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
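Element-wise, the RMSProp branch above keeps a decaying mean of squared gradients and divides the gradient by its square root plus delta. A scalar sketch of one element's update, equivalent to the CPU path:

    #include <cmath>

    // h   <- rms_decay * h + (1 - rms_decay) * g^2
    // out <- lr * g / (sqrt(h) + delta)
    float RMSPropStep(float g, float* h, float lr, float rms_decay, float delta) {
      *h = rms_decay * (*h) + (1.0f - rms_decay) * g * g;
      return lr * g / (std::sqrt(*h) + delta);
    }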
+
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
+  // Add the extra history entries for AdaDelta after those from
+  // SGDSolver::PreSolve
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    this->history_.push_back(
+        shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype delta = this->param_.delta();
+  Dtype momentum = this->param_.momentum();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  size_t update_history_offset = net_params.size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of gradients
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[update_history_offset + param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->temp_[param_id]->mutable_cpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_div(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->temp_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // jointly compute the RMS of both for update and gradient history
+    caffe_powx(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // compute the update
+    caffe_mul(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+
+    // compute square of update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of updates
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_cpu_data());
+
+    // apply learning rate
+    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of gradients
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_gpu_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[update_history_offset + param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->temp_[param_id]->mutable_gpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_gpu_div(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->temp_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // jointly compute the RMS of both for update and gradient history
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // compute the update and copy to net_diff
+    caffe_gpu_mul(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+
+    // compute square of update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of updates
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_gpu_data());
+
+    // apply learning rate
+    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
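Element-wise, the AdaDelta branch above uses momentum as the decay rate rho and keeps two accumulators, squared gradients (h_g) and squared updates (h_u). A scalar sketch equivalent to the CPU path:

    #include <cmath>

    // h_g  <- rho * h_g + (1 - rho) * g^2
    // step <- g * sqrt((h_u + delta) / (h_g + delta))
    // h_u  <- rho * h_u + (1 - rho) * step^2
    // out  <- lr * step
    float AdaDeltaStep(float g, float* h_g, float* h_u,
                       float lr, float rho, float delta) {
      *h_g = rho * (*h_g) + (1.0f - rho) * g * g;
      const float step = g * std::sqrt((*h_u + delta) / (*h_g + delta));
      *h_u = rho * (*h_u) + (1.0f - rho) * step * step;
      return lr * step;
    }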
+
+template <typename Dtype>
+void AdamSolver<Dtype>::AdamPreSolve() {
+  // Add the extra history entries for Adam after those from
+  // SGDSolver::PreSolve
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  for (int i = 0; i < net_params.size(); ++i) {
+    const vector<int>& shape = net_params[i]->shape();
+    this->history_.push_back(
+            shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
+  }
+}
+
+template <typename Dtype>
+void AdamSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
+  Dtype local_rate = rate * net_params_lr[param_id];
+  const Dtype beta1 = this->param_.momentum();
+  const Dtype beta2 = this->param_.momentum2();
+
+  // we create aliases for convenience
+  size_t update_history_offset = net_params.size();
+  Blob<Dtype>* val_m = this->history_[param_id].get();
+  Blob<Dtype>* val_v = this->history_[param_id + update_history_offset].get();
+  Blob<Dtype>* val_t = this->temp_[param_id].get();
+
+  const int t = this->iter_ + 1;
+  const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) /
+      (Dtype(1.) - pow(beta1, t));
+  const int N = net_params[param_id]->count();
+  const Dtype eps_hat = this->param_.delta();
+
+  switch (Caffe::mode()) {
+    case Caffe::CPU: {
+    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
+    caffe_cpu_axpby(N, Dtype(1)-beta1,
+        net_params[param_id]->cpu_diff(), beta1,
+        val_m->mutable_cpu_data());
+
+    // update v <- \beta_2 v_{t-1} + (1-\beta_2)g_t^2
+    caffe_mul(N,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->cpu_diff(),
+        val_t->mutable_cpu_data());
+    caffe_cpu_axpby(N, Dtype(1)-beta2,
+        val_t->cpu_data(), beta2,
+        val_v->mutable_cpu_data());
+
+    // set update
+    caffe_powx(N,
+        val_v->cpu_data(), Dtype(0.5),
+        val_t->mutable_cpu_data());
+    caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data());
+    caffe_div(N,
+        val_m->cpu_data(),
+        val_t->cpu_data(),
+        val_t->mutable_cpu_data());
+
+    caffe_cpu_scale(N, local_rate*correction,
+        val_t->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t
+    caffe_gpu_axpby(N, Dtype(1)-beta1,
+        net_params[param_id]->gpu_diff(), beta1,
+        val_m->mutable_gpu_data());
+
+    // update v <- \beta_2 v_{t-1} + (1-\beta_2)g_t^2
+    caffe_gpu_mul(N,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->gpu_diff(),
+        val_t->mutable_gpu_data());
+    caffe_gpu_axpby(N, Dtype(1)-beta2,
+        val_t->gpu_data(), beta2,
+        val_v->mutable_gpu_data());
+
+    // set update
+    caffe_gpu_powx(N,
+        val_v->gpu_data(), Dtype(0.5),
+        val_t->mutable_gpu_data());
+    caffe_gpu_add_scalar(N, eps_hat,
+        val_t->mutable_gpu_data());
+    caffe_gpu_div(N,
+        val_m->gpu_data(),
+        val_t->gpu_data(),
+        val_t->mutable_gpu_data());
+
+    caffe_gpu_scale(N, local_rate*correction,
+        val_t->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
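Element-wise, the Adam branch above maintains the first- and second-moment estimates m and v and folds the bias correction into the rate via `correction`. A scalar sketch equivalent to the CPU path (t = iter + 1):

    #include <cmath>

    // m   <- beta1 * m + (1 - beta1) * g
    // v   <- beta2 * v + (1 - beta2) * g^2
    // out <- lr * sqrt(1 - beta2^t) / (1 - beta1^t) * m / (sqrt(v) + eps_hat)
    float AdamStep(float g, float* m, float* v, int t,
                   float lr, float beta1, float beta2, float eps_hat) {
      *m = beta1 * (*m) + (1.0f - beta1) * g;
      *v = beta2 * (*v) + (1.0f - beta2) * g * g;
      const float correction = std::sqrt(1.0f - std::pow(beta2, t)) /
                               (1.0f - std::pow(beta1, t));
      return lr * correction * (*m) / (std::sqrt(*v) + eps_hat);
    }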
+
 INSTANTIATE_CLASS(Solver);
 INSTANTIATE_CLASS(SGDSolver);
 INSTANTIATE_CLASS(NesterovSolver);
 INSTANTIATE_CLASS(AdaGradSolver);
+INSTANTIATE_CLASS(RMSPropSolver);
+INSTANTIATE_CLASS(AdaDeltaSolver);
+INSTANTIATE_CLASS(AdamSolver);
 
 }  // namespace caffe
diff --git a/src/caffe/syncedmem.cpp b/src/caffe/syncedmem.cpp
index 7617ccf..a667a86 100644
--- a/src/caffe/syncedmem.cpp
+++ b/src/caffe/syncedmem.cpp
@@ -12,8 +12,14 @@ SyncedMemory::~SyncedMemory() {
   }
 
 #ifndef CPU_ONLY
-  if (gpu_ptr_) {
+  if (gpu_ptr_ && own_gpu_data_) {
+    int initial_device;
+    cudaGetDevice(&initial_device);
+    if (gpu_device_ != -1) {
+      CUDA_CHECK(cudaSetDevice(gpu_device_));
+    }
     CUDA_CHECK(cudaFree(gpu_ptr_));
+    cudaSetDevice(initial_device);
   }
 #endif  // CPU_ONLY
 }
@@ -48,13 +54,17 @@ inline void SyncedMemory::to_gpu() {
 #ifndef CPU_ONLY
   switch (head_) {
   case UNINITIALIZED:
+    CUDA_CHECK(cudaGetDevice(&gpu_device_));
     CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
     caffe_gpu_memset(size_, 0, gpu_ptr_);
     head_ = HEAD_AT_GPU;
+    own_gpu_data_ = true;
     break;
   case HEAD_AT_CPU:
     if (gpu_ptr_ == NULL) {
+      CUDA_CHECK(cudaGetDevice(&gpu_device_));
       CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+      own_gpu_data_ = true;
     }
     caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
     head_ = SYNCED;
@@ -92,6 +102,26 @@ const void* SyncedMemory::gpu_data() {
 #endif
 }
 
+void SyncedMemory::set_gpu_data(void* data) {
+#ifndef CPU_ONLY
+  CHECK(data);
+  if (own_gpu_data_) {
+    int initial_device;
+    cudaGetDevice(&initial_device);
+    if (gpu_device_ != -1) {
+      CUDA_CHECK(cudaSetDevice(gpu_device_));
+    }
+    CUDA_CHECK(cudaFree(gpu_ptr_));
+    cudaSetDevice(initial_device);
+  }
+  gpu_ptr_ = data;
+  head_ = HEAD_AT_GPU;
+  own_gpu_data_ = false;
+#else
+  NO_GPU;
+#endif
+}
+
 void* SyncedMemory::mutable_cpu_data() {
   to_cpu();
   head_ = HEAD_AT_CPU;
@@ -108,6 +138,20 @@ void* SyncedMemory::mutable_gpu_data() {
 #endif
 }
 
+#ifndef CPU_ONLY
+void SyncedMemory::async_gpu_push(const cudaStream_t& stream) {
+  CHECK(head_ == HEAD_AT_CPU);
+  if (gpu_ptr_ == NULL) {
+    CUDA_CHECK(cudaGetDevice(&gpu_device_));
+    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
+    own_gpu_data_ = true;
+  }
+  const cudaMemcpyKind put = cudaMemcpyHostToDevice;
+  CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
+  // Assume caller will synchronize on the stream before use
+  head_ = SYNCED;
+}
+#endif
 
 }  // namespace caffe
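set_gpu_data() above lets the multi-GPU code point a SyncedMemory at externally owned device memory (own_gpu_data_ stays false, so this object never frees it), and async_gpu_push() stages the host-to-device copy on a caller-supplied stream. A hedged usage sketch; as the comment above notes, the caller must synchronize the stream before reading the device data:

    #ifndef CPU_ONLY
    #include <cuda_runtime.h>

    #include "caffe/syncedmem.hpp"

    // Fill host memory, push it to the GPU asynchronously, then wait on the
    // stream so that gpu_data() is safe to read afterwards.
    void PushToGpuExample(caffe::SyncedMemory* mem, cudaStream_t stream) {
      float* host = static_cast<float*>(mem->mutable_cpu_data());
      host[0] = 1.0f;  // ... fill the rest of the buffer ...
      mem->async_gpu_push(stream);                 // enqueues cudaMemcpyAsync H2D
      CUDA_CHECK(cudaStreamSynchronize(stream));   // caller-side synchronization
    }
    #endif  // CPU_ONLY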
 
diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp
index c14b67c..94e529b 100644
--- a/src/caffe/test/test_accuracy_layer.cpp
+++ b/src/caffe/test/test_accuracy_layer.cpp
@@ -22,6 +22,7 @@ class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
       : blob_bottom_data_(new Blob<Dtype>()),
         blob_bottom_label_(new Blob<Dtype>()),
         blob_top_(new Blob<Dtype>()),
+        blob_top_per_class_(new Blob<Dtype>()),
         top_k_(3) {
     vector<int> shape(2);
     shape[0] = 100;
@@ -34,6 +35,8 @@ class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
     blob_bottom_vec_.push_back(blob_bottom_data_);
     blob_bottom_vec_.push_back(blob_bottom_label_);
     blob_top_vec_.push_back(blob_top_);
+    blob_top_per_class_vec_.push_back(blob_top_);
+    blob_top_per_class_vec_.push_back(blob_top_per_class_);
   }
 
   virtual void FillBottoms() {
@@ -56,12 +59,15 @@ class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
     delete blob_bottom_data_;
     delete blob_bottom_label_;
     delete blob_top_;
+    delete blob_top_per_class_;
   }
   Blob<Dtype>* const blob_bottom_data_;
   Blob<Dtype>* const blob_bottom_label_;
   Blob<Dtype>* const blob_top_;
+  Blob<Dtype>* const blob_top_per_class_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
+  vector<Blob<Dtype>*> blob_top_per_class_vec_;
   int top_k_;
 };
 
@@ -90,6 +96,20 @@ TYPED_TEST(AccuracyLayerTest, TestSetupTopK) {
   EXPECT_EQ(this->blob_top_->width(), 1);
 }
 
+TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) {
+  LayerParameter layer_param;
+  AccuracyLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 1);
+  EXPECT_EQ(this->blob_top_->channels(), 1);
+  EXPECT_EQ(this->blob_top_->height(), 1);
+  EXPECT_EQ(this->blob_top_->width(), 1);
+  EXPECT_EQ(this->blob_top_per_class_->num(), 10);
+  EXPECT_EQ(this->blob_top_per_class_->channels(), 1);
+  EXPECT_EQ(this->blob_top_per_class_->height(), 1);
+  EXPECT_EQ(this->blob_top_per_class_->width(), 1);
+}
+
 TYPED_TEST(AccuracyLayerTest, TestForwardCPU) {
   LayerParameter layer_param;
   AccuracyLayer<TypeParam> layer(layer_param);
@@ -228,4 +248,91 @@ TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) {
               num_correct_labels / 100.0, 1e-4);
 }
 
+TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::CPU);
+  AccuracyLayer<TypeParam> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+
+  TypeParam max_value;
+  int max_id;
+  int num_correct_labels = 0;
+  const int num_class = this->blob_top_per_class_->num();
+  vector<int> correct_per_class(num_class, 0);
+  vector<int> num_per_class(num_class, 0);
+  for (int i = 0; i < 100; ++i) {
+    max_value = -FLT_MAX;
+    max_id = 0;
+    for (int j = 0; j < 10; ++j) {
+      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+        max_id = j;
+      }
+    }
+    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+      ++num_correct_labels;
+      ++correct_per_class[max_id];
+    }
+  }
+  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+              num_correct_labels / 100.0, 1e-4);
+  for (int i = 0; i < num_class; ++i) {
+    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+                static_cast<float>(correct_per_class[i]) / num_per_class[i],
+                1e-4);
+  }
+}
+
+
+TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) {
+  LayerParameter layer_param;
+  Caffe::set_mode(Caffe::CPU);
+  const TypeParam kIgnoreLabelValue = -1;
+  layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
+  AccuracyLayer<TypeParam> layer(layer_param);
+  // Manually set some labels to the ignore label value (-1).
+  this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue;
+  this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue;
+  this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue;
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+
+  TypeParam max_value;
+  int max_id;
+  int num_correct_labels = 0;
+  const int num_class = this->blob_top_per_class_->num();
+  vector<int> correct_per_class(num_class, 0);
+  vector<int> num_per_class(num_class, 0);
+  int count = 0;
+  for (int i = 0; i < 100; ++i) {
+    if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+      continue;
+    }
+    ++count;
+    max_value = -FLT_MAX;
+    max_id = 0;
+    for (int j = 0; j < 10; ++j) {
+      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+        max_id = j;
+      }
+    }
+    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+      ++num_correct_labels;
+      ++correct_per_class[max_id];
+    }
+  }
+  EXPECT_EQ(count, 97);
+  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+              num_correct_labels / TypeParam(count), 1e-4);
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+                TypeParam(correct_per_class[i]) / num_per_class[i],
+                1e-4);
+  }
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_concat_layer.cpp b/src/caffe/test/test_concat_layer.cpp
index 662a50f..088e0a4 100644
--- a/src/caffe/test/test_concat_layer.cpp
+++ b/src/caffe/test/test_concat_layer.cpp
@@ -173,4 +173,13 @@ TYPED_TEST(ConcatLayerTest, TestGradientChannels) {
     this->blob_top_vec_);
 }
 
+TYPED_TEST(ConcatLayerTest, TestGradientChannelsBottomOneOnly) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConcatLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradient(&layer, this->blob_bottom_vec_0_,
+    this->blob_top_vec_, 1);
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_data/generate_sample_data.py b/src/caffe/test/test_data/generate_sample_data.py
index ab55726..3703b41 100644
--- a/src/caffe/test/test_data/generate_sample_data.py
+++ b/src/caffe/test/test_data/generate_sample_data.py
@@ -1,5 +1,5 @@
 """
-Generate data used in the HDF5DataLayer test.
+Generate data used in the HDF5DataLayer and GradientBasedSolver tests.
 """
 import os
 import numpy as np
@@ -7,6 +7,8 @@ import h5py
 
 script_dir = os.path.dirname(os.path.abspath(__file__))
 
+# Generate HDF5DataLayer sample_data.h5
+
 num_cols = 8
 num_rows = 10
 height = 6
@@ -51,3 +53,27 @@ with h5py.File(script_dir + '/sample_data_2_gzip.h5', 'w') as f:
 with open(script_dir + '/sample_data_list.txt', 'w') as f:
     f.write(script_dir + '/sample_data.h5\n')
     f.write(script_dir + '/sample_data_2_gzip.h5\n')
+
+# Generate GradientBasedSolver solver_data.h5
+
+num_cols = 3
+num_rows = 8
+height = 10
+width = 10
+
+data = np.random.randn(num_rows, num_cols, height, width)
+data = data.reshape(num_rows, num_cols, height, width)
+data = data.astype('float32')
+
+targets = np.random.randn(num_rows, 1)
+targets = targets.astype('float32')
+
+print data
+print targets
+
+with h5py.File(script_dir + '/solver_data.h5', 'w') as f:
+    f['data'] = data
+    f['targets'] = targets
+
+with open(script_dir + '/solver_data_list.txt', 'w') as f:
+    f.write(script_dir + '/solver_data.h5\n')
diff --git a/src/caffe/test/test_data/solver_data.h5 b/src/caffe/test/test_data/solver_data.h5
new file mode 100644
index 0000000..7ee05ea
Binary files /dev/null and b/src/caffe/test/test_data/solver_data.h5 differ
diff --git a/src/caffe/test/test_data/solver_data_list.txt b/src/caffe/test/test_data/solver_data_list.txt
new file mode 100644
index 0000000..a6552f5
--- /dev/null
+++ b/src/caffe/test/test_data/solver_data_list.txt
@@ -0,0 +1 @@
+src/caffe/test/test_data/solver_data.h5
diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp
new file mode 100644
index 0000000..7a4fb98
--- /dev/null
+++ b/src/caffe/test/test_embed_layer.cpp
@@ -0,0 +1,183 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+#ifndef CPU_ONLY
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+#endif
+
+template <typename TypeParam>
+class EmbedLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+ protected:
+  EmbedLayerTest()
+      : blob_bottom_(new Blob<Dtype>(4, 1, 1, 1)),
+        blob_top_(new Blob<Dtype>()) {
+    // fill the values
+    FillerParameter filler_param;
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~EmbedLayerTest() { delete blob_bottom_; delete blob_top_; }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(EmbedLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(EmbedLayerTest, TestSetUp) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(10);
+  embed_param->set_input_dim(5);
+  shared_ptr<EmbedLayer<Dtype> > layer(new EmbedLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_top_->num_axes(), 5);
+  EXPECT_EQ(this->blob_top_->shape(0), 4);
+  EXPECT_EQ(this->blob_top_->shape(1), 1);
+  EXPECT_EQ(this->blob_top_->shape(2), 1);
+  EXPECT_EQ(this->blob_top_->shape(3), 1);
+  EXPECT_EQ(this->blob_top_->shape(4), 10);
+}
+
+TYPED_TEST(EmbedLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  const int kNumOutput = 10;
+  const int kInputDim = 5;
+  embed_param->set_num_output(kNumOutput);
+  embed_param->set_input_dim(kInputDim);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  embed_param->set_bias_term(false);
+  shared_ptr<EmbedLayer<Dtype> > layer(new EmbedLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(1, layer->blobs().size());
+  vector<int> weight_shape(2);
+  weight_shape[0] = kInputDim;
+  weight_shape[1] = kNumOutput;
+  ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape());
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim;
+  }
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  vector<int> weight_offset(2, 0);
+  vector<int> top_offset(5, 0);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    weight_offset[0] = static_cast<int>(this->blob_bottom_->cpu_data()[i]);
+    weight_offset[1] = 0;
+    top_offset[0] = i;
+    top_offset[4] = 0;
+    for (int j = 0; j < kNumOutput; ++j) {
+      EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset),
+                this->blob_top_->data_at(top_offset));
+      ++top_offset[4];
+      ++weight_offset[1];
+    }
+  }
+}
+
+TYPED_TEST(EmbedLayerTest, TestForwardWithBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  const int kNumOutput = 10;
+  const int kInputDim = 5;
+  embed_param->set_num_output(kNumOutput);
+  embed_param->set_input_dim(kInputDim);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler());
+  embed_param->set_bias_term(true);
+  shared_ptr<EmbedLayer<Dtype> > layer(new EmbedLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(2, layer->blobs().size());
+  vector<int> weight_shape(2);
+  weight_shape[0] = kInputDim;
+  weight_shape[1] = kNumOutput;
+  ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape());
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim;
+  }
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  vector<int> bias_offset(1, 0);
+  vector<int> weight_offset(2, 0);
+  vector<int> top_offset(5, 0);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    weight_offset[0] = static_cast<int>(this->blob_bottom_->cpu_data()[i]);
+    weight_offset[1] = 0;
+    top_offset[0] = i;
+    top_offset[4] = 0;
+    bias_offset[0] = 0;
+    for (int j = 0; j < kNumOutput; ++j) {
+      EXPECT_EQ(layer->blobs()[0]->data_at(weight_offset) +
+                layer->blobs()[1]->data_at(bias_offset),
+                this->blob_top_->data_at(top_offset));
+      ++top_offset[4];
+      ++weight_offset[1];
+      ++bias_offset[0];
+    }
+  }
+}
+
+TYPED_TEST(EmbedLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(10);
+  embed_param->set_input_dim(5);
+  embed_param->set_bias_term(false);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  EmbedLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  this->blob_bottom_->mutable_cpu_data()[0] = 4;
+  this->blob_bottom_->mutable_cpu_data()[1] = 2;
+  this->blob_bottom_->mutable_cpu_data()[2] = 2;
+  this->blob_bottom_->mutable_cpu_data()[3] = 3;
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, -2);
+}
+
+TYPED_TEST(EmbedLayerTest, TestGradientWithBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(10);
+  embed_param->set_input_dim(5);
+  embed_param->set_bias_term(true);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler());
+  EmbedLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  this->blob_bottom_->mutable_cpu_data()[0] = 4;
+  this->blob_bottom_->mutable_cpu_data()[1] = 2;
+  this->blob_bottom_->mutable_cpu_data()[2] = 2;
+  this->blob_bottom_->mutable_cpu_data()[3] = 3;
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, -2);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index c9135d6..7ad7467 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -8,8 +8,10 @@
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
+#include "caffe/parallel.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
+#include "caffe/util/io.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -23,12 +25,27 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   GradientBasedSolverTest() :
-      seed_(1701), num_(4), channels_(3), height_(10), width_(10) {}
+      seed_(1701), num_(4), channels_(3), height_(10), width_(10),
+      share_(false) {
+        input_file_ = new string(
+        CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT);
+      }
+  ~GradientBasedSolverTest() {
+    delete input_file_;
+  }
 
+  string snapshot_prefix_;
   shared_ptr<SGDSolver<Dtype> > solver_;
+  shared_ptr<P2PSync<Dtype> > sync_;
   int seed_;
+  // Dimensions are determined by generate_sample_data.py
+  // TODO this is brittle and the hdf5 file should be checked instead.
   int num_, channels_, height_, width_;
-  Dtype delta_;  // Stability constant for AdaGrad.
+  bool share_;
+  Dtype delta_;  // Stability constant for RMSProp, AdaGrad, AdaDelta and Adam
+
+  // Test data: check out generate_sample_data.py in the same directory.
+  string* input_file_;
 
   virtual SolverParameter_SolverType solver_type() = 0;
   virtual void InitSolver(const SolverParameter& param) = 0;
@@ -36,9 +53,6 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   virtual void InitSolverFromProtoString(const string& proto) {
     SolverParameter param;
     CHECK(google::protobuf::TextFormat::ParseFromString(proto, &param));
-    // Disable saving a final snapshot so the tests don't pollute the user's
-    // working directory with useless snapshots.
-    param.set_snapshot_after_train(false);
     // Set the solver_mode according to current Caffe::mode.
     switch (Caffe::mode()) {
       case Caffe::CPU:
@@ -51,47 +65,58 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
         LOG(FATAL) << "Unknown Caffe mode: " << Caffe::mode();
     }
     InitSolver(param);
-    delta_ = (solver_type() == SolverParameter_SolverType_ADAGRAD) ?
-         param.delta() : 0;
+    delta_ = param.delta();
   }
 
-  void RunLeastSquaresSolver(const Dtype learning_rate,
+  string RunLeastSquaresSolver(const Dtype learning_rate,
       const Dtype weight_decay, const Dtype momentum, const int num_iters,
-      const int iter_size = 1) {
+      const int iter_size = 1, const int devices = 1,
+      const bool snapshot = false, const char* from_snapshot = NULL) {
     ostringstream proto;
+    int device_id = 0;
+#ifndef CPU_ONLY
+    if (Caffe::mode() == Caffe::GPU) {
+      CUDA_CHECK(cudaGetDevice(&device_id));
+    }
+#endif
     proto <<
+       "snapshot_after_train: " << snapshot << " "
        "max_iter: " << num_iters << " "
        "base_lr: " << learning_rate << " "
        "lr_policy: 'fixed' "
        "iter_size: " << iter_size << " "
+       "device_id: " << device_id << " "
        "net_param { "
        "  name: 'TestNetwork' "
        "  layer { "
        "    name: 'data' "
-       "    type: 'DummyData' "
-       "    dummy_data_param { "
-       "      num: " << num_ / iter_size << " "
-       "      channels: " << channels_ << " "
-       "      height: " << height_ << " "
-       "      width: " << width_ << " "
-       "      channels: 1 "
-       "      height: 1 "
-       "      width: 1 "
-       "      data_filler { "
-       "        type: 'constant' "
-       "        value: 1.0 "
-       "      } "
-       "      data_filler { "
-       "        type: 'gaussian' "
-       "        std: 1.0 "
-       "      } "
+       "    type: 'HDF5Data' "
+       "    hdf5_data_param { "
+       "      source: '" << *(this->input_file_) << "' "
+       "      batch_size: " << num_ / iter_size << " "
        "    } "
        "    top: 'data' "
        "    top: 'targets' "
-       "  } "
+       "  } ";
+    if (share_) {
+      proto <<
+         "  layer { "
+         "    name: 'slice' "
+         "    type: 'Slice' "
+         "    bottom: 'data' "
+         "    top: 'data1' "
+         "    top: 'data2' "
+         "    slice_param { "
+         "      axis: 0 "
+         "    } "
+         "  } ";
+    }
+    proto <<
        "  layer { "
        "    name: 'innerprod' "
        "    type: 'InnerProduct' "
+       "    param { name: 'weights' } "
+       "    param { name: 'bias' } "
        "    inner_product_param { "
        "      num_output: 1 "
        "      weight_filler { "
@@ -103,9 +128,42 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
        "        std: 1.0 "
        "      } "
        "    } "
-       "    bottom: 'data' "
-       "    top: 'innerprod' "
-       "  } "
+       "    bottom: '" << string(share_ ? "data1": "data") << "' "
+       "    top: '" << string(share_ ? "innerprod1": "innerprod") << "' "
+       "  } ";
+    if (share_) {
+      proto <<
+         "  layer { "
+         "    name: 'innerprod2' "
+         "    type: 'InnerProduct' "
+         "    param { name: 'weights' } "
+         "    param { name: 'bias' } "
+         "    inner_product_param { "
+         "      num_output: 1 "
+         "      weight_filler { "
+         "        type: 'gaussian' "
+         "        std: 1.0 "
+         "      } "
+         "      bias_filler { "
+         "        type: 'gaussian' "
+         "        std: 1.0 "
+         "      } "
+         "    } "
+         "    bottom: 'data2' "
+         "    top: 'innerprod2' "
+         "  } "
+         "  layer { "
+         "    name: 'concat' "
+         "    type: 'Concat' "
+         "    bottom: 'innerprod1' "
+         "    bottom: 'innerprod2' "
+         "    top: 'innerprod' "
+         "    concat_param { "
+         "      axis: 0 "
+         "    } "
+         "  } ";
+    }
+    proto <<
        "  layer { "
        "    name: 'loss' "
        "    type: 'EuclideanLoss' "
@@ -119,9 +177,46 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     if (momentum != 0) {
       proto << "momentum: " << momentum << " ";
     }
+    MakeTempDir(&snapshot_prefix_);
+    proto << "snapshot_prefix: '" << snapshot_prefix_ << "/' ";
+    if (snapshot) {
+      proto << "snapshot: " << num_iters << " ";
+    }
     Caffe::set_random_seed(this->seed_);
     this->InitSolverFromProtoString(proto.str());
-    this->solver_->Solve();
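+    // When resuming, restore the solver state and replay one forward pass per
+    // completed iteration so the data layer advances to the same position as
+    // in an uninterrupted run.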
+    if (from_snapshot != NULL) {
+      this->solver_->Restore(from_snapshot);
+      vector<Blob<Dtype>*> empty_bottom_vec;
+      for (int i = 0; i < this->solver_->iter(); ++i) {
+        this->solver_->net()->Forward(empty_bottom_vec);
+      }
+    }
+    if (devices == 1) {
+      this->solver_->Solve();
+    } else {
+      LOG(INFO) << "Multi-GPU test on " << devices << " devices";
+      vector<int> gpus;
+      // Put the current device at the front of the list.
+      int device_id = solver_->param().device_id();
+      gpus.push_back(device_id);
+      for (int i = 0; gpus.size() < devices; ++i) {
+        if (i != device_id)
+          gpus.push_back(i);
+      }
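+      // Wrap the solver in P2PSync and run it across the selected GPUs.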
+      Caffe::set_solver_count(gpus.size());
+      this->sync_.reset(new P2PSync<Dtype>(
+          this->solver_, NULL, this->solver_->param()));
+      this->sync_->run(gpus);
+      Caffe::set_solver_count(1);
+    }
+    if (snapshot) {
+      ostringstream resume_file;
+      resume_file << snapshot_prefix_ << "/_iter_" << num_iters
+                  << ".solverstate";
+      string resume_filename = resume_file.str();
+      return resume_filename;
+    }
+    return string();
   }
 
   // Compute an update value given the current state of the train net,
@@ -129,7 +224,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   // updated_params will store the updated weight and bias results,
   // using the blobs' diffs to hold the update values themselves.
   void ComputeLeastSquaresUpdate(const Dtype learning_rate,
-      const Dtype weight_decay, const Dtype momentum,
+      const Dtype weight_decay, const Dtype momentum, const int num_iters,
       vector<shared_ptr<Blob<Dtype> > >* updated_params) {
     const int N = num_;
     const int D = channels_ * height_ * width_;
@@ -195,7 +290,12 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
           ((i == D) ? bias.cpu_data()[0] : weights.cpu_data()[i]);
       // Finally, compute update.
       const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
-      ASSERT_EQ(2, history.size());  // 1 blob for weights, 1 for bias
+      if (solver_type() != SolverParameter_SolverType_ADADELTA
+          && solver_type() != SolverParameter_SolverType_ADAM) {
+        ASSERT_EQ(2, history.size());  // 1 blob for weights, 1 for bias
+      } else {
+        ASSERT_EQ(4, history.size());  // additional blobs for update history
+      }
       Dtype update_value = learning_rate * grad;
       const Dtype history_value = (i == D) ?
             history[1]->cpu_data()[0] : history[0]->cpu_data()[i];
@@ -212,6 +312,40 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
       case SolverParameter_SolverType_ADAGRAD:
         update_value /= std::sqrt(history_value + grad * grad) + delta_;
         break;
+      case SolverParameter_SolverType_RMSPROP: {
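+        // RMSProp: scale the step by the root of the decayed running average
+        // of squared gradients.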
+        const Dtype rms_decay = 0.95;
+        update_value /= std::sqrt(rms_decay*history_value
+            + grad * grad * (1 - rms_decay)) + delta_;
+        }
+        break;
+      case SolverParameter_SolverType_ADADELTA:
+      {
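+        // AdaDelta: scale the gradient by the ratio of the RMS of past
+        // updates to the RMS of past squared gradients.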
+        const Dtype update_history_value = (i == D) ?
+            history[1 + num_param_blobs]->cpu_data()[0] :
+            history[0 + num_param_blobs]->cpu_data()[i];
+        const Dtype weighted_gradient_average =
+            momentum * history_value + (1 - momentum) * (grad * grad);
+        update_value = grad * std::sqrt((update_history_value + delta_) /
+            (weighted_gradient_average + delta_)) * learning_rate;
+        // not actually needed, just here for illustrative purposes
+        // const Dtype weighted_update_average =
+        //   momentum * update_history_value + (1 - momentum) * (update_value);
+        break;
+      }
+      case SolverParameter_SolverType_ADAM: {
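+        // Adam: update the biased first and second moment estimates, then
+        // apply the bias-corrected step size alpha_t.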
+        const Dtype momentum2 = 0.999;
+        const Dtype m = history_value;
+        const Dtype v = (i == D) ?
+            history[1 + num_param_blobs]->cpu_data()[0] :
+            history[0 + num_param_blobs]->cpu_data()[i];
+        const Dtype val_m = (1 - momentum) * grad + momentum * m;
+        const Dtype val_v = (1 - momentum2) * grad * grad + momentum2 * v;
+        Dtype alpha_t = learning_rate *
+            std::sqrt(Dtype(1) - pow(momentum2, num_iters)) /
+            (Dtype(1.) - pow(momentum, num_iters));
+        update_value = alpha_t * val_m / (std::sqrt(val_v) + delta_);
+        break;
+      }
       default:
         LOG(FATAL) << "Unknown solver type: " << solver_type();
       }
@@ -333,20 +467,108 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
   void TestLeastSquaresUpdate(const Dtype learning_rate = 1.0,
       const Dtype weight_decay = 0.0, const Dtype momentum = 0.0,
       const int iter_to_check = 0) {
-    // Initialize the solver and run K (= iter_to_check) solver iterations.
-    RunLeastSquaresSolver(learning_rate, weight_decay, momentum, iter_to_check);
+    const int kNum = num_;
+    const int kIterSize = 1;
+    // Test over all numbers of devices.
+    int available_devices = 1;
+#ifndef CPU_ONLY
+    if (Caffe::mode() == Caffe::GPU) {
+      CUDA_CHECK(cudaGetDeviceCount(&available_devices));
+    }
+#endif
+    for (int devices = 1; devices <= available_devices; ++devices) {
+      // Configure the batch size for single-/multi-device equivalence.
+      // Constant data is needed for multi-device runs, as it is for
+      // accumulation.
+      num_ = kNum * devices;
 
-    // Compute the (K+1)th update using the analytic least squares gradient.
-    vector<shared_ptr<Blob<Dtype> > > updated_params;
-    ComputeLeastSquaresUpdate(learning_rate, weight_decay, momentum,
-                              &updated_params);
+      // Initialize the solver and run K (= iter_to_check) solver iterations
+      // (on single device).
+      RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
+                            iter_to_check, kIterSize, 1);
 
-    // Reinitialize the solver and run K+1 solver iterations.
+      // Compute the (K+1)th update using the analytic least squares gradient.
+      vector<shared_ptr<Blob<Dtype> > > updated_params;
+      ComputeLeastSquaresUpdate(learning_rate, weight_decay, momentum,
+          iter_to_check + 1, &updated_params);
+
+      // Reinitialize the solver and run K+1 solver iterations.
+      num_ = kNum;
+      RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
+          iter_to_check + 1, kIterSize, devices);
+
+      // Check that the solver's solution matches ours.
+      CheckLeastSquaresUpdate(updated_params);
+    }
+  }
+
+  void TestSnapshot(const Dtype learning_rate = 1.0,
+      const Dtype weight_decay = 0.0, const Dtype momentum = 0.0,
+      const int num_iters = 1) {
+    // Run the solver for num_iters * 2 iterations.
+    const int total_num_iters = num_iters * 2;
+    bool snapshot = false;
+    const int kIterSize = 1;
+    const int kDevices = 1;
+    RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
+        total_num_iters, kIterSize, kDevices, snapshot);
+
+    // Save the resulting param values.
+    vector<shared_ptr<Blob<Dtype> > > param_copies;
+    const vector<Blob<Dtype>*>& orig_params =
+        solver_->net()->learnable_params();
+    param_copies.resize(orig_params.size());
+    for (int i = 0; i < orig_params.size(); ++i) {
+      param_copies[i].reset(new Blob<Dtype>());
+      const bool kReshape = true;
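+      // Copy both the data and the diff of each parameter blob.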
+      for (int copy_diff = false; copy_diff <= true; ++copy_diff) {
+        param_copies[i]->CopyFrom(*orig_params[i], copy_diff, kReshape);
+      }
+    }
+
+    // Save the solver history
+    vector<shared_ptr<Blob<Dtype> > > history_copies;
+    const vector<shared_ptr<Blob<Dtype> > >& orig_history = solver_->history();
+    history_copies.resize(orig_history.size());
+    for (int i = 0; i < orig_history.size(); ++i) {
+      history_copies[i].reset(new Blob<Dtype>());
+      const bool kReshape = true;
+      for (int copy_diff = false; copy_diff <= true; ++copy_diff) {
+        history_copies[i]->CopyFrom(*orig_history[i], copy_diff, kReshape);
+      }
+    }
+
+    // Run the solver for num_iters iterations and snapshot.
+    snapshot = true;
+    string snapshot_name = RunLeastSquaresSolver(learning_rate, weight_decay,
+        momentum, num_iters, kIterSize, kDevices, snapshot);
+
+    // Reinitialize the solver and run for num_iters more iterations.
+    snapshot = false;
     RunLeastSquaresSolver(learning_rate, weight_decay, momentum,
-                          iter_to_check + 1);
+        total_num_iters, kIterSize, kDevices,
+        snapshot, snapshot_name.c_str());
 
-    // Check that the solver's solution matches ours.
-    CheckLeastSquaresUpdate(updated_params);
+    // Check that params now match.
+    const vector<Blob<Dtype>*>& params = solver_->net()->learnable_params();
+    for (int i = 0; i < params.size(); ++i) {
+      for (int j = 0; j < params[i]->count(); ++j) {
+        EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j])
+            << "param " << i << " data differed at dim " << j;
+        EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j])
+            << "param " << i << " diff differed at dim " << j;
+      }
+    }
+
+    // Check that history now matches.
+    const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
+    for (int i = 0; i < history.size(); ++i) {
+      for (int j = 0; j < history[i]->count(); ++j) {
+        EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j])
+            << "history blob " << i << " data differed at dim " << j;
+        EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j])
+            << "history blob " << i << " diff differed at dim " << j;
+      }
+    }
   }
 };
 
@@ -371,23 +593,38 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdate) {
   this->TestLeastSquaresUpdate();
 }
 
-TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateLROneTenth) {
+TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateLROneHundredth) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.1;
+  const Dtype kLearningRate = 0.01;
   this->TestLeastSquaresUpdate(kLearningRate);
 }
 
 TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
+  const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.5;
-  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
+  const Dtype kMomentum = 0;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithWeightDecayMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
 }
 
 TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0;
   const Dtype kMomentum = 0.5;
   const int kNumIters = 1;
   for (int i = 0; i <= kNumIters; ++i) {
@@ -397,8 +634,8 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentum) {
 
 TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0;
   const Dtype kMomentum = 0.5;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
@@ -409,18 +646,30 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
 TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.9;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.5;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
     this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
   }
 }
 
+TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.5;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
+  const Dtype kWeightDecay = 0.5;
   const Dtype kMomentum = 0.9;
   const int kNumIters = 4;
   const int kIterSize = 2;
@@ -428,6 +677,42 @@ TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
       kIterSize);
 }
 
+TYPED_TEST(SGDSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(SGDSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(SGDSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+
 template <typename TypeParam>
 class AdaGradSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -447,15 +732,15 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdate) {
   this->TestLeastSquaresUpdate();
 }
 
-TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateLROneTenth) {
+TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateLROneHundredth) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.1;
+  const Dtype kLearningRate = 0.01;
   this->TestLeastSquaresUpdate(kLearningRate);
 }
 
 TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
+  const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
@@ -463,25 +748,74 @@ TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithWeightDecay) {
 TYPED_TEST(AdaGradSolverTest, TestAdaGradLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.0;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
     this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
   }
 }
 
+TYPED_TEST(AdaGradSolverTest,
+      TestAdaGradLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.0;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
   const int kNumIters = 4;
   const int kIterSize = 2;
   this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
       kIterSize);
 }
 
+TYPED_TEST(AdaGradSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdaGradSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaGradSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+
 template <typename TypeParam>
 class NesterovSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -501,23 +835,35 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdate) {
   this->TestLeastSquaresUpdate();
 }
 
-TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneTenth) {
+TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateLROneHundredth) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.1;
+  const Dtype kLearningRate = 0.01;
   this->TestLeastSquaresUpdate(kLearningRate);
 }
 
 TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithWeightDecay) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
+  const Dtype kLearningRate = 0.01;
   const Dtype kWeightDecay = 0.5;
   this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
 }
 
+TYPED_TEST(NesterovSolverTest,
+           TestNesterovLeastSquaresUpdateWithWeightDecayMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0;
   const Dtype kMomentum = 0.5;
   const int kNumIters = 1;
   for (int i = 0; i <= kNumIters; ++i) {
@@ -527,8 +873,8 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithMomentum) {
 
 TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
   typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 1.0;
-  const Dtype kWeightDecay = 0.0;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0;
   const Dtype kMomentum = 0.5;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
@@ -539,7 +885,7 @@ TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
 TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
-  const Dtype kWeightDecay = 0.1;
+  const Dtype kWeightDecay = 0.5;
   const Dtype kMomentum = 0.9;
   const int kNumIters = 4;
   for (int i = 0; i <= kNumIters; ++i) {
@@ -547,15 +893,407 @@ TYPED_TEST(NesterovSolverTest, TestNesterovLeastSquaresUpdateWithEverything) {
   }
 }
 
+TYPED_TEST(NesterovSolverTest,
+           TestNesterovLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
   typedef typename TypeParam::Dtype Dtype;
   const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(NesterovSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(NesterovSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(NesterovSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+template <typename TypeParam>
+class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  virtual void InitSolver(const SolverParameter& param) {
+    this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
+  }
+
+  virtual SolverParameter_SolverType solver_type() {
+    return SolverParameter_SolverType_ADADELTA;
+  }
+};
+
+TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  this->TestLeastSquaresUpdate(kLearningRate);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.95;
+  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest,
+           TestAdaDeltaLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
   const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.1;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+template <typename TypeParam>
+class AdamSolverTest : public GradientBasedSolverTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  virtual void InitSolver(const SolverParameter& param) {
+    SolverParameter new_param = param;
+    const Dtype momentum = 0.9;
+    new_param.set_momentum(momentum);
+    const Dtype momentum2 = 0.999;
+    new_param.set_momentum2(momentum2);
+    this->solver_.reset(new AdamSolver<Dtype>(new_param));
+  }
+  virtual SolverParameter_SolverType solver_type() {
+    return SolverParameter_SolverType_ADAM;
+  }
+};
+
+TYPED_TEST_CASE(AdamSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdate) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0;
+  const Dtype kMomentum = 0.9;
+  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+}
+
+TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithWeightDecay) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+}
+
+TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithEverything) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
   const Dtype kMomentum = 0.9;
   const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdamSolverTest, TestAdamLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdamSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdamSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdamSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdamSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.9;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+template <typename TypeParam>
+class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  virtual void InitSolver(const SolverParameter& param) {
+    const Dtype rms_decay = 0.95;
+    SolverParameter new_param = param;
+    new_param.set_rms_decay(rms_decay);
+    this->solver_.reset(new RMSPropSolver<Dtype>(new_param));
+  }
+  virtual SolverParameter_SolverType solver_type() {
+    return SolverParameter_SolverType_RMSPROP;
+  }
+};
+
+TYPED_TEST_CASE(RMSPropSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithWeightDecay) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.5;
+  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay);
+}
+
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithRmsDecay) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(RMSPropSolverTest, TestRMSPropLeastSquaresUpdateWithEverything) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(RMSPropSolverTest,
+      TestRMSPropLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(RMSPropSolverTest, TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.0;
+  const int kNumIters = 4;
   const int kIterSize = 2;
+  this->share_ = true;
   this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
       kIterSize);
 }
 
+TYPED_TEST(RMSPropSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(RMSPropSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 0.01;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp
index a23034f..b56277b 100644
--- a/src/caffe/test/test_hdf5_output_layer.cpp
+++ b/src/caffe/test/test_hdf5_output_layer.cpp
@@ -6,6 +6,7 @@
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
+#include "caffe/util/hdf5.hpp"
 #include "caffe/util/io.hpp"
 #include "caffe/vision_layers.hpp"
 
diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp
index c03df17..fbf0c85 100644
--- a/src/caffe/test/test_inner_product_layer.cpp
+++ b/src/caffe/test/test_inner_product_layer.cpp
@@ -23,16 +23,21 @@ class InnerProductLayerTest : public MultiDeviceTest<TypeParam> {
  protected:
   InnerProductLayerTest()
       : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_nobatch_(new Blob<Dtype>(1, 2, 3, 4)),
         blob_top_(new Blob<Dtype>()) {
     // fill the values
     FillerParameter filler_param;
     UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_);
-    blob_bottom_vec_.push_back(blob_bottom_);
     blob_top_vec_.push_back(blob_top_);
   }
-  virtual ~InnerProductLayerTest() { delete blob_bottom_; delete blob_top_; }
+  virtual ~InnerProductLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_nobatch_;
+    delete blob_top_;
+  }
   Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_nobatch_;
   Blob<Dtype>* const blob_top_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
@@ -42,6 +47,7 @@ TYPED_TEST_CASE(InnerProductLayerTest, TestDtypesAndDevices);
 
 TYPED_TEST(InnerProductLayerTest, TestSetUp) {
   typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_);
   LayerParameter layer_param;
   InnerProductParameter* inner_product_param =
       layer_param.mutable_inner_product_param();
@@ -57,6 +63,38 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) {
 
 TYPED_TEST(InnerProductLayerTest, TestForward) {
   typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_);
+  bool IS_VALID_CUDA = false;
+#ifndef CPU_ONLY
+  IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
+#endif
+  if (Caffe::mode() == Caffe::CPU ||
+      sizeof(Dtype) == 4 || IS_VALID_CUDA) {
+    LayerParameter layer_param;
+    InnerProductParameter* inner_product_param =
+        layer_param.mutable_inner_product_param();
+    inner_product_param->set_num_output(10);
+    inner_product_param->mutable_weight_filler()->set_type("uniform");
+    inner_product_param->mutable_bias_filler()->set_type("uniform");
+    inner_product_param->mutable_bias_filler()->set_min(1);
+    inner_product_param->mutable_bias_filler()->set_max(2);
+    shared_ptr<InnerProductLayer<Dtype> > layer(
+        new InnerProductLayer<Dtype>(layer_param));
+    layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    const Dtype* data = this->blob_top_->cpu_data();
+    const int count = this->blob_top_->count();
+    for (int i = 0; i < count; ++i) {
+      EXPECT_GE(data[i], 1.);
+    }
+  } else {
+    LOG(ERROR) << "Skipping test due to old architecture.";
+  }
+}
+
+TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_);
   bool IS_VALID_CUDA = false;
 #ifndef CPU_ONLY
   IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
@@ -87,6 +125,7 @@ TYPED_TEST(InnerProductLayerTest, TestForward) {
 
 TYPED_TEST(InnerProductLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_);
   bool IS_VALID_CUDA = false;
 #ifndef CPU_ONLY
   IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2;
diff --git a/src/caffe/test/test_internal_thread.cpp b/src/caffe/test/test_internal_thread.cpp
index 31882b6..93f1cc5 100644
--- a/src/caffe/test/test_internal_thread.cpp
+++ b/src/caffe/test/test_internal_thread.cpp
@@ -2,6 +2,7 @@
 #include "gtest/gtest.h"
 
 #include "caffe/internal_thread.hpp"
+#include "caffe/util/math_functions.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -13,11 +14,40 @@ class InternalThreadTest : public ::testing::Test {};
 TEST_F(InternalThreadTest, TestStartAndExit) {
   InternalThread thread;
   EXPECT_FALSE(thread.is_started());
-  EXPECT_TRUE(thread.StartInternalThread());
+  thread.StartInternalThread();
   EXPECT_TRUE(thread.is_started());
-  EXPECT_TRUE(thread.WaitForInternalThreadToExit());
+  thread.StopInternalThread();
   EXPECT_FALSE(thread.is_started());
 }
 
+class TestThreadA : public InternalThread {
+  void InternalThreadEntry() {
+    EXPECT_EQ(4244559767, caffe_rng_rand());
+  }
+};
+
+class TestThreadB : public InternalThread {
+  void InternalThreadEntry() {
+    EXPECT_EQ(1726478280, caffe_rng_rand());
+  }
+};
+
+TEST_F(InternalThreadTest, TestRandomSeed) {
+  TestThreadA t1;
+  Caffe::set_random_seed(9658361);
+  t1.StartInternalThread();
+  t1.StopInternalThread();
+
+  TestThreadA t2;
+  Caffe::set_random_seed(9658361);
+  t2.StartInternalThread();
+  t2.StopInternalThread();
+
+  TestThreadB t3;
+  Caffe::set_random_seed(3435563);
+  t3.StartInternalThread();
+  t3.StopInternalThread();
+}
+
 }  // namespace caffe
 
diff --git a/src/caffe/test/test_layer_factory.cpp b/src/caffe/test/test_layer_factory.cpp
index efb1b37..c86fafd 100644
--- a/src/caffe/test/test_layer_factory.cpp
+++ b/src/caffe/test/test_layer_factory.cpp
@@ -1,11 +1,14 @@
 #include <map>
 #include <string>
 
+#include "boost/scoped_ptr.hpp"
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
 #include "caffe/layer_factory.hpp"
+#include "caffe/util/db.hpp"
+#include "caffe/util/io.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
 
@@ -21,11 +24,20 @@ TYPED_TEST(LayerFactoryTest, TestCreateLayer) {
   typename LayerRegistry<Dtype>::CreatorRegistry& registry =
       LayerRegistry<Dtype>::Registry();
   shared_ptr<Layer<Dtype> > layer;
-  LayerParameter layer_param;
   for (typename LayerRegistry<Dtype>::CreatorRegistry::iterator iter =
        registry.begin(); iter != registry.end(); ++iter) {
     // Special case: PythonLayer is checked by pytest
     if (iter->first == "Python") { continue; }
+    LayerParameter layer_param;
+    // Data layers expect a DB
+    if (iter->first == "Data") {
+      string tmp;
+      MakeTempDir(&tmp);
+      boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
+      db->Open(tmp, db::NEW);
+      db->Close();
+      layer_param.mutable_data_param()->set_source(tmp);
+    }
     layer_param.set_type(iter->first);
     layer = LayerRegistry<Dtype>::CreateLayer(layer_param);
     EXPECT_EQ(iter->first, layer->type());
diff --git a/src/caffe/test/test_mvn_layer.cpp b/src/caffe/test/test_mvn_layer.cpp
index 933b432..be23d86 100644
--- a/src/caffe/test/test_mvn_layer.cpp
+++ b/src/caffe/test/test_mvn_layer.cpp
@@ -6,6 +6,7 @@
 #include "caffe/common.hpp"
 #include "caffe/common_layers.hpp"
 #include "caffe/filler.hpp"
+#include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 
 #include "caffe/test/test_caffe_main.hpp"
@@ -73,7 +74,8 @@ TYPED_TEST(MVNLayerTest, TestForward) {
 TYPED_TEST(MVNLayerTest, TestForwardMeanOnly) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  layer_param.ParseFromString("mvn_param{normalize_variance: false}");
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "mvn_param{normalize_variance: false}", &layer_param));
   MVNLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -105,7 +107,8 @@ TYPED_TEST(MVNLayerTest, TestForwardMeanOnly) {
 TYPED_TEST(MVNLayerTest, TestForwardAcrossChannels) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  layer_param.ParseFromString("mvn_param{across_channels: true}");
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "mvn_param{across_channels: true}", &layer_param));
   MVNLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -149,7 +152,8 @@ TYPED_TEST(MVNLayerTest, TestGradient) {
 TYPED_TEST(MVNLayerTest, TestGradientMeanOnly) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  layer_param.ParseFromString("mvn_param{normalize_variance: false}");
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "mvn_param{normalize_variance: false}", &layer_param));
   MVNLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-3);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
@@ -159,7 +163,8 @@ TYPED_TEST(MVNLayerTest, TestGradientMeanOnly) {
 TYPED_TEST(MVNLayerTest, TestGradientAcrossChannels) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  layer_param.ParseFromString("mvn_param{across_channels: true}");
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "mvn_param{across_channels: true}", &layer_param));
   MVNLayer<Dtype> layer(layer_param);
   GradientChecker<Dtype> checker(1e-2, 1e-3);
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp
index 56959f4..12998d8 100644
--- a/src/caffe/test/test_net.cpp
+++ b/src/caffe/test/test_net.cpp
@@ -1107,11 +1107,10 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
   EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2");
   Blob<Dtype>* ip1_weights = this->net_->layers()[1]->blobs()[0].get();
   Blob<Dtype>* ip2_weights = this->net_->layers()[2]->blobs()[0].get();
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   this->net_->Forward(bottom);
   this->net_->Backward();
   // Compute the expected update as the data minus the two diffs.
@@ -1124,11 +1123,7 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
   // Make sure the diffs are non-trivial.
   for (int i = 0; i < count; ++i) {
     EXPECT_NE(0, ip1_weights->cpu_diff()[i]);
-    EXPECT_NE(0, ip2_weights->cpu_diff()[i]);
-    EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]);
   }
-  caffe_axpy(count, Dtype(1), ip2_weights->cpu_diff(),
-             shared_params.mutable_cpu_diff());
   caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(),
              shared_params.mutable_cpu_data());
   const Dtype* expected_updated_params = shared_params.cpu_data();
@@ -1165,8 +1160,8 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
     EXPECT_NE(0, ip1_weights->cpu_diff()[i]);
     EXPECT_NE(0, ip2_weights->cpu_diff()[i]);
     EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]);
-    EXPECT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i],
-              shared_params.cpu_diff()[i]);
+    EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i],
+                    shared_params.cpu_diff()[i]);
   }
   caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(),
              unshared_params1.mutable_cpu_data());
@@ -1196,11 +1191,10 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) {
   EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2");
   Blob<Dtype>* ip1_weights = this->net_->layers()[1]->blobs()[0].get();
   Blob<Dtype>* ip2_weights = this->net_->layers()[2]->blobs()[0].get();
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   this->net_->ForwardBackward(bottom);
   this->net_->Update();
   Blob<Dtype> shared_params;
@@ -1223,14 +1217,13 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) {
   ASSERT_FALSE(NULL == ip1_weights);
   ASSERT_FALSE(NULL == ip2_weights);
   EXPECT_NE(ip1_weights, ip2_weights);
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   for (int i = 0; i < count; ++i) {
     EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]);
   }
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
 }
 
 TYPED_TEST(NetTest, TestParamPropagateDown) {
diff --git a/src/caffe/test/test_tile_layer.cpp b/src/caffe/test/test_tile_layer.cpp
new file mode 100644
index 0000000..540aac3
--- /dev/null
+++ b/src/caffe/test/test_tile_layer.cpp
@@ -0,0 +1,162 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class TileLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  TileLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_top_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+    FillerParameter filler_param;
+    filler_param.set_mean(0.0);
+    filler_param.set_std(1.0);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(blob_bottom_);
+  }
+
+  virtual ~TileLayerTest() {
+    delete blob_bottom_;
+    delete blob_top_;
+  }
+
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(TileLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(TileLayerTest, TestTrivialSetup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kNumTiles = 1;
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  for (int i = 0; i < this->blob_bottom_->num_axes(); ++i) {
+    layer_param.mutable_tile_param()->set_axis(i);
+    TileLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(this->blob_top_->num_axes(), this->blob_bottom_->num_axes());
+    for (int j = 0; j < this->blob_bottom_->num_axes(); ++j) {
+      EXPECT_EQ(this->blob_top_->shape(j), this->blob_bottom_->shape(j));
+    }
+  }
+}
+
+TYPED_TEST(TileLayerTest, TestSetup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kNumTiles = 3;
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  for (int i = 0; i < this->blob_bottom_->num_axes(); ++i) {
+    layer_param.mutable_tile_param()->set_axis(i);
+    TileLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(this->blob_top_->num_axes(), this->blob_bottom_->num_axes());
+    for (int j = 0; j < this->blob_bottom_->num_axes(); ++j) {
+      const int top_dim =
+          ((i == j) ? kNumTiles : 1) * this->blob_bottom_->shape(j);
+      EXPECT_EQ(top_dim, this->blob_top_->shape(j));
+    }
+  }
+}
+
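+// Tiling along axis 0 repeats the batch kNumTiles times: output sample n
+// must equal input sample n % num.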
+TYPED_TEST(TileLayerTest, TestForwardNum) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kTileAxis = 0;
+  const int kNumTiles = 3;
+  layer_param.mutable_tile_param()->set_axis(kTileAxis);
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  TileLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_top_->num(); ++n) {
+    for (int c = 0; c < this->blob_top_->channels(); ++c) {
+       for (int h = 0; h < this->blob_top_->height(); ++h) {
+         for (int w = 0; w < this->blob_top_->width(); ++w) {
+           const int bottom_n = n % this->blob_bottom_->num();
+           EXPECT_EQ(this->blob_bottom_->data_at(bottom_n, c, h, w),
+                     this->blob_top_->data_at(n, c, h, w));
+         }
+       }
+    }
+  }
+}
+
+TYPED_TEST(TileLayerTest, TestForwardChannels) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kNumTiles = 3;
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  TileLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int n = 0; n < this->blob_top_->num(); ++n) {
+    for (int c = 0; c < this->blob_top_->channels(); ++c) {
+       for (int h = 0; h < this->blob_top_->height(); ++h) {
+         for (int w = 0; w < this->blob_top_->width(); ++w) {
+           const int bottom_c = c % this->blob_bottom_->channels();
+           EXPECT_EQ(this->blob_bottom_->data_at(n, bottom_c, h, w),
+                     this->blob_top_->data_at(n, c, h, w));
+         }
+       }
+    }
+  }
+}
+
+TYPED_TEST(TileLayerTest, TestTrivialGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kNumTiles = 1;
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  TileLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(TileLayerTest, TestGradientNum) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kTileAxis = 0;
+  const int kNumTiles = 3;
+  layer_param.mutable_tile_param()->set_axis(kTileAxis);
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  TileLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(TileLayerTest, TestGradientChannels) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  const int kTileAxis = 1;
+  const int kNumTiles = 3;
+  layer_param.mutable_tile_param()->set_axis(kTileAxis);
+  layer_param.mutable_tile_param()->set_tiles(kNumTiles);
+  TileLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp
index eec6276..0067202 100644
--- a/src/caffe/test/test_upgrade_proto.cpp
+++ b/src/caffe/test/test_upgrade_proto.cpp
@@ -2,12 +2,15 @@
 #include <string>
 #include <vector>
 
+#include "boost/scoped_ptr.hpp"
 #include "google/protobuf/text_format.h"
 #include "gtest/gtest.h"
 
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
+#include "caffe/util/db.hpp"
+#include "caffe/util/io.hpp"
 #include "caffe/util/upgrade_proto.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
@@ -2901,6 +2904,15 @@ TEST_F(NetUpgradeTest, TestUpgradeV1LayerType) {
       continue;  // Empty string isn't actually a valid layer type.
     }
     layer_param.set_type(v2_layer_type);
+    // Data layers expect a DB
+    if (v2_layer_type == "Data") {
+      string tmp;
+      MakeTempDir(&tmp);
+      boost::scoped_ptr<db::DB> db(db::GetDB(DataParameter_DB_LEVELDB));
+      db->Open(tmp, db::NEW);
+      db->Close();
+      layer_param.mutable_data_param()->set_source(tmp);
+    }
     layer = LayerRegistry<float>::CreateLayer(layer_param);
     EXPECT_EQ(v2_layer_type, layer->type());
   }
diff --git a/src/caffe/util/blocking_queue.cpp b/src/caffe/util/blocking_queue.cpp
new file mode 100644
index 0000000..d1d1fa8
--- /dev/null
+++ b/src/caffe/util/blocking_queue.cpp
@@ -0,0 +1,96 @@
+#include <boost/thread.hpp>
+#include <string>
+
+#include "caffe/data_layers.hpp"
+#include "caffe/data_reader.hpp"
+#include "caffe/parallel.hpp"
+#include "caffe/util/blocking_queue.hpp"
+
+namespace caffe {
+
+template<typename T>
+class BlockingQueue<T>::sync {
+ public:
+  mutable boost::mutex mutex_;
+  boost::condition_variable condition_;
+};
+
+template<typename T>
+BlockingQueue<T>::BlockingQueue()
+    : sync_(new sync()) {
+}
+
+template<typename T>
+void BlockingQueue<T>::push(const T& t) {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+  queue_.push(t);
+  lock.unlock();
+  sync_->condition_.notify_one();
+}
+
+template<typename T>
+bool BlockingQueue<T>::try_pop(T* t) {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+
+  if (queue_.empty()) {
+    return false;
+  }
+
+  *t = queue_.front();
+  queue_.pop();
+  return true;
+}
+
+template<typename T>
+T BlockingQueue<T>::pop(const string& log_on_wait) {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+
+  while (queue_.empty()) {
+    if (!log_on_wait.empty()) {
+      LOG_EVERY_N(INFO, 1000) << log_on_wait;
+    }
+    sync_->condition_.wait(lock);
+  }
+
+  T t = queue_.front();
+  queue_.pop();
+  return t;
+}
+
+template<typename T>
+bool BlockingQueue<T>::try_peek(T* t) {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+
+  if (queue_.empty()) {
+    return false;
+  }
+
+  *t = queue_.front();
+  return true;
+}
+
+template<typename T>
+T BlockingQueue<T>::peek() {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+
+  while (queue_.empty()) {
+    sync_->condition_.wait(lock);
+  }
+
+  return queue_.front();
+}
+
+template<typename T>
+size_t BlockingQueue<T>::size() const {
+  boost::mutex::scoped_lock lock(sync_->mutex_);
+  return queue_.size();
+}
+
+template class BlockingQueue<Batch<float>*>;
+template class BlockingQueue<Batch<double>*>;
+template class BlockingQueue<Datum*>;
+template class BlockingQueue<shared_ptr<DataReader::QueuePair> >;
+template class BlockingQueue<P2PSync<float>*>;
+template class BlockingQueue<P2PSync<double>*>;
+
+}  // namespace caffe
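
The blocking_queue.cpp added above is a small thread-safe producer/consumer
queue used by the data-prefetching and multi-GPU paths. A minimal usage
sketch, relying on the Datum* instantiation declared at the bottom of the
file (illustrative only, not part of the commit):

    #include <boost/thread.hpp>
    #include "caffe/proto/caffe.pb.h"          // Datum
    #include "caffe/util/blocking_queue.hpp"

    // Producer thread: push() enqueues and notifies one blocked consumer.
    static void produce(caffe::BlockingQueue<caffe::Datum*>* q) {
      for (int i = 0; i < 100; ++i) {
        q->push(new caffe::Datum());
      }
    }

    int main() {
      caffe::BlockingQueue<caffe::Datum*> queue;
      boost::thread producer(produce, &queue);
      for (int i = 0; i < 100; ++i) {
        // pop() blocks while the queue is empty, logging the message only
        // occasionally; try_pop()/try_peek() are the non-blocking variants.
        caffe::Datum* d = queue.pop("waiting on producer");
        delete d;
      }
      producer.join();
      return 0;
    }

Only the element types explicitly instantiated at the end of the file link
out of the box; any other type would need its own instantiation there.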
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
new file mode 100644
index 0000000..d0d05f7
--- /dev/null
+++ b/src/caffe/util/hdf5.cpp
@@ -0,0 +1,160 @@
+#include "caffe/util/hdf5.hpp"
+
+#include <string>
+#include <vector>
+
+namespace caffe {
+
+// Verifies format of data stored in HDF5 file and reshapes blob accordingly.
+template <typename Dtype>
+void hdf5_load_nd_dataset_helper(
+    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
+    Blob<Dtype>* blob) {
+  // Verify that the dataset exists.
+  CHECK(H5LTfind_dataset(file_id, dataset_name_))
+      << "Failed to find HDF5 dataset " << dataset_name_;
+  // Verify that the number of dimensions is in the accepted range.
+  herr_t status;
+  int ndims;
+  status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
+  CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
+  CHECK_GE(ndims, min_dim);
+  CHECK_LE(ndims, max_dim);
+
+  // Verify that the data format is what we expect: float or double.
+  std::vector<hsize_t> dims(ndims);
+  H5T_class_t class_;
+  status = H5LTget_dataset_info(
+      file_id, dataset_name_, dims.data(), &class_, NULL);
+  CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
+  CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
+
+  vector<int> blob_dims(dims.size());
+  for (int i = 0; i < dims.size(); ++i) {
+    blob_dims[i] = dims[i];
+  }
+  blob->Reshape(blob_dims);
+}
+
+template <>
+void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
+        int min_dim, int max_dim, Blob<float>* blob) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+  herr_t status = H5LTread_dataset_float(
+    file_id, dataset_name_, blob->mutable_cpu_data());
+  CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
+}
+
+template <>
+void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
+        int min_dim, int max_dim, Blob<double>* blob) {
+  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
+  herr_t status = H5LTread_dataset_double(
+    file_id, dataset_name_, blob->mutable_cpu_data());
+  CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
+}
+
+template <>
+void hdf5_save_nd_dataset<float>(
+    const hid_t file_id, const string& dataset_name, const Blob<float>& blob,
+    bool write_diff) {
+  int num_axes = blob.num_axes();
+  hsize_t *dims = new hsize_t[num_axes];
+  for (int i = 0; i < num_axes; ++i) {
+    dims[i] = blob.shape(i);
+  }
+  const float* data;
+  if (write_diff) {
+    data = blob.cpu_diff();
+  } else {
+    data = blob.cpu_data();
+  }
+  herr_t status = H5LTmake_dataset_float(
+      file_id, dataset_name.c_str(), num_axes, dims, data);
+  CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
+  delete[] dims;
+}
+
+template <>
+void hdf5_save_nd_dataset<double>(
+    hid_t file_id, const string& dataset_name, const Blob<double>& blob,
+    bool write_diff) {
+  int num_axes = blob.num_axes();
+  hsize_t *dims = new hsize_t[num_axes];
+  for (int i = 0; i < num_axes; ++i) {
+    dims[i] = blob.shape(i);
+  }
+  const double* data;
+  if (write_diff) {
+    data = blob.cpu_diff();
+  } else {
+    data = blob.cpu_data();
+  }
+  herr_t status = H5LTmake_dataset_double(
+      file_id, dataset_name.c_str(), num_axes, dims, data);
+  CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
+  delete[] dims;
+}
+
+string hdf5_load_string(hid_t loc_id, const string& dataset_name) {
+  // Get size of dataset
+  size_t size;
+  H5T_class_t class_;
+  herr_t status = \
+    H5LTget_dataset_info(loc_id, dataset_name.c_str(), NULL, &class_, &size);
+  CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name;
+  char *buf = new char[size];
+  status = H5LTread_dataset_string(loc_id, dataset_name.c_str(), buf);
+  CHECK_GE(status, 0)
+    << "Failed to load int dataset with name " << dataset_name;
+  string val(buf);
+  delete[] buf;
+  return val;
+}
+
+void hdf5_save_string(hid_t loc_id, const string& dataset_name,
+                      const string& s) {
+  herr_t status = \
+    H5LTmake_dataset_string(loc_id, dataset_name.c_str(), s.c_str());
+  CHECK_GE(status, 0)
+    << "Failed to save string dataset with name " << dataset_name;
+}
+
+int hdf5_load_int(hid_t loc_id, const string& dataset_name) {
+  int val;
+  herr_t status = H5LTread_dataset_int(loc_id, dataset_name.c_str(), &val);
+  CHECK_GE(status, 0)
+    << "Failed to load int dataset with name " << dataset_name;
+  return val;
+}
+
+void hdf5_save_int(hid_t loc_id, const string& dataset_name, int i) {
+  hsize_t one = 1;
+  herr_t status = \
+    H5LTmake_dataset_int(loc_id, dataset_name.c_str(), 1, &one, &i);
+  CHECK_GE(status, 0)
+    << "Failed to save int dataset with name " << dataset_name;
+}
+
+int hdf5_get_num_links(hid_t loc_id) {
+  H5G_info_t info;
+  herr_t status = H5Gget_info(loc_id, &info);
+  CHECK_GE(status, 0) << "Error while counting HDF5 links.";
+  return info.nlinks;
+}
+
+string hdf5_get_name_by_idx(hid_t loc_id, int idx) {
+  ssize_t str_size = H5Lget_name_by_idx(
+      loc_id, ".", H5_INDEX_NAME, H5_ITER_NATIVE, idx, NULL, 0, H5P_DEFAULT);
+  CHECK_GE(str_size, 0) << "Error retrieving HDF5 dataset at index " << idx;
+  char *c_str = new char[str_size+1];
+  ssize_t status = H5Lget_name_by_idx(
+      loc_id, ".", H5_INDEX_NAME, H5_ITER_NATIVE, idx, c_str, str_size+1,
+      H5P_DEFAULT);
+  CHECK_GE(status, 0) << "Error retrieving HDF5 dataset at index " << idx;
+  string result(c_str);
+  delete[] c_str;
+  return result;
+}
+
+}  // namespace caffe
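
These helpers replace the HDF5 routines previously living in io.cpp (removed
below); the save functions now take an explicit write_diff flag and accept
blobs with any number of axes instead of the fixed 4-D shape. A minimal
round-trip sketch using the functions above (the file name is a placeholder,
not part of the commit):

    #include "hdf5.h"
    #include "caffe/blob.hpp"
    #include "caffe/util/hdf5.hpp"

    int main() {
      caffe::Blob<float> blob(2, 3, 4, 5);
      // ... fill blob.mutable_cpu_data() ...

      hid_t file = H5Fcreate("blob.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
      caffe::hdf5_save_nd_dataset(file, "data", blob, false);  // data, not diff
      H5Fclose(file);

      caffe::Blob<float> loaded;
      file = H5Fopen("blob.h5", H5F_ACC_RDONLY, H5P_DEFAULT);
      // Accept 1- to 4-dimensional datasets; the helper reshapes 'loaded'.
      caffe::hdf5_load_nd_dataset(file, "data", 1, 4, &loaded);
      H5Fclose(file);
      return 0;
    }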
diff --git a/src/caffe/util/insert_splits.cpp b/src/caffe/util/insert_splits.cpp
index 416f80a..475a2a9 100644
--- a/src/caffe/util/insert_splits.cpp
+++ b/src/caffe/util/insert_splits.cpp
@@ -32,7 +32,8 @@ void InsertSplits(const NetParameter& param, NetParameter* param_split) {
       const string& blob_name = layer_param.bottom(j);
       if (blob_name_to_last_top_idx.find(blob_name) ==
           blob_name_to_last_top_idx.end()) {
-        LOG(FATAL) << "Unknown blob input " << blob_name << " to layer " << j;
+        LOG(FATAL) << "Unknown bottom blob '" << blob_name << "' (layer '"
+                   << layer_param.name() << "', bottom index " << j << ")";
       }
       const pair<int, int>& bottom_idx = make_pair(i, j);
       const pair<int, int>& top_idx = blob_name_to_last_top_idx[blob_name];
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 77ef7f2..6f03314 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -228,79 +228,5 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
   datum->set_data(buffer);
 }
 
-// Verifies format of data stored in HDF5 file and reshapes blob accordingly.
-template <typename Dtype>
-void hdf5_load_nd_dataset_helper(
-    hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
-    Blob<Dtype>* blob) {
-  // Verify that the dataset exists.
-  CHECK(H5LTfind_dataset(file_id, dataset_name_))
-      << "Failed to find HDF5 dataset " << dataset_name_;
-  // Verify that the number of dimensions is in the accepted range.
-  herr_t status;
-  int ndims;
-  status = H5LTget_dataset_ndims(file_id, dataset_name_, &ndims);
-  CHECK_GE(status, 0) << "Failed to get dataset ndims for " << dataset_name_;
-  CHECK_GE(ndims, min_dim);
-  CHECK_LE(ndims, max_dim);
-
-  // Verify that the data format is what we expect: float or double.
-  std::vector<hsize_t> dims(ndims);
-  H5T_class_t class_;
-  status = H5LTget_dataset_info(
-      file_id, dataset_name_, dims.data(), &class_, NULL);
-  CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name_;
-  CHECK_EQ(class_, H5T_FLOAT) << "Expected float or double data";
-
-  vector<int> blob_dims(dims.size());
-  for (int i = 0; i < dims.size(); ++i) {
-    blob_dims[i] = dims[i];
-  }
-  blob->Reshape(blob_dims);
-}
-
-template <>
-void hdf5_load_nd_dataset<float>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<float>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-  herr_t status = H5LTread_dataset_float(
-    file_id, dataset_name_, blob->mutable_cpu_data());
-  CHECK_GE(status, 0) << "Failed to read float dataset " << dataset_name_;
-}
-
-template <>
-void hdf5_load_nd_dataset<double>(hid_t file_id, const char* dataset_name_,
-        int min_dim, int max_dim, Blob<double>* blob) {
-  hdf5_load_nd_dataset_helper(file_id, dataset_name_, min_dim, max_dim, blob);
-  herr_t status = H5LTread_dataset_double(
-    file_id, dataset_name_, blob->mutable_cpu_data());
-  CHECK_GE(status, 0) << "Failed to read double dataset " << dataset_name_;
-}
-
-template <>
-void hdf5_save_nd_dataset<float>(
-    const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
-  hsize_t dims[HDF5_NUM_DIMS];
-  dims[0] = blob.num();
-  dims[1] = blob.channels();
-  dims[2] = blob.height();
-  dims[3] = blob.width();
-  herr_t status = H5LTmake_dataset_float(
-      file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-  CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
-}
-
-template <>
-void hdf5_save_nd_dataset<double>(
-    const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
-  hsize_t dims[HDF5_NUM_DIMS];
-  dims[0] = blob.num();
-  dims[1] = blob.channels();
-  dims[2] = blob.height();
-  dims[3] = blob.width();
-  herr_t status = H5LTmake_dataset_double(
-      file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
-  CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
-}
 
 }  // namespace caffe
diff --git a/src/caffe/util/signal_handler.cpp b/src/caffe/util/signal_handler.cpp
new file mode 100644
index 0000000..5d764ec
--- /dev/null
+++ b/src/caffe/util/signal_handler.cpp
@@ -0,0 +1,115 @@
+#include <boost/bind.hpp>
+#include <glog/logging.h>
+
+#include <signal.h>
+#include <csignal>
+
+#include "caffe/util/signal_handler.h"
+
+namespace {
+  static volatile sig_atomic_t got_sigint = false;
+  static volatile sig_atomic_t got_sighup = false;
+  static bool already_hooked_up = false;
+
+  void handle_signal(int signal) {
+    switch (signal) {
+    case SIGHUP:
+      got_sighup = true;
+      break;
+    case SIGINT:
+      got_sigint = true;
+      break;
+    }
+  }
+
+  void HookupHandler() {
+    if (already_hooked_up) {
+      LOG(FATAL) << "Tried to hookup signal handlers more than once.";
+    }
+    already_hooked_up = true;
+
+    struct sigaction sa;
+    // Setup the handler
+    sa.sa_handler = &handle_signal;
+    // Restart the system call, if at all possible
+    sa.sa_flags = SA_RESTART;
+    // Block every signal during the handler
+    sigfillset(&sa.sa_mask);
+    // Intercept SIGHUP and SIGINT
+    if (sigaction(SIGHUP, &sa, NULL) == -1) {
+      LOG(FATAL) << "Cannot install SIGHUP handler.";
+    }
+    if (sigaction(SIGINT, &sa, NULL) == -1) {
+      LOG(FATAL) << "Cannot install SIGINT handler.";
+    }
+  }
+
+  // Set the signal handlers to the default.
+  void UnhookHandler() {
+    if (already_hooked_up) {
+      struct sigaction sa;
+      // Restore the default handler
+      sa.sa_handler = SIG_DFL;
+      // Restart the system call, if at all possible
+      sa.sa_flags = SA_RESTART;
+      // Block every signal during the handler
+      sigfillset(&sa.sa_mask);
+      // Intercept SIGHUP and SIGINT
+      if (sigaction(SIGHUP, &sa, NULL) == -1) {
+        LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
+      }
+      if (sigaction(SIGINT, &sa, NULL) == -1) {
+        LOG(FATAL) << "Cannot uninstall SIGINT handler.";
+      }
+
+      already_hooked_up = false;
+    }
+  }
+
+  // Return true iff a SIGINT has been received since the last time this
+  // function was called.
+  bool GotSIGINT() {
+    bool result = got_sigint;
+    got_sigint = false;
+    return result;
+  }
+
+  // Return true iff a SIGHUP has been received since the last time this
+  // function was called.
+  bool GotSIGHUP() {
+    bool result = got_sighup;
+    got_sighup = false;
+    return result;
+  }
+}  // namespace
+
+namespace caffe {
+
+SignalHandler::SignalHandler(SolverAction::Enum SIGINT_action,
+                             SolverAction::Enum SIGHUP_action):
+  SIGINT_action_(SIGINT_action),
+  SIGHUP_action_(SIGHUP_action) {
+  HookupHandler();
+}
+
+SignalHandler::~SignalHandler() {
+  UnhookHandler();
+}
+
+SolverAction::Enum SignalHandler::CheckForSignals() const {
+  if (GotSIGHUP()) {
+    return SIGHUP_action_;
+  }
+  if (GotSIGINT()) {
+    return SIGINT_action_;
+  }
+  return SolverAction::NONE;
+}
+
+// Return the function that the solver can use to find out if a snapshot or
+// early exit is being requested.
+ActionCallback SignalHandler::GetActionFunction() {
+  return boost::bind(&SignalHandler::CheckForSignals, this);
+}
+
+}  // namespace caffe
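
The handler only records which signals arrived; the solver learns about them
by polling the callback returned by GetActionFunction(), exactly as wired up
in tools/caffe.cpp below. A condensed sketch of that wiring (solver parameter
setup elided; illustrative only):

    #include <boost/shared_ptr.hpp>
    #include "caffe/caffe.hpp"
    #include "caffe/util/signal_handler.h"

    void run(const caffe::SolverParameter& solver_param) {
      // In this example, Ctrl-C requests a snapshot and SIGHUP a clean stop.
      caffe::SignalHandler handler(caffe::SolverAction::SNAPSHOT,
                                   caffe::SolverAction::STOP);
      boost::shared_ptr<caffe::Solver<float> >
          solver(caffe::GetSolver<float>(solver_param));
      // The solver polls this between iterations and snapshots or stops
      // accordingly.
      solver->SetActionFunction(handler.GetActionFunction());
      solver->Solve();
    }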
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 38a0602..92e5cf5 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -588,8 +588,8 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
   if (NetNeedsV0ToV1Upgrade(*param)) {
     // NetParameter was specified using the old style (V0LayerParameter); try to
     // upgrade it.
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "V0LayerParameter: " << param_file;
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "V0LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV0Net(original_param, param)) {
       success = false;
@@ -599,29 +599,29 @@ bool UpgradeNetAsNeeded(const string& param_file, NetParameter* param) {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
                 << "V0LayerParameter";
     }
-    LOG(ERROR) << "Note that future Caffe releases will not support "
+    LOG(WARNING) << "Note that future Caffe releases will not support "
         << "V0NetParameter; use ./build/tools/upgrade_net_proto_text for "
         << "prototxt and ./build/tools/upgrade_net_proto_binary for model "
         << "weights upgrade this and any other net protos to the new format.";
   }
   // NetParameter uses old style data transformation fields; try to upgrade it.
   if (NetNeedsDataUpgrade(*param)) {
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "transformation parameters: " << param_file;
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "transformation parameters: " << param_file;
     UpgradeNetDataTransformation(param);
     LOG(INFO) << "Successfully upgraded file specified using deprecated "
               << "data transformation parameters.";
-    LOG(ERROR) << "Note that future Caffe releases will only support "
-               << "transform_param messages for transformation fields.";
+    LOG(WARNING) << "Note that future Caffe releases will only support "
+                 << "transform_param messages for transformation fields.";
   }
   if (NetNeedsV1ToV2Upgrade(*param)) {
-    LOG(ERROR) << "Attempting to upgrade input file specified using deprecated "
-               << "V1LayerParameter: " << param_file;
+    LOG(INFO) << "Attempting to upgrade input file specified using deprecated "
+              << "V1LayerParameter: " << param_file;
     NetParameter original_param(*param);
     if (!UpgradeV1Net(original_param, param)) {
       success = false;
       LOG(ERROR) << "Warning: had one or more problems upgrading "
-          << "V1LayerParameter (see above); continuing anyway.";
+                 << "V1LayerParameter (see above); continuing anyway.";
     } else {
       LOG(INFO) << "Successfully upgraded file specified using deprecated "
                 << "V1LayerParameter";
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 0b7523f..ff63860 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -1,3 +1,8 @@
+#ifdef WITH_PYTHON_LAYER
+#include "boost/python.hpp"
+namespace bp = boost::python;
+#endif
+
 #include <glog/logging.h>
 
 #include <cstring>
@@ -7,18 +12,23 @@
 
 #include "boost/algorithm/string.hpp"
 #include "caffe/caffe.hpp"
+#include "caffe/util/signal_handler.h"
 
 using caffe::Blob;
 using caffe::Caffe;
 using caffe::Net;
 using caffe::Layer;
+using caffe::Solver;
 using caffe::shared_ptr;
+using caffe::string;
 using caffe::Timer;
 using caffe::vector;
+using std::ostringstream;
 
-
-DEFINE_int32(gpu, -1,
-    "Run in GPU mode on given device ID.");
+DEFINE_string(gpu, "",
+    "Optional; run in GPU mode on given device IDs separated by ','."
+    "Use '-gpu all' to run on all available GPUs. The effective training "
+    "batch size is multiplied by the number of devices.");
 DEFINE_string(solver, "",
     "The solver definition protocol buffer text file.");
 DEFINE_string(model, "",
@@ -26,10 +36,16 @@ DEFINE_string(model, "",
 DEFINE_string(snapshot, "",
     "Optional; the snapshot solver state to resume training.");
 DEFINE_string(weights, "",
-    "Optional; the pretrained weights to initialize finetuning. "
-    "Cannot be set simultaneously with snapshot.");
+    "Optional; the pretrained weights to initialize finetuning, "
+    "separated by ','. Cannot be set simultaneously with snapshot.");
 DEFINE_int32(iterations, 50,
     "The number of iterations to run.");
+DEFINE_string(sigint_effect, "stop",
+             "Optional; action to take when a SIGINT signal is received: "
+              "snapshot, stop or none.");
+DEFINE_string(sighup_effect, "snapshot",
+             "Optional; action to take when a SIGHUP signal is received: "
+             "snapshot, stop or none.");
 
 // A simple registry for caffe commands.
 typedef int (*BrewFunction)();
@@ -61,6 +77,29 @@ static BrewFunction GetBrewFunction(const caffe::string& name) {
   }
 }
 
+// Parse GPU ids or use all available devices
+static void get_gpus(vector<int>* gpus) {
+  if (FLAGS_gpu == "all") {
+    int count = 0;
+#ifndef CPU_ONLY
+    CUDA_CHECK(cudaGetDeviceCount(&count));
+#else
+    NO_GPU;
+#endif
+    for (int i = 0; i < count; ++i) {
+      gpus->push_back(i);
+    }
+  } else if (FLAGS_gpu.size()) {
+    vector<string> strings;
+    boost::split(strings, FLAGS_gpu, boost::is_any_of(","));
+    for (int i = 0; i < strings.size(); ++i) {
+      gpus->push_back(boost::lexical_cast<int>(strings[i]));
+    }
+  } else {
+    CHECK_EQ(gpus->size(), 0);
+  }
+}
+
 // caffe commands to call by
 //     caffe <command> <args>
 //
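
For reference, the comma-separated parsing in get_gpus above uses only Boost
string utilities; a standalone equivalent of the '-gpu 0,2' case (hypothetical
helper name, not part of the tool):

    #include <boost/algorithm/string.hpp>
    #include <boost/lexical_cast.hpp>
    #include <string>
    #include <vector>

    std::vector<int> parse_gpu_list(const std::string& flag) {
      std::vector<int> gpus;
      std::vector<std::string> pieces;
      boost::split(pieces, flag, boost::is_any_of(","));
      for (size_t i = 0; i < pieces.size(); ++i) {
        gpus.push_back(boost::lexical_cast<int>(pieces[i]));  // "0,2" -> {0, 2}
      }
      return gpus;
    }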
@@ -69,10 +108,13 @@ static BrewFunction GetBrewFunction(const caffe::string& name) {
 
 // Device Query: show diagnostic information for a GPU device.
 int device_query() {
-  CHECK_GT(FLAGS_gpu, -1) << "Need a device ID to query.";
-  LOG(INFO) << "Querying device ID = " << FLAGS_gpu;
-  caffe::Caffe::SetDevice(FLAGS_gpu);
-  caffe::Caffe::DeviceQuery();
+  LOG(INFO) << "Querying GPUs " << FLAGS_gpu;
+  vector<int> gpus;
+  get_gpus(&gpus);
+  for (int i = 0; i < gpus.size(); ++i) {
+    caffe::Caffe::SetDevice(gpus[i]);
+    caffe::Caffe::DeviceQuery();
+  }
   return 0;
 }
 RegisterBrewFunction(device_query);
@@ -91,6 +133,22 @@ void CopyLayers(caffe::Solver<float>* solver, const std::string& model_list) {
   }
 }
 
+// Translate the signal effect the user specified on the command-line to the
+// corresponding enumeration.
+caffe::SolverAction::Enum GetRequestedAction(
+    const std::string& flag_value) {
+  if (flag_value == "stop") {
+    return caffe::SolverAction::STOP;
+  }
+  if (flag_value == "snapshot") {
+    return caffe::SolverAction::SNAPSHOT;
+  }
+  if (flag_value == "none") {
+    return caffe::SolverAction::NONE;
+  }
+  LOG(FATAL) << "Invalid signal effect \""<< flag_value << "\" was specified";
+}
+
 // Train / Finetune a model.
 int train() {
   CHECK_GT(FLAGS_solver.size(), 0) << "Need a solver definition to train.";
@@ -101,34 +159,56 @@ int train() {
   caffe::SolverParameter solver_param;
   caffe::ReadProtoFromTextFileOrDie(FLAGS_solver, &solver_param);
 
-  // If the gpu flag is not provided, allow the mode and device to be set
+  // If the gpus flag is not provided, allow the mode and device to be set
   // in the solver prototxt.
-  if (FLAGS_gpu < 0
+  if (FLAGS_gpu.size() == 0
       && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
-    FLAGS_gpu = solver_param.device_id();
+      if (solver_param.has_device_id()) {
+          FLAGS_gpu = ""  +
+              boost::lexical_cast<string>(solver_param.device_id());
+      } else {  // Set default GPU if unspecified
+          FLAGS_gpu = "" + boost::lexical_cast<string>(0);
+      }
   }
 
-  // Set device id and mode
-  if (FLAGS_gpu >= 0) {
-    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
-    Caffe::SetDevice(FLAGS_gpu);
-    Caffe::set_mode(Caffe::GPU);
-  } else {
-    LOG(INFO) << "Use CPU.";
+  vector<int> gpus;
+  get_gpus(&gpus);
+  if (gpus.size() == 0) {
     Caffe::set_mode(Caffe::CPU);
+  } else {
+    ostringstream s;
+    for (int i = 0; i < gpus.size(); ++i) {
+      s << (i ? ", " : "") << gpus[i];
+    }
+    LOG(INFO) << "Using GPUs " << s.str();
+
+    solver_param.set_device_id(gpus[0]);
+    Caffe::SetDevice(gpus[0]);
+    Caffe::set_mode(Caffe::GPU);
+    Caffe::set_solver_count(gpus.size());
   }
 
-  LOG(INFO) << "Starting Optimization";
+  caffe::SignalHandler signal_handler(
+        GetRequestedAction(FLAGS_sigint_effect),
+        GetRequestedAction(FLAGS_sighup_effect));
+
   shared_ptr<caffe::Solver<float> >
     solver(caffe::GetSolver<float>(solver_param));
 
+  solver->SetActionFunction(signal_handler.GetActionFunction());
+
   if (FLAGS_snapshot.size()) {
     LOG(INFO) << "Resuming from " << FLAGS_snapshot;
-    solver->Solve(FLAGS_snapshot);
+    solver->Restore(FLAGS_snapshot.c_str());
   } else if (FLAGS_weights.size()) {
-    CopyLayers(&*solver, FLAGS_weights);
-    solver->Solve();
+    CopyLayers(solver.get(), FLAGS_weights);
+  }
+
+  if (gpus.size() > 1) {
+    caffe::P2PSync<float> sync(solver, NULL, solver->param());
+    sync.run(gpus);
   } else {
+    LOG(INFO) << "Starting Optimization";
     solver->Solve();
   }
   LOG(INFO) << "Optimization Done.";
@@ -143,9 +223,11 @@ int test() {
   CHECK_GT(FLAGS_weights.size(), 0) << "Need model weights to score.";
 
   // Set device id and mode
-  if (FLAGS_gpu >= 0) {
-    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
-    Caffe::SetDevice(FLAGS_gpu);
+  vector<int> gpus;
+  get_gpus(&gpus);
+  if (gpus.size() != 0) {
+    LOG(INFO) << "Use GPU with device ID " << gpus[0];
+    Caffe::SetDevice(gpus[0]);
     Caffe::set_mode(Caffe::GPU);
   } else {
     LOG(INFO) << "Use CPU.";
@@ -208,9 +290,11 @@ int time() {
   CHECK_GT(FLAGS_model.size(), 0) << "Need a model definition to time.";
 
   // Set device id and mode
-  if (FLAGS_gpu >= 0) {
-    LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
-    Caffe::SetDevice(FLAGS_gpu);
+  vector<int> gpus;
+  get_gpus(&gpus);
+  if (gpus.size() != 0) {
+    LOG(INFO) << "Use GPU with device ID " << gpus[0];
+    Caffe::SetDevice(gpus[0]);
     Caffe::set_mode(Caffe::GPU);
   } else {
     LOG(INFO) << "Use CPU.";
@@ -304,7 +388,16 @@ int main(int argc, char** argv) {
   // Run tool or show usage.
   caffe::GlobalInit(&argc, &argv);
   if (argc == 2) {
-    return GetBrewFunction(caffe::string(argv[1]))();
+#ifdef WITH_PYTHON_LAYER
+    try {
+#endif
+      return GetBrewFunction(caffe::string(argv[1]))();
+#ifdef WITH_PYTHON_LAYER
+    } catch (bp::error_already_set) {
+      PyErr_Print();
+      return 1;
+    }
+#endif
   } else {
     gflags::ShowUsageWithFlagsRestrict(argv[0], "tools/caffe");
   }
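
Taken together, the new flags let one invocation drive several devices and
react to signals; for example (paths are placeholders):

    build/tools/caffe train -solver path/to/solver.prototxt -gpu 0,1 \
        -sigint_effect stop -sighup_effect snapshot

With more than one device, training runs one solver replica per listed GPU
under P2PSync, so a batch_size of 64 in the net prototxt and two devices give
an effective batch of 128 per iteration, as noted in the -gpu flag's help text.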
diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp
index 816a91f..aad1f1f 100644
--- a/tools/convert_imageset.cpp
+++ b/tools/convert_imageset.cpp
@@ -44,6 +44,8 @@ DEFINE_string(encode_type, "",
 
 int main(int argc, char** argv) {
   ::google::InitGoogleLogging(argv[0]);
+  // Print output to stderr (while still logging)
+  FLAGS_alsologtostderr = 1;
 
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
@@ -140,13 +142,13 @@ int main(int argc, char** argv) {
       // Commit db
       txn->Commit();
       txn.reset(db->NewTransaction());
-      LOG(ERROR) << "Processed " << count << " files.";
+      LOG(INFO) << "Processed " << count << " files.";
     }
   }
   // write the last batch
   if (count % 1000 != 0) {
     txn->Commit();
-    LOG(ERROR) << "Processed " << count << " files.";
+    LOG(INFO) << "Processed " << count << " files.";
   }
   return 0;
 }
diff --git a/tools/extra/parse_log.py b/tools/extra/parse_log.py
index 09ea216..48f9bee 100755
--- a/tools/extra/parse_log.py
+++ b/tools/extra/parse_log.py
@@ -28,7 +28,7 @@ def parse_log(path_to_log):
     regex_iteration = re.compile('Iteration (\d+)')
     regex_train_output = re.compile('Train net output #(\d+): (\S+) = ([\.\deE+-]+)')
     regex_test_output = re.compile('Test net output #(\d+): (\S+) = ([\.\deE+-]+)')
-    regex_learning_rate = re.compile('lr = ([\.\d]+)')
+    regex_learning_rate = re.compile('lr = ([-+]?[0-9]*\.?[0-9]+([eE]?[-+]?[0-9]+)?)')
 
     # Pick out lines of interest
     iteration = -1
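
The widened pattern also matches learning rates logged in scientific notation
(for example "lr = 1e-05"), which the previous expression '([\.\d]+)' would
have truncated to just "1".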
