[caffe] 01/05: patches: Apply updates from upstream master branch. Remove merged patch cmake-change-static-lib-name.patch.

Zhou Mo cdluminate-guest at moszumanska.debian.org
Tue Dec 19 02:48:13 UTC 2017


This is an automated email from the git hooks/post-receive script.

cdluminate-guest pushed a commit to branch master
in repository caffe.

commit 052fec4f55fc211e0209dd3d834e9035f5917671
Author: Mo Zhou <cdluminate at gmail.com>
Date:   Tue Dec 19 02:28:35 2017 +0000

    patches: Apply updates from upstream master branch. Remove merged patch cmake-change-static-lib-name.patch.
---
 debian/patches/cmake-change-static-lib-name.patch |   46 -
 debian/patches/series                             |    3 +-
 debian/patches/upstream-master-1.0-to-head.patch  | 1651 +++++++++++++++++++++
 3 files changed, 1653 insertions(+), 47 deletions(-)

diff --git a/debian/patches/cmake-change-static-lib-name.patch b/debian/patches/cmake-change-static-lib-name.patch
deleted file mode 100644
index 1c892e5..0000000
--- a/debian/patches/cmake-change-static-lib-name.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-Purpose: change the static library name so it would not be ambiguous.
-Forward: This is already merged upstream.
-diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
-index b9152e9..4a80556 100644
---- a/src/caffe/CMakeLists.txt
-+++ b/src/caffe/CMakeLists.txt
-@@ -3,12 +3,12 @@ file(GLOB proto_files proto/*.proto)
- caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
- 
- # include python files either to force generation
--add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
--caffe_default_properties(proto)
--target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES})
--target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR})
-+add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-+caffe_default_properties(caffeproto)
-+target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES})
-+target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR})
- 
--list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend!
-+list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend!
- 
- # --[ Caffe library
- 
-@@ -42,7 +42,7 @@ set_target_properties(caffe PROPERTIES
- # ---[ Install
- install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
- install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto)
--install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
-+install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
- 
- file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
- list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
-diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
-index ad91f54..09bb09b 100644
---- a/cmake/ConfigGen.cmake
-+++ b/cmake/ConfigGen.cmake
-@@ -33,7 +33,7 @@ function(caffe_generate_export_configs)
-   configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)
- 
-   # Add targets to the build-tree export set
--  export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
-+  export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
-   export(PACKAGE Caffe)
- 
-   # ---[ Configure install-tree CaffeConfig.cmake file ]---
diff --git a/debian/patches/series b/debian/patches/series
index 10d6a91..3f0216c 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,5 +1,6 @@
+upstream-master-1.0-to-head.patch
+
 cmake-using-basic-blas
 cmake-fix-python-module-installdir
 fix-more-float-comparison-precision-issue2
 cmake-link-correct-python-lib.patch
-cmake-change-static-lib-name.patch
diff --git a/debian/patches/upstream-master-1.0-to-head.patch b/debian/patches/upstream-master-1.0-to-head.patch
new file mode 100644
index 0000000..d3fed86
--- /dev/null
+++ b/debian/patches/upstream-master-1.0-to-head.patch
@@ -0,0 +1,1651 @@
+diff --git a/Makefile b/Makefile
+index 4d324160..c6d5685b 100644
+--- a/Makefile
++++ b/Makefile
+@@ -694,6 +694,6 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS)
+ 	install -m 644 $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib
+ 	cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_NAME_SHORT)
+ 	# add python - it's not the standard way, indeed...
+-	cp -r python $(DISTRIBUTE_DIR)/python
++	cp -r python $(DISTRIBUTE_DIR)/
+ 
+ -include $(DEPS)
+diff --git a/Makefile.config.example b/Makefile.config.example
+index d552b38a..79905935 100644
+--- a/Makefile.config.example
++++ b/Makefile.config.example
+@@ -33,6 +33,7 @@ CUDA_DIR := /usr/local/cuda
+ # CUDA architecture setting: going with all of them.
+ # For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility.
+ # For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility.
++# For CUDA >= 9.0, comment the *_20 and *_21 lines for compatibility.
+ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
+ 		-gencode arch=compute_20,code=sm_21 \
+ 		-gencode arch=compute_30,code=sm_30 \
+diff --git a/README.md b/README.md
+index 0ae3616b..fe259535 100644
+--- a/README.md
++++ b/README.md
+@@ -15,6 +15,14 @@ Check out the [project site](http://caffe.berkeleyvision.org) for all the detail
+ 
+ and step-by-step examples.
+ 
++## Custom distributions
++
++- [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (optimized for CPUs, with multi-node support), in particular Xeon processors (HSW, BDW, SKX, Xeon Phi).
++- [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices.
++- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows)
++
++## Community
++
+ [![Join the chat at https://gitter.im/BVLC/caffe](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/BVLC/caffe?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+ 
+ Please join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) or [gitter chat](https://gitter.im/BVLC/caffe) to ask questions and talk about methods and models.
+diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
+index ad91f542..09bb09b4 100644
+--- a/cmake/ConfigGen.cmake
++++ b/cmake/ConfigGen.cmake
+@@ -33,7 +33,7 @@ function(caffe_generate_export_configs)
+   configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)
+ 
+   # Add targets to the build-tree export set
+-  export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
++  export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
+   export(PACKAGE Caffe)
+ 
+   # ---[ Configure install-tree CaffeConfig.cmake file ]---
+diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
+index 4a5bac47..c48255c8 100644
+--- a/cmake/Dependencies.cmake
++++ b/cmake/Dependencies.cmake
+@@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "")
+ set(Caffe_COMPILE_OPTIONS "")
+ 
+ # ---[ Boost
+-find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem)
++find_package(Boost 1.54 REQUIRED COMPONENTS system thread filesystem)
+ list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS})
+ list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES})
+ 
+diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake
+index 8eaab594..4d44e613 100644
+--- a/cmake/Modules/FindvecLib.cmake
++++ b/cmake/Modules/FindvecLib.cmake
+@@ -12,11 +12,12 @@ endif()
+ 
+ set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers")
+ 
++exec_program(xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR)
+ find_path(vecLib_INCLUDE_DIR vecLib.h
+           DOC "vecLib include directory"
+           PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix}
+                 /System/Library/${__veclib_include_suffix}
+-                /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
++                ${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
+           NO_DEFAULT_PATH)
+ 
+ include(FindPackageHandleStandardArgs)
+diff --git a/docs/development.md b/docs/development.md
+index ec05bbee..36cd3995 100644
+--- a/docs/development.md
++++ b/docs/development.md
+@@ -116,5 +116,5 @@ To get a list of all options `googletest` provides, simply pass the `--help` fla
+ 
+ - **Run `make lint` to check C++ code.**
+ - Wrap lines at 80 chars.
+-- Follow [Google C++ style](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml) and [Google python style](http://google-styleguide.googlecode.com/svn/trunk/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/).
++- Follow [Google C++ style](https://google.github.io/styleguide/cppguide.html) and [Google python style](https://google.github.io/styleguide/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/).
+ - Remember that “a foolish consistency is the hobgoblin of little minds,” so use your best judgement to write the clearest code for your particular case.
+diff --git a/docs/install_apt.md b/docs/install_apt.md
+index ee2cd287..e361a92d 100644
+--- a/docs/install_apt.md
++++ b/docs/install_apt.md
+@@ -4,10 +4,43 @@ title: "Installation: Ubuntu"
+ 
+ # Ubuntu Installation
+ 
++### For Ubuntu (>= 17.04)
++
++**Installing pre-compiled Caffe**
++
++Everything, including Caffe itself, is packaged in 17.04 and higher versions.
++To install the pre-compiled Caffe package, run
++
++    sudo apt install caffe-cpu
++
++for the CPU-only version, or
++
++    sudo apt install caffe-cuda
++
++for the CUDA version. Note that the CUDA version may break if your NVIDIA
++driver and CUDA toolkit are not installed by APT.
++
++[Package status of CPU-only version](https://launchpad.net/ubuntu/+source/caffe)
++
++[Package status of CUDA version](https://launchpad.net/ubuntu/+source/caffe-contrib)
++
++**Installing Caffe from source**
++
++The build dependencies can be installed with a single command:
++
++    sudo apt build-dep caffe-cpu        # dependencies for CPU-only version
++    sudo apt build-dep caffe-cuda       # dependencies for CUDA version
++
++It requires a `deb-src` line in your `sources.list`.
++Continue with [compilation](installation.html#compilation).
++
++### For Ubuntu (\< 17.04)
++
+ **General dependencies**
+ 
+     sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
+     sudo apt-get install --no-install-recommends libboost-all-dev
++    sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev
+ 
+ **CUDA**: Install by `apt-get` or the NVIDIA `.run` package.
+ The NVIDIA package tends to follow more recent library and driver versions, but the installation is more manual.
+@@ -22,12 +55,6 @@ This can be skipped for CPU-only installation.
+ 
+ CUDA 8 is required on Ubuntu 16.04.
+ 
+-**Remaining dependencies, 14.04**
+-
+-Everything is packaged in 14.04.
+-
+-    sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev
+-
+ **Remaining dependencies, 12.04**
+ 
+ These dependencies need manual installation in 12.04.
+diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
+index 65fe7092..0a6a3b96 100644
+--- a/docs/install_apt_debian.md
++++ b/docs/install_apt_debian.md
+@@ -8,24 +8,28 @@ Caffe packages are available for several Debian versions, as shown in the
+ following chart:
+ 
+ ```
+-Your Distro     |  CPU_ONLY  |  CUDA  |     Alias
++Your Distro     |  CPU_ONLY  |  CUDA  | Codename
+ ----------------+------------+--------+-------------------
+-Debian/stable   |     ✘      |   ✘    | Debian Jessie
+-Debian/testing  |     ✔      |   ✔    | Debian Stretch/Sid
+-Debian/unstable |     ✔      |   ✔    | Debian Sid
++Debian/oldstable|     ✘      |   ✘    | Jessie (8.0)
++Debian/stable   |     ✔      |   ✔    | Stretch (9.0)
++Debian/testing  |     ✔      |   ✔    | Buster
++Debian/unstable |     ✔      |   ✔    | Buster
+ ```
+ 
+ * `✘ ` You should take a look at [Ubuntu installation instruction](install_apt.html).
+ 
+ * `✔ ` You can install caffe with a single command line following this guide.
+ 
+-Last update: 2017-02-01
++* [Package status of CPU-only version](https://tracker.debian.org/pkg/caffe)
++
++* [Package status of CUDA version](https://tracker.debian.org/pkg/caffe-contrib)
++
++Last update: 2017-07-08
+ 
+ ## Binary installation with APT
+ 
+-Apart from the installation methods based on source, Debian/unstable
+-and Debian/testing users can install pre-compiled Caffe packages from
+-the official archive.
++Apart from the installation methods based on source, Debian users can install
++pre-compiled Caffe packages from the official archive with APT.
+ 
+ Make sure that your `/etc/apt/sources.list` contains `contrib` and `non-free`
+ sections if you want to install the CUDA version, for instance:
+ $ caffe                                              # command line interface working
+ $ python3 -c 'import caffe; print(caffe.__path__)'   # python3 interface working
+ ```
+ 
+-These Caffe packages should work for you out of box.
+These Caffe packages should work for you out of the box. However, the CUDA version
++may break if your NVIDIA driver and CUDA toolkit are not installed with APT.
+ 
+ #### Customizing caffe packages
+ 
+@@ -96,18 +101,22 @@ Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
+ Some users may find their favorite compiler doesn't work with CUDA.
+ 
+ ```
+-CXX compiler |  CUDA 7.5  |  CUDA 8.0  |
+--------------+------------+------------+-
+-GCC-7        |     ?      |     ?      |
+-GCC-6        |     ✘      |     ✘      |
+-GCC-5        |     ✔ [1]  |     ✔      |
+-CLANG-4.0    |     ?      |     ?      |
+-CLANG-3.9    |     ✘      |     ✘      |
+-CLANG-3.8    |     ?      |     ✔      |
++CXX compiler |  CUDA 7.5  |  CUDA 8.0  |  CUDA 9.0  |
++-------------+------------+------------+------------+
++GCC-8        |     ?      |     ?      |     ?      |
++GCC-7        |     ?      |     ?      |     ?      |
++GCC-6        |     ✘      |     ✘      |     ✔      |
++GCC-5        |     ✔ [1]  |     ✔      |     ✔      |
++-------------+------------+------------+------------+
++CLANG-4.0    |     ?      |     ?      |     ?      |
++CLANG-3.9    |     ✘      |     ✘      |     ✔      |
++CLANG-3.8    |     ?      |     ✔      |     ✔      |
+ ```
+ 
+ `[1]` CUDA 7.5 's `host_config.h` must be patched before working with GCC-5.
+ 
++`[2]` CUDA 9.0: https://devblogs.nvidia.com/parallelforall/cuda-9-features-revealed/
++
+ BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's.
+ You may encounter failure linking GCC-4.X object files against GCC-5 libraries.
+ (See https://wiki.debian.org/GCC5 )
+@@ -152,10 +161,3 @@ and hack the packaging scripts, then build your customized package.
+ $ sudo apt install caffe-doc
+ $ dpkg -L caffe-doc
+ ```
+-
+-* Where can I find the Debian package status?
+-
+-```
+-https://tracker.debian.org/pkg/caffe          (for the CPU_ONLY version)
+-https://tracker.debian.org/pkg/caffe-contrib  (for the CUDA version)
+-```
+diff --git a/docs/installation.md b/docs/installation.md
+index 42f1d0ce..6104cc24 100644
+--- a/docs/installation.md
++++ b/docs/installation.md
+@@ -42,14 +42,14 @@ Optional dependencies:
+ 
+ * [OpenCV](http://opencv.org/) >= 2.4 including 3.0
+ * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
+-* cuDNN for GPU acceleration (v6)
++* cuDNN for GPU acceleration (v7)
+ 
+ Pycaffe and Matcaffe interfaces have their own natural needs.
+ 
+ * For Python Caffe:  `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python`
+ * For MATLAB Caffe: MATLAB with the `mex` compiler.
+ 
+-**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe.
++**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v7; older versions are supported in older Caffe.
+ 
+ **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.
+ 
+diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md
+index 2faacc58..78a46f3a 100644
+--- a/docs/tutorial/layers.md
++++ b/docs/tutorial/layers.md
+@@ -87,7 +87,7 @@ Layers:
+ * [ELU](layers/elu.html) - exponential linear rectification.
+ * [Sigmoid](layers/sigmoid.html)
+ * [TanH](layers/tanh.html)
+-* [Absolute Value](layers/abs.html)
++* [Absolute Value](layers/absval.html)
+ * [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power.
+ * [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x).
+ * [Log](layers/log.html) - f(x) = log(x).
+diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md
+index 2fbef734..f5e48292 100644
+--- a/docs/tutorial/layers/lrn.md
++++ b/docs/tutorial/layers/lrn.md
+@@ -14,7 +14,7 @@ title: Local Response Normalization (LRN)
+         - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN)
+         - `alpha` [default 1]: the scaling parameter (see below)
+         - `beta` [default 5]: the exponent (see below)
+-        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`)
++        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`)
+ 
+ The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary).
+ 
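Note (an illustrative sketch, not part of the patch): the normalization formula above is easy to sanity-check in NumPy. A minimal ACROSS_CHANNELS implementation; the function name and the beta value used here are assumptions:

    import numpy as np

    def lrn_across_channels(x, local_size=5, alpha=1.0, beta=0.75):
        # x: float array of shape (channels, height, width);
        # zero padding is implicit at the channel ends via clamped slicing
        half = local_size // 2
        sq = x ** 2
        out = np.empty_like(x)
        for i in range(x.shape[0]):
            lo, hi = max(0, i - half), min(x.shape[0], i + half + 1)
            scale = 1.0 + (alpha / local_size) * sq[lo:hi].sum(axis=0)
            out[i] = x[i] / scale ** beta
        return out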
+diff --git a/examples/brewing-logreg.ipynb b/examples/brewing-logreg.ipynb
+index c053b73b..0f87185a 100644
+--- a/examples/brewing-logreg.ipynb
++++ b/examples/brewing-logreg.ipynb
+@@ -73,12 +73,12 @@
+     ")\n",
+     "\n",
+     "# Split into train and test\n",
+-    "X, Xt, y, yt = sklearn.cross_validation.train_test_split(X, y)\n",
++    "X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)\n",
+     "\n",
+     "# Visualize sample of the data\n",
+     "ind = np.random.permutation(X.shape[0])[:1000]\n",
+     "df = pd.DataFrame(X[ind])\n",
+-    "_ = pd.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])"
++    "_ = pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])"
+    ]
+   },
+   {
+@@ -111,7 +111,7 @@
+     "%%timeit\n",
+     "# Train and test the scikit-learn SGD logistic regression.\n",
+     "clf = sklearn.linear_model.SGDClassifier(\n",
+-    "    loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='auto')\n",
++    "    loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='balanced')\n",
+     "\n",
+     "clf.fit(X, y)\n",
+     "yt_pred = clf.predict(Xt)\n",
+diff --git a/examples/web_demo/readme.md b/examples/web_demo/readme.md
+index fe74b9ef..e50c4f10 100644
+--- a/examples/web_demo/readme.md
++++ b/examples/web_demo/readme.md
+@@ -11,7 +11,7 @@ priority: 10
+ ## Requirements
+ 
+ The demo server requires Python with some dependencies.
+-To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](/installation.html)).
++To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](http://caffe.berkeleyvision.org/installation.html)).
+ 
+ Make sure that you have obtained the Reference CaffeNet Model and the ImageNet Auxiliary Data:
+ 
+diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
+index dad9ad46..bb92ded7 100644
+--- a/include/caffe/filler.hpp
++++ b/include/caffe/filler.hpp
+@@ -250,10 +250,10 @@ class BilinearFiller : public Filler<Dtype> {
+     CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
+     Dtype* data = blob->mutable_cpu_data();
+     int f = ceil(blob->width() / 2.);
+-    float c = (2 * f - 1 - f % 2) / (2. * f);
++    Dtype c = (blob->width() - 1) / (2. * f);
+     for (int i = 0; i < blob->count(); ++i) {
+-      float x = i % blob->width();
+-      float y = (i / blob->width()) % blob->height();
++      Dtype x = i % blob->width();
++      Dtype y = (i / blob->width()) % blob->height();
+       data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+     }
+     CHECK_EQ(this->filler_param_.sparse(), -1)
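Note (illustration only, not project code): the patched center formula `c = (width - 1) / (2 * f)` yields the usual separable bilinear upsampling kernel. The same computation in NumPy:

    import numpy as np

    def bilinear_weights(width):
        f = int(np.ceil(width / 2.0))
        c = (width - 1) / (2.0 * f)            # patched center formula
        w1d = 1 - np.abs(np.arange(width) / f - c)
        return np.outer(w1d, w1d)              # square (width x width) kernel

    print(bilinear_weights(4))  # 4x4 kernel for 2x upsampling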
+diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp
+index a9ad3225..dd2247b9 100644
+--- a/include/caffe/layers/accuracy_layer.hpp
++++ b/include/caffe/layers/accuracy_layer.hpp
+@@ -68,6 +68,8 @@ class AccuracyLayer : public Layer<Dtype> {
+    */
+   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+       const vector<Blob<Dtype>*>& top);
++  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
++      const vector<Blob<Dtype>*>& top);
+ 
+ 
+   /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
+@@ -77,6 +79,8 @@ class AccuracyLayer : public Layer<Dtype> {
+       if (propagate_down[i]) { NOT_IMPLEMENTED; }
+     }
+   }
++  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
++      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+ 
+   int label_axis_, outer_num_, inner_num_;
+ 
+diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp
+index c4fda122..5219fa5c 100644
+--- a/include/caffe/layers/crop_layer.hpp
++++ b/include/caffe/layers/crop_layer.hpp
+@@ -41,13 +41,15 @@ class CropLayer : public Layer<Dtype> {
+   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+ 
+-  vector<int> offsets;
++  Blob<int> offsets;
++  Blob<int> src_strides_;
++  Blob<int> dest_strides_;
+ 
+  private:
+   // Recursive copy function.
+   void crop_copy(const vector<Blob<Dtype>*>& bottom,
+                const vector<Blob<Dtype>*>& top,
+-               const vector<int>& offsets,
++               const int* offsets,
+                vector<int> indices,
+                int cur_dim,
+                const Dtype* src_data,
+diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp
+index edecde82..3b3caa27 100644
+--- a/include/caffe/layers/infogain_loss_layer.hpp
++++ b/include/caffe/layers/infogain_loss_layer.hpp
+@@ -13,20 +13,21 @@
+ namespace caffe {
+ 
+ /**
+- * @brief A generalization of MultinomialLogisticLossLayer that takes an
++ * @brief A generalization of SoftmaxWithLossLayer that takes an
+  *        "information gain" (infogain) matrix specifying the "value" of all label
+  *        pairs.
+  *
+- * Equivalent to the MultinomialLogisticLossLayer if the infogain matrix is the
++ * Equivalent to the SoftmaxWithLossLayer if the infogain matrix is the
+  * identity.
+  *
+  * @param bottom input Blob vector (length 2-3)
+  *   -# @f$ (N \times C \times H \times W) @f$
+- *      the predictions @f$ \hat{p} @f$, a Blob with values in
+- *      @f$ [0, 1] @f$ indicating the predicted probability of each of the
+- *      @f$ K = CHW @f$ classes.  Each prediction vector @f$ \hat{p}_n @f$
+- *      should sum to 1 as in a probability distribution: @f$
+- *      \forall n \sum\limits_{k=1}^K \hat{p}_{nk} = 1 @f$.
++ *      the predictions @f$ x @f$, a Blob with values in
++ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
++ *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
++ *      probability distribution over classes using the softmax function
++ *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
++ *      \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer).
+  *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+  *      the labels @f$ l @f$, an integer-valued Blob with values
+  *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
+@@ -35,7 +36,7 @@ namespace caffe {
+  *      (\b optional) the infogain matrix @f$ H @f$.  This must be provided as
+  *      the third bottom blob input if not provided as the infogain_mat in the
+  *      InfogainLossParameter. If @f$ H = I @f$, this layer is equivalent to the
+- *      MultinomialLogisticLossLayer.
++ *      SoftmaxWithLossLayer.
+  * @param top output Blob vector (length 1)
+  *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
+  *      the computed infogain multinomial logistic loss: @f$ E =
+@@ -98,8 +99,8 @@ class InfogainLossLayer : public LossLayer<Dtype> {
+    *      infogain matrix, if provided as bottom[2])
+    * @param bottom input Blob vector (length 2-3)
+    *   -# @f$ (N \times C \times H \times W) @f$
+-   *      the predictions @f$ \hat{p} @f$; Backward computes diff
+-   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
++   *      the predictions @f$ x @f$; Backward computes diff
++   *      @f$ \frac{\partial E}{\partial x} @f$
+    *   -# @f$ (N \times 1 \times 1 \times 1) @f$
+    *      the labels -- ignored as we can't compute their error gradients
+    *   -# @f$ (1 \times 1 \times K \times K) @f$
+diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
+index 498cfe38..cd3f93f6 100644
+--- a/include/caffe/util/cudnn.hpp
++++ b/include/caffe/util/cudnn.hpp
+@@ -44,6 +44,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
+ #if CUDNN_VERSION_MIN(6, 0, 0)
+     case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING:
+       return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING";
++#endif
++#if CUDNN_VERSION_MIN(7, 0, 0)
++    case CUDNN_STATUS_RUNTIME_IN_PROGRESS:
++      return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
++    case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
++      return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
+ #endif
+   }
+   return "Unknown cudnn status";
+diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m
+index 349e060e..bb99ec89 100644
+--- a/matlab/+caffe/Net.m
++++ b/matlab/+caffe/Net.m
+@@ -69,7 +69,9 @@ classdef Net < handle
+       self.blob_names = self.attributes.blob_names;
+     end
+     function delete (self)
+-      caffe_('delete_net', self.hNet_self);
++      if ~isempty(self.hNet_self)
++        caffe_('delete_net', self.hNet_self);
++      end
+     end
+     function layer = layers(self, layer_name)
+       CHECK(ischar(layer_name), 'layer_name must be a string');
+diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
+index d7f43fff..72659a4f 100644
+--- a/python/caffe/_caffe.cpp
++++ b/python/caffe/_caffe.cpp
+@@ -464,6 +464,14 @@ BOOST_PYTHON_MODULE(_caffe) {
+     .add_property("count",    static_cast<int (Blob<Dtype>::*)() const>(
+         &Blob<Dtype>::count))
+     .def("reshape",           bp::raw_function(&Blob_Reshape))
++#ifndef CPU_ONLY
++    .add_property("_gpu_data_ptr",
++        reinterpret_cast<uintptr_t (Blob<Dtype>::*)()>(
++          &Blob<Dtype>::mutable_gpu_data))
++    .add_property("_gpu_diff_ptr",
++        reinterpret_cast<uintptr_t (Blob<Dtype>::*)()>(
++          &Blob<Dtype>::mutable_gpu_diff))
++#endif
+     .add_property("data",     bp::make_function(&Blob<Dtype>::mutable_cpu_data,
+           NdarrayCallPolicies()))
+     .add_property("diff",     bp::make_function(&Blob<Dtype>::mutable_cpu_diff,
+diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py
+index ea29fed8..983760a7 100644
+--- a/python/caffe/classifier.py
++++ b/python/caffe/classifier.py
+@@ -92,7 +92,7 @@ class Classifier(caffe.Net):
+ 
+         # For oversampling, average predictions across crops.
+         if oversample:
+-            predictions = predictions.reshape((len(predictions) / 10, 10, -1))
++            predictions = predictions.reshape((len(predictions) // 10, 10, -1))
+             predictions = predictions.mean(1)
+ 
+         return predictions
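Note (toy illustration): the `//` change matters under Python 3, where `/` on ints returns a float and `reshape` then raises:

    import numpy as np

    predictions = np.ones((20, 5))            # 2 inputs x 10 crops, 5 classes
    n = len(predictions) // 10                # 2; plain '/' would give 2.0
    averaged = predictions.reshape((n, 10, -1)).mean(1)
    print(averaged.shape)                     # (2, 5)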
+diff --git a/python/caffe/io.py b/python/caffe/io.py
+index 966c164c..ed4b3bef 100644
+--- a/python/caffe/io.py
++++ b/python/caffe/io.py
+@@ -256,7 +256,12 @@ class Transformer:
+             if len(ms) != 3:
+                 raise ValueError('Mean shape invalid')
+             if ms != self.inputs[in_][1:]:
+-                raise ValueError('Mean shape incompatible with input shape.')
++                in_shape = self.inputs[in_][1:]
++                m_min, m_max = mean.min(), mean.max()
++                normal_mean = (mean - m_min) / (m_max - m_min)
++                mean = resize_image(normal_mean.transpose((1,2,0)),
++                        in_shape[1:]).transpose((2,0,1)) * \
++                        (m_max - m_min) + m_min
+         self.mean[in_] = mean
+ 
+     def set_input_scale(self, in_, scale):
+@@ -323,7 +328,7 @@ def resize_image(im, new_dims, interp_order=1):
+             # skimage is fast but only understands {1,3} channel images
+             # in [0, 1].
+             im_std = (im - im_min) / (im_max - im_min)
+-            resized_std = resize(im_std, new_dims, order=interp_order)
++            resized_std = resize(im_std, new_dims, order=interp_order, mode='constant')
+             resized_im = resized_std * (im_max - im_min) + im_min
+         else:
+             # the image is a constant -- avoid divide by 0
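Note (an illustrative sketch of the new `set_mean` behaviour; the shapes are placeholders): instead of rejecting a mismatched mean, the Transformer now normalizes it into [0, 1], resizes it spatially, and scales it back:

    import numpy as np
    from skimage.transform import resize

    mean = np.random.rand(3, 256, 256) * 255   # placeholder (C, H, W) mean
    in_shape = (3, 224, 224)                   # network input shape
    m_min, m_max = mean.min(), mean.max()
    normal = (mean - m_min) / (m_max - m_min)  # into [0, 1] for skimage
    resized = resize(normal.transpose((1, 2, 0)), in_shape[1:],
                     order=1, mode='constant')  # resize on (H, W, C)
    mean = resized.transpose((2, 0, 1)) * (m_max - m_min) + m_min
    print(mean.shape)                          # (3, 224, 224)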
+diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py
+index afd27690..ee1d38c3 100644
+--- a/python/caffe/test/test_net.py
++++ b/python/caffe/test/test_net.py
+@@ -72,41 +72,41 @@ class TestNet(unittest.TestCase):
+         self.net.backward()
+ 
+     def test_forward_start_end(self):
+-        conv_blob=self.net.blobs['conv'];
+-        ip_blob=self.net.blobs['ip_blob'];
+-        sample_data=np.random.uniform(size=conv_blob.data.shape);
+-        sample_data=sample_data.astype(np.float32);
+-        conv_blob.data[:]=sample_data;
+-        forward_blob=self.net.forward(start='ip',end='ip');
+-        self.assertIn('ip_blob',forward_blob);
+-
+-        manual_forward=[];
++        conv_blob=self.net.blobs['conv']
++        ip_blob=self.net.blobs['ip_blob']
++        sample_data=np.random.uniform(size=conv_blob.data.shape)
++        sample_data=sample_data.astype(np.float32)
++        conv_blob.data[:]=sample_data
++        forward_blob=self.net.forward(start='ip',end='ip')
++        self.assertIn('ip_blob',forward_blob)
++
++        manual_forward=[]
+         for i in range(0,conv_blob.data.shape[0]):
+           dot=np.dot(self.net.params['ip'][0].data,
+-                     conv_blob.data[i].reshape(-1));
+-          manual_forward.append(dot+self.net.params['ip'][1].data);
+-        manual_forward=np.array(manual_forward);
++                     conv_blob.data[i].reshape(-1))
++          manual_forward.append(dot+self.net.params['ip'][1].data)
++        manual_forward=np.array(manual_forward)
+ 
+-        np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3);
++        np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3,atol=1e-5)
+ 
+     def test_backward_start_end(self):
+-        conv_blob=self.net.blobs['conv'];
+-        ip_blob=self.net.blobs['ip_blob'];
++        conv_blob=self.net.blobs['conv']
++        ip_blob=self.net.blobs['ip_blob']
+         sample_data=np.random.uniform(size=ip_blob.data.shape)
+-        sample_data=sample_data.astype(np.float32);
+-        ip_blob.diff[:]=sample_data;
+-        backward_blob=self.net.backward(start='ip',end='ip');
+-        self.assertIn('conv',backward_blob);
++        sample_data=sample_data.astype(np.float32)
++        ip_blob.diff[:]=sample_data
++        backward_blob=self.net.backward(start='ip',end='ip')
++        self.assertIn('conv',backward_blob)
+ 
+-        manual_backward=[];
++        manual_backward=[]
+         for i in range(0,conv_blob.data.shape[0]):
+           dot=np.dot(self.net.params['ip'][0].data.transpose(),
+-                     sample_data[i].reshape(-1));
+-          manual_backward.append(dot);
+-        manual_backward=np.array(manual_backward);
+-        manual_backward=manual_backward.reshape(conv_blob.data.shape);
++                     sample_data[i].reshape(-1))
++          manual_backward.append(dot)
++        manual_backward=np.array(manual_backward)
++        manual_backward=manual_backward.reshape(conv_blob.data.shape)
+ 
+-        np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3);
++        np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3,atol=1e-5)
+ 
+     def test_clear_param_diffs(self):
+         # Run a forward/backward step to have non-zero diffs
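Note (illustration): the added `atol` guards comparisons against values near zero, where `rtol` alone allows no error at all because the bound is `rtol * |expected|`:

    import numpy as np

    np.testing.assert_allclose(1e-9, 0.0, rtol=1e-3, atol=1e-5)  # passes
    # np.testing.assert_allclose(1e-9, 0.0, rtol=1e-3)           # would raise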
+diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh
+index dac5d2f9..abf9cf1c 100755
+--- a/scripts/travis/install-deps.sh
++++ b/scripts/travis/install-deps.sh
+@@ -9,10 +9,10 @@ apt-get -y update
+ apt-get install -y --no-install-recommends \
+   build-essential \
+   graphviz \
+-  libboost-filesystem1.55-dev \
+-  libboost-python1.55-dev \
+-  libboost-system1.55-dev \
+-  libboost-thread1.55-dev \
++  libboost-filesystem-dev \
++  libboost-python-dev \
++  libboost-system-dev \
++  libboost-thread-dev \
+   libgflags-dev \
+   libgoogle-glog-dev \
+   libhdf5-serial-dev \
+@@ -106,7 +106,7 @@ if $WITH_CUDA ; then
+   ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda
+ 
+   if $WITH_CUDNN ; then
+-    apt-get install -y --no-install-recommends libcudnn6-dev
++    apt-get install -y --no-install-recommends libcudnn7-dev
+   fi
+ fi
+ 
+diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
+index b9152e92..4a805568 100644
+--- a/src/caffe/CMakeLists.txt
++++ b/src/caffe/CMakeLists.txt
+@@ -3,12 +3,12 @@ file(GLOB proto_files proto/*.proto)
+ caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
+ 
+ # include python files either to force generation
+-add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+-caffe_default_properties(proto)
+-target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES})
+-target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR})
++add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
++caffe_default_properties(caffeproto)
++target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES})
++target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR})
+ 
+-list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend!
++list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend!
+ 
+ # --[ Caffe library
+ 
+@@ -42,7 +42,7 @@ set_target_properties(caffe PROPERTIES
+ # ---[ Install
+ install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+ install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto)
+-install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ 
+ file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
+ list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
+diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
+index 4eddbb5c..b6d95b54 100644
+--- a/src/caffe/layers/accuracy_layer.cpp
++++ b/src/caffe/layers/accuracy_layer.cpp
+@@ -52,8 +52,6 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+   const Dtype* bottom_label = bottom[1]->cpu_data();
+   const int dim = bottom[0]->count() / outer_num_;
+   const int num_labels = bottom[0]->shape(label_axis_);
+-  vector<Dtype> maxval(top_k_+1);
+-  vector<int> max_id(top_k_+1);
+   if (top.size() > 1) {
+     caffe_set(nums_buffer_.count(), Dtype(0), nums_buffer_.mutable_cpu_data());
+     caffe_set(top[1]->count(), Dtype(0), top[1]->mutable_cpu_data());
+@@ -66,32 +64,29 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+       if (has_ignore_label_ && label_value == ignore_label_) {
+         continue;
+       }
+-      if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
+       DCHECK_GE(label_value, 0);
+       DCHECK_LT(label_value, num_labels);
++      if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
++      const Dtype prob_of_true_class = bottom_data[i * dim
++                                                   + label_value * inner_num_
++                                                   + j];
++      int num_better_predictions = -1;  // true_class also counts as "better"
+       // Top-k accuracy
+-      std::vector<std::pair<Dtype, int> > bottom_data_vector;
+-      for (int k = 0; k < num_labels; ++k) {
+-        bottom_data_vector.push_back(std::make_pair(
+-            bottom_data[i * dim + k * inner_num_ + j], k));
++      for (int k = 0; k < num_labels && num_better_predictions < top_k_; ++k) {
++        num_better_predictions +=
++          (bottom_data[i * dim + k * inner_num_ + j] >= prob_of_true_class);
+       }
+-      std::partial_sort(
+-          bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
+-          bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
+-      // check if true label is in top k predictions
+-      for (int k = 0; k < top_k_; k++) {
+-        if (bottom_data_vector[k].second == label_value) {
+-          ++accuracy;
+-          if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
+-          break;
+-        }
++      // check if there are less than top_k_ predictions
++      if (num_better_predictions < top_k_) {
++        ++accuracy;
++        if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
+       }
+       ++count;
+     }
+   }
+ 
+   // LOG(INFO) << "Accuracy: " << accuracy;
+-  top[0]->mutable_cpu_data()[0] = accuracy / count;
++  top[0]->mutable_cpu_data()[0] = (count == 0) ? 0 : (accuracy / count);
+   if (top.size() > 1) {
+     for (int i = 0; i < top[1]->count(); ++i) {
+       top[1]->mutable_cpu_data()[i] =
+@@ -102,6 +97,10 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+   // Accuracy layer should not be used as a loss function.
+ }
+ 
++#ifdef CPU_ONLY
++STUB_GPU(AccuracyLayer);
++#endif
++
+ INSTANTIATE_CLASS(AccuracyLayer);
+ REGISTER_LAYER_CLASS(Accuracy);
+ 
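Note (a NumPy rendition of the new counting logic, illustration only): instead of partially sorting all scores, the layer now counts how many class scores tie or beat the true class's score; the prediction is correct when fewer than `top_k` do:

    import numpy as np

    def topk_accuracy(scores, labels, top_k=1):
        # scores: (N, K) class scores; labels: (N,) integer labels
        true_scores = scores[np.arange(len(labels)), labels]
        # the true class ties with itself, hence the -1 (as in the C++ loop)
        num_better = (scores >= true_scores[:, None]).sum(axis=1) - 1
        correct = (num_better < top_k).sum()
        return correct / len(labels) if len(labels) else 0.0  # divide-by-zero guard

    scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
    print(topk_accuracy(scores, np.array([1, 2]), top_k=2))   # 0.5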
+diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu
+new file mode 100644
+index 00000000..a8cff936
+--- /dev/null
++++ b/src/caffe/layers/accuracy_layer.cu
+@@ -0,0 +1,147 @@
++#include <vector>
++
++#include "caffe/layers/accuracy_layer.hpp"
++#include "caffe/util/math_functions.hpp"
++
++
++namespace caffe {
++
++template <typename Dtype>
++__global__ void AccuracyForwardGPU(const int nthreads,
++          const Dtype* bottom_data, const Dtype* label, Dtype* acc,
++          const int num, const int dim, const int spatial_dim,
++          const int num_labels, const int top_k,
++          const bool has_ignore_label_, const int ignore_label_,
++          Dtype* counts) {
++  CUDA_KERNEL_LOOP(index, nthreads) {
++    const int n = index / spatial_dim;
++    const int s = index % spatial_dim;
++    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
++    const Dtype prob_of_true_class = bottom_data[n * dim
++                                                 + label_value * spatial_dim
++                                                 + s];
++    int num_better_predictions = -1;  // true_class also counts as "better"
++    if (has_ignore_label_ && label_value == ignore_label_) {
++      acc[index] = 0;
++      counts[index] = 0;
++    } else {
++      for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) {
++        num_better_predictions +=
++          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
++      }
++      acc[index] = (num_better_predictions < top_k);
++      counts[index] = 1;
++    }
++  }
++}
++
++template <typename Dtype>
++__global__ void AccuracyForwardWithPerClassGPU(const int nthreads,
++          const Dtype* bottom_data, const Dtype* label,
++          Dtype* acc, Dtype* counts,
++          const int num, const int dim, const int spatial_dim,
++          const int num_labels, const int top_k,
++          const bool has_ignore_label_, const int ignore_label_) {
++  CUDA_KERNEL_LOOP(index, nthreads) {
++    const int n = index / spatial_dim;
++    const int s = index % spatial_dim;
++    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
++    const Dtype prob_of_true_class = bottom_data[n * dim
++                                                 + label_value * spatial_dim
++                                                 + s];
++    if (has_ignore_label_ && label_value == ignore_label_) {
++      // nothing to be done.
++    } else {
++      int num_better_predictions = -1;  // true_class also counts as "better"
++      for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) {
++        num_better_predictions +=
++          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
++      }
++      acc[label_value*nthreads + index] += (num_better_predictions < top_k);
++      counts[label_value*nthreads + index] = 1;
++    }
++  }
++}
++
++template <typename Dtype>
++void AccuracyLayer<Dtype>::Forward_gpu(
++    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
++  const Dtype* bottom_data = bottom[0]->gpu_data();
++  const Dtype* bottom_label = bottom[1]->gpu_data();
++  const int dim = bottom[0]->count() / outer_num_;
++  const int num_labels = bottom[0]->shape(label_axis_);
++  const int nthreads = outer_num_ * inner_num_;
++  // Since this memory is not used for anything,
++  // we use it here to avoid having to allocate new GPU
++  // memory to accumulate intermediate results in the kernel.
++  Dtype* acc_data = bottom[0]->mutable_gpu_diff();
++  if (top.size() == 1) {
++    // simple case - report only global accuracy.
++
++    // Similarly, this memory is never used elsewhere, and thus we can use it
++    // to avoid having to allocate additional GPU memory.
++    Dtype* counts = bottom[1]->mutable_gpu_diff();
++    // NOLINT_NEXT_LINE(whitespace/operators)
++    AccuracyForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
++        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
++        acc_data, outer_num_, dim, inner_num_, num_labels, top_k_,
++        has_ignore_label_, ignore_label_, counts);
++    Dtype acc;
++    caffe_gpu_asum(nthreads, acc_data, &acc);
++    Dtype valid_count;
++    caffe_gpu_asum(nthreads, counts, &valid_count);
++    if (valid_count > 0) {
++      top[0]->mutable_cpu_data()[0] = acc / valid_count;
++    } else {
++      top[0]->mutable_cpu_data()[0] = 0;
++    }
++  } else {
++    // need to report per-class accuracy as well
++
++    // allocate space for more detailed "counts"
++    nums_buffer_.ReshapeLike(*bottom[0]);
++    Dtype* counts = nums_buffer_.mutable_gpu_data();
++
++    caffe_gpu_set(bottom[0]->count(), Dtype(0), acc_data);
++    caffe_gpu_set(nums_buffer_.count(), Dtype(0), counts);
++
++    // NOLINT_NEXT_LINE(whitespace/operators)
++    AccuracyForwardWithPerClassGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
++        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
++        acc_data, counts, outer_num_, dim, inner_num_, num_labels, top_k_,
++        has_ignore_label_, ignore_label_);
++
++    // get the overall accuracy
++    Dtype acc;
++    caffe_gpu_asum(bottom[0]->count(), acc_data, &acc);
++    Dtype valid_count;
++    caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count);
++    if (valid_count > 0) {
++      top[0]->mutable_cpu_data()[0] = acc / valid_count;
++    } else {
++      top[0]->mutable_cpu_data()[0] = 0;
++    }
++
++    // get per-class accuracy
++    Dtype* per_class_acc = top[1]->mutable_cpu_data();
++    for (int l = 0; l < num_labels; l++) {
++      caffe_gpu_asum(nthreads, acc_data + l*nthreads, per_class_acc+l);
++      caffe_gpu_asum(nthreads, counts + l*nthreads, &valid_count);
++      if (valid_count > 0) {
++        per_class_acc[l] /= valid_count;
++      } else {
++        per_class_acc[l] = 0;
++      }
++    }
++  }
++}
++
++
++template <typename Dtype>
++void AccuracyLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
++    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
++  if (propagate_down[1]) {  NOT_IMPLEMENTED;  }
++}
++
++INSTANTIATE_LAYER_GPU_FUNCS(AccuracyLayer);
++}  // namespace caffe
+diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp
+index ef8c177c..65ea8f8b 100644
+--- a/src/caffe/layers/crop_layer.cpp
++++ b/src/caffe/layers/crop_layer.cpp
+@@ -40,8 +40,10 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+   const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis());
+ 
+   // Initialize offsets to 0 and the new shape to the current shape of the data.
+-  offsets = vector<int>(input_dim, 0);
+   vector<int> new_shape(bottom[0]->shape());
++  vector<int> offsets_shape(1, input_dim);
++  offsets.Reshape(offsets_shape);
++  int* offset_data = offsets.mutable_cpu_data();
+ 
+   // Determine crop offsets and the new shape post-crop.
+   for (int i = 0; i < input_dim; ++i) {
+@@ -63,15 +65,22 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+           << "size " << bottom[1]->shape(i) << " and offset " << crop_offset;
+     }
+     new_shape[i] = new_size;
+-    offsets[i] = crop_offset;
++    offset_data[i] = crop_offset;
+   }
+   top[0]->Reshape(new_shape);
++  // Compute strides
++  src_strides_.Reshape(offsets_shape);
++  dest_strides_.Reshape(offsets_shape);
++  for (int i = 0; i < input_dim; ++i) {
++    src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim);
++    dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim);
++  }
+ }
+ 
+ template <typename Dtype>
+ void CropLayer<Dtype>::crop_copy(const vector<Blob<Dtype>*>& bottom,
+              const vector<Blob<Dtype>*>& top,
+-             const vector<int>& offsets,
++             const int* offsets,
+              vector<int> indices,
+              int cur_dim,
+              const Dtype* src_data,
+@@ -115,7 +124,8 @@ void CropLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+   std::vector<int> indices(top[0]->num_axes(), 0);
+   const Dtype* bottom_data = bottom[0]->cpu_data();
+   Dtype* top_data = top[0]->mutable_cpu_data();
+-  crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
++  crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data,
++      true);
+ }
+ 
+ template <typename Dtype>
+@@ -127,7 +137,8 @@ void CropLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+   if (propagate_down[0]) {
+     caffe_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
+     std::vector<int> indices(top[0]->num_axes(), 0);
+-    crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false);
++    crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff,
++        bottom_diff, false);
+   }
+ }
+ 
+diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu
+index 677077cd..4ece9cd1 100644
+--- a/src/caffe/layers/crop_layer.cu
++++ b/src/caffe/layers/crop_layer.cu
+@@ -4,90 +4,63 @@
+ 
+ namespace caffe {
+ 
+-// Copy (one line per thread) from one array to another, with arbitrary
+-// strides in the last two dimensions.
++__device__ int compute_uncropped_index(
++    int index,
++    const int ndims,
++    const int* src_strides,
++    const int* dest_strides,
++    const int* offsets) {
++  int dest_index = index;
++  int src_index = 0;
++  for (int i = 0; i < ndims; ++i) {
++      int coord = dest_index / dest_strides[i];
++      dest_index -= coord * dest_strides[i];
++      src_index += src_strides[i] * (coord + offsets[i]);
++  }
++  return src_index;
++}
++
+ template <typename Dtype>
+-__global__ void copy_kernel(const int n, const int height, const int width,
+-    const int src_inner_stride,
+-    const int dest_inner_stride,
++__global__ void crop_kernel_forward(const int nthreads,
++    const int ndims,
++    const int* src_strides,
++    const int* dest_strides,
++    const int* offsets,
+     const Dtype* src, Dtype* dest) {
+-  CUDA_KERNEL_LOOP(index, n) {
+-    int src_start = index * src_inner_stride;
+-    int dest_start = index * dest_inner_stride;
+-    for (int i = 0; i < width; ++i) {
+-      dest[dest_start + i] = src[src_start + i];
+-    }
++  CUDA_KERNEL_LOOP(index, nthreads) {
++    int src_index = compute_uncropped_index(
++        index, ndims, src_strides, dest_strides, offsets);
++    dest[index] = src[src_index];
+   }
+ }
+ 
+ template <typename Dtype>
+-void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
+-             const vector<Blob<Dtype>*>& top,
+-             const vector<int>& offsets,
+-             vector<int> indices,
+-             int cur_dim,
+-             const Dtype* src_data,
+-             Dtype* dest_data,
+-             bool is_forward) {
+-  if (cur_dim + 2 < top[0]->num_axes()) {
+-    // We are not yet at the final dimension, call copy recursivley
+-    for (int i = 0; i < top[0]->shape(cur_dim); ++i) {
+-      indices[cur_dim] = i;
+-      crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1,
+-                src_data, dest_data, is_forward);
+-    }
+-  } else {
+-    // We are at the last two dimensions, which are stored continuously in
+-    // memory. With (N,C,H,W)
+-    //              (0,1,2,3) cur_dim   -> H
+-    //                        cur_dim+1 -> W
+-    const int lines = top[0]->shape(cur_dim);
+-    const int height = top[0]->shape(cur_dim);
+-    const int width = top[0]->shape(cur_dim+1);
+-    std::vector<int> ind_off(cur_dim+2, 0);
+-    for (int j = 0; j < cur_dim; ++j) {
+-        ind_off[j] = indices[j] + offsets[j];
+-    }
+-    ind_off[cur_dim] = offsets[cur_dim];
+-    ind_off[cur_dim+1] = offsets[cur_dim+1];
+-    // Compute copy strides
+-    const int src_inner_stride = bottom[0]->shape(cur_dim+1);
+-    const int dest_inner_stride = top[0]->shape(cur_dim+1);
+-
+-    if (is_forward) {
+-      const Dtype* bottom_data = bottom[0]->gpu_data() +
+-          bottom[0]->offset(ind_off);
+-      Dtype* top_data = top[0]->mutable_gpu_data() +
+-          top[0]->offset(indices);
+-      // NOLINT_NEXT_LINE(whitespace/operators)
+-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
+-          lines, height, width,
+-          src_inner_stride,
+-          dest_inner_stride,
+-          bottom_data, top_data);
+-
+-    } else {
+-      const Dtype* top_diff = top[0]->gpu_diff() +
+-          top[0]->offset(indices);
+-      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() +
+-          bottom[0]->offset(ind_off);
+-      // NOLINT_NEXT_LINE(whitespace/operators)
+-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
+-          lines, height, width,
+-          dest_inner_stride,
+-          src_inner_stride,
+-          top_diff, bottom_diff);
+-    }
++__global__ void crop_kernel_backward(const int nthreads,
++    const int ndims,
++    const int* src_strides,
++    const int* dest_strides,
++    const int* offsets,
++    Dtype* src, const Dtype* dest) {
++  CUDA_KERNEL_LOOP(index, nthreads) {
++    int src_index = compute_uncropped_index(
++        index, ndims, src_strides, dest_strides, offsets);
++    src[src_index] = dest[index];
+   }
+ }
+ 
+ template <typename Dtype>
+ void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+     const vector<Blob<Dtype>*>& top) {
+-  std::vector<int> indices(top[0]->num_axes(), 0);
+   const Dtype* bottom_data = bottom[0]->gpu_data();
+   Dtype* top_data = top[0]->mutable_gpu_data();
+-  crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
++  int n = top[0]->count();
++  // NOLINT_NEXT_LINE(whitespace/operators)
++  crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
++      bottom[0]->num_axes(),
++      src_strides_.gpu_data(),
++      dest_strides_.gpu_data(),
++      offsets.gpu_data(),
++      bottom_data, top_data);
+ }
+ 
+ template <typename Dtype>
+@@ -95,12 +68,17 @@ void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+   const Dtype* top_diff = top[0]->gpu_diff();
+   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
++  int n = top[0]->count();
+ 
+   if (propagate_down[0]) {
+     caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
+-    std::vector<int> indices(top[0]->num_axes(), 0);
+-    crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff,
+-                  false);
++    // NOLINT_NEXT_LINE(whitespace/operators)
++    crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
++        bottom[0]->num_axes(),
++        src_strides_.gpu_data(),
++        dest_strides_.gpu_data(),
++        offsets.gpu_data(),
++        bottom_diff, top_diff);
+   }
+ }
+ 
+diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp
+index 6fe808bd..e5cc9d5e 100644
+--- a/src/caffe/test/test_accuracy_layer.cpp
++++ b/src/caffe/test/test_accuracy_layer.cpp
+@@ -13,8 +13,10 @@
+ 
+ namespace caffe {
+ 
+-template <typename Dtype>
+-class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
++template <typename TypeParam>
++class AccuracyLayerTest : public MultiDeviceTest<TypeParam> {
++  typedef typename TypeParam::Dtype Dtype;
++
+  protected:
+   AccuracyLayerTest()
+       : blob_bottom_data_(new Blob<Dtype>()),
+@@ -69,11 +71,12 @@ class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
+   int top_k_;
+ };
+ 
+-TYPED_TEST_CASE(AccuracyLayerTest, TestDtypes);
++TYPED_TEST_CASE(AccuracyLayerTest, TestDtypesAndDevices);
+ 
+ TYPED_TEST(AccuracyLayerTest, TestSetup) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+   EXPECT_EQ(this->blob_top_->num(), 1);
+   EXPECT_EQ(this->blob_top_->channels(), 1);
+@@ -82,11 +85,12 @@ TYPED_TEST(AccuracyLayerTest, TestSetup) {
+ }
+ 
+ TYPED_TEST(AccuracyLayerTest, TestSetupTopK) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+   AccuracyParameter* accuracy_param =
+       layer_param.mutable_accuracy_param();
+   accuracy_param->set_top_k(5);
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+   EXPECT_EQ(this->blob_top_->num(), 1);
+   EXPECT_EQ(this->blob_top_->channels(), 1);
+@@ -95,8 +99,9 @@ TYPED_TEST(AccuracyLayerTest, TestSetupTopK) {
+ }
+ 
+ TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+   EXPECT_EQ(this->blob_top_->num(), 1);
+   EXPECT_EQ(this->blob_top_->channels(), 1);
+@@ -108,33 +113,39 @@ TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) {
+   EXPECT_EQ(this->blob_top_per_class_->width(), 1);
+ }
+ 
+-TYPED_TEST(AccuracyLayerTest, TestForwardCPU) {
++TYPED_TEST(AccuracyLayerTest, TestForward) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+-
+-  TypeParam max_value;
+-  int max_id;
+-  int num_correct_labels = 0;
+-  for (int i = 0; i < 100; ++i) {
+-    max_value = -FLT_MAX;
+-    max_id = 0;
+-    for (int j = 0; j < 10; ++j) {
+-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+-        max_id = j;
++
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
++
++    Dtype max_value;
++    int max_id;
++    int num_correct_labels = 0;
++    for (int i = 0; i < 100; ++i) {
++      max_value = -FLT_MAX;
++      max_id = 0;
++      for (int j = 0; j < 10; ++j) {
++        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
++          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
++          max_id = j;
++        }
++      }
++      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        ++num_correct_labels;
+       }
+     }
+-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      ++num_correct_labels;
+-    }
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / Dtype(100.0), 1e-4);
+   }
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / 100.0, 1e-4);
+ }
+ 
+ TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) {
++  typedef typename TypeParam::Dtype Dtype;
+   this->blob_bottom_data_->Reshape(2, 10, 4, 5);
+   vector<int> label_shape(3);
+   label_shape[0] = 2; label_shape[1] = 4; label_shape[2] = 5;
+@@ -142,195 +153,218 @@ TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) {
+   this->FillBottoms();
+   LayerParameter layer_param;
+   layer_param.mutable_accuracy_param()->set_axis(1);
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+-
+-  TypeParam max_value;
+-  const int num_labels = this->blob_bottom_label_->count();
+-  int max_id;
+-  int num_correct_labels = 0;
+-  vector<int> label_offset(3);
+-  for (int n = 0; n < this->blob_bottom_data_->num(); ++n) {
+-    for (int h = 0; h < this->blob_bottom_data_->height(); ++h) {
+-      for (int w = 0; w < this->blob_bottom_data_->width(); ++w) {
+-        max_value = -FLT_MAX;
+-        max_id = 0;
+-        for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) {
+-          const TypeParam pred_value =
+-              this->blob_bottom_data_->data_at(n, c, h, w);
+-          if (pred_value > max_value) {
+-            max_value = pred_value;
+-            max_id = c;
++
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
++
++    Dtype max_value;
++    const int num_labels = this->blob_bottom_label_->count();
++    int max_id;
++    int num_correct_labels = 0;
++    vector<int> label_offset(3);
++    for (int n = 0; n < this->blob_bottom_data_->num(); ++n) {
++      for (int h = 0; h < this->blob_bottom_data_->height(); ++h) {
++        for (int w = 0; w < this->blob_bottom_data_->width(); ++w) {
++          max_value = -FLT_MAX;
++          max_id = 0;
++          for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) {
++            const Dtype pred_value =
++                this->blob_bottom_data_->data_at(n, c, h, w);
++            if (pred_value > max_value) {
++              max_value = pred_value;
++              max_id = c;
++            }
++          }
++          label_offset[0] = n; label_offset[1] = h; label_offset[2] = w;
++          const int correct_label =
++              static_cast<int>(this->blob_bottom_label_->data_at(label_offset));
++          if (max_id == correct_label) {
++            ++num_correct_labels;
+           }
+-        }
+-        label_offset[0] = n; label_offset[1] = h; label_offset[2] = w;
+-        const int correct_label =
+-            static_cast<int>(this->blob_bottom_label_->data_at(label_offset));
+-        if (max_id == correct_label) {
+-          ++num_correct_labels;
+         }
+       }
+     }
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / Dtype(num_labels), 1e-4);
+   }
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / TypeParam(num_labels), 1e-4);
+ }
+ 
+ TYPED_TEST(AccuracyLayerTest, TestForwardIgnoreLabel) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  const TypeParam kIgnoreLabelValue = -1;
++  const Dtype kIgnoreLabelValue = -1;
+   layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   // Manually set some labels to the ignore label value (-1).
+   this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue;
+   this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue;
+   this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue;
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+-
+-  TypeParam max_value;
+-  int max_id;
+-  int num_correct_labels = 0;
+-  int count = 0;
+-  for (int i = 0; i < 100; ++i) {
+-    if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      continue;
+-    }
+-    ++count;
+-    max_value = -FLT_MAX;
+-    max_id = 0;
+-    for (int j = 0; j < 10; ++j) {
+-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+-        max_id = j;
++
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
++
++    Dtype max_value;
++    int max_id;
++    int num_correct_labels = 0;
++    int count = 0;
++    for (int i = 0; i < 100; ++i) {
++      if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        continue;
++      }
++      ++count;
++      max_value = -FLT_MAX;
++      max_id = 0;
++      for (int j = 0; j < 10; ++j) {
++        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
++          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
++          max_id = j;
++        }
++      }
++      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        ++num_correct_labels;
+       }
+     }
+-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      ++num_correct_labels;
+-    }
++    EXPECT_EQ(count, 97);  // We set 3 out of 100 labels to kIgnoreLabelValue.
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / Dtype(count), 1e-4);
+   }
+-  EXPECT_EQ(count, 97);  // We set 3 out of 100 labels to kIgnoreLabelValue.
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / TypeParam(count), 1e-4);
+ }
+ 
+-TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) {
++TYPED_TEST(AccuracyLayerTest, TestForwardTopK) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+   AccuracyParameter* accuracy_param = layer_param.mutable_accuracy_param();
+   accuracy_param->set_top_k(this->top_k_);
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+-
+-  TypeParam current_value;
+-  int current_rank;
+-  int num_correct_labels = 0;
+-  for (int i = 0; i < 100; ++i) {
+-    for (int j = 0; j < 10; ++j) {
+-      current_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+-      current_rank = 0;
+-      for (int k = 0; k < 10; ++k) {
+-        if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) {
+-          ++current_rank;
++
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
++
++    Dtype current_value;
++    int current_rank;
++    int num_correct_labels = 0;
++    for (int i = 0; i < 100; ++i) {
++      for (int j = 0; j < 10; ++j) {
++        current_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
++        current_rank = 0;
++        for (int k = 0; k < 10; ++k) {
++          if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) {
++            ++current_rank;
++          }
++        }
++        if (current_rank < this->top_k_ &&
++            j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++          ++num_correct_labels;
+         }
+-      }
+-      if (current_rank < this->top_k_ &&
+-          j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-        ++num_correct_labels;
+       }
+     }
+-  }
+ 
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / 100.0, 1e-4);
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / Dtype(100.0), 1e-4);
++  }
+ }
+ 
+-TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) {
++TYPED_TEST(AccuracyLayerTest, TestForwardPerClass) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+-
+-  TypeParam max_value;
+-  int max_id;
+-  int num_correct_labels = 0;
+-  const int num_class = this->blob_top_per_class_->num();
+-  vector<int> correct_per_class(num_class, 0);
+-  vector<int> num_per_class(num_class, 0);
+-  for (int i = 0; i < 100; ++i) {
+-    max_value = -FLT_MAX;
+-    max_id = 0;
+-    for (int j = 0; j < 10; ++j) {
+-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+-        max_id = j;
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
++
++    Dtype max_value;
++    int max_id;
++    int num_correct_labels = 0;
++    const int num_class = this->blob_top_per_class_->num();
++    vector<int> correct_per_class(num_class, 0);
++    vector<int> num_per_class(num_class, 0);
++    for (int i = 0; i < 100; ++i) {
++      max_value = -FLT_MAX;
++      max_id = 0;
++      for (int j = 0; j < 10; ++j) {
++        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
++          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
++          max_id = j;
++        }
++      }
++      ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
++      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        ++num_correct_labels;
++        ++correct_per_class[max_id];
+       }
+     }
+-    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      ++num_correct_labels;
+-      ++correct_per_class[max_id];
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / 100.0, 1e-4);
++    for (int i = 0; i < num_class; ++i) {
++      Dtype accuracy_per_class = (num_per_class[i] > 0 ?
++         static_cast<Dtype>(correct_per_class[i]) / num_per_class[i] : 0);
++      EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
++                  accuracy_per_class, 1e-4);
+     }
+   }
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / 100.0, 1e-4);
+-  for (int i = 0; i < num_class; ++i) {
+-    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
+-       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
+-    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+-                accuracy_per_class, 1e-4);
+-  }
+ }
+ 
+ 
+-TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) {
++TYPED_TEST(AccuracyLayerTest, TestForwardPerClassWithIgnoreLabel) {
++  typedef typename TypeParam::Dtype Dtype;
+   LayerParameter layer_param;
+-  const TypeParam kIgnoreLabelValue = -1;
++  const Dtype kIgnoreLabelValue = -1;
+   layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
+-  AccuracyLayer<TypeParam> layer(layer_param);
++  AccuracyLayer<Dtype> layer(layer_param);
+   // Manually set some labels to the ignore label value (-1).
+   this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue;
+   this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue;
+   this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue;
+   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+-  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+-
+-  TypeParam max_value;
+-  int max_id;
+-  int num_correct_labels = 0;
+-  const int num_class = this->blob_top_per_class_->num();
+-  vector<int> correct_per_class(num_class, 0);
+-  vector<int> num_per_class(num_class, 0);
+-  int count = 0;
+-  for (int i = 0; i < 100; ++i) {
+-    if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      continue;
+-    }
+-    ++count;
+-    max_value = -FLT_MAX;
+-    max_id = 0;
+-    for (int j = 0; j < 10; ++j) {
+-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+-        max_id = j;
++
++  // repeat the forward
++  for (int iter = 0; iter < 3; iter++) {
++    layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
++
++    Dtype max_value;
++    int max_id;
++    int num_correct_labels = 0;
++    const int num_class = this->blob_top_per_class_->num();
++    vector<int> correct_per_class(num_class, 0);
++    vector<int> num_per_class(num_class, 0);
++    int count = 0;
++    for (int i = 0; i < 100; ++i) {
++      if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        continue;
++      }
++      ++count;
++      max_value = -FLT_MAX;
++      max_id = 0;
++      for (int j = 0; j < 10; ++j) {
++        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
++          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
++          max_id = j;
++        }
++      }
++      ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
++      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
++        ++num_correct_labels;
++        ++correct_per_class[max_id];
+       }
+     }
+-    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+-      ++num_correct_labels;
+-      ++correct_per_class[max_id];
++    EXPECT_EQ(count, 97);
++    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
++                num_correct_labels / Dtype(count), 1e-4);
++    for (int i = 0; i < 10; ++i) {
++      Dtype accuracy_per_class = (num_per_class[i] > 0 ?
++         static_cast<Dtype>(correct_per_class[i]) / num_per_class[i] : 0);
++      EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
++                  accuracy_per_class, 1e-4);
+     }
+   }
+-  EXPECT_EQ(count, 97);
+-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+-              num_correct_labels / TypeParam(count), 1e-4);
+-  for (int i = 0; i < 10; ++i) {
+-    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
+-       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
+-    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+-                accuracy_per_class, 1e-4);
+-  }
+ }
+ 
+ }  // namespace caffe
+diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp
+index 26e9b217..f84d707b 100644
+--- a/src/caffe/test/test_filler.cpp
++++ b/src/caffe/test/test_filler.cpp
+@@ -29,7 +29,7 @@ TYPED_TEST(ConstantFillerTest, TestFill) {
+   const int count = this->blob_->count();
+   const TypeParam* data = this->blob_->cpu_data();
+   for (int i = 0; i < count; ++i) {
+-    EXPECT_GE(data[i], this->filler_param_.value());
++    EXPECT_EQ(data[i], this->filler_param_.value());
+   }
+ }
+ 
+@@ -238,4 +238,45 @@ TYPED_TEST(MSRAFillerTest, TestFillAverage) {
+   this->test_params(FillerParameter_VarianceNorm_AVERAGE, n);
+ }
+ 
++template <typename Dtype>
++class BilinearFillerTest : public ::testing::Test {
++ protected:
++  BilinearFillerTest() : filler_param_() {}
++  virtual void test_params(const int n) {
++    this->blob_ = new Blob<Dtype>(1000, 2, n, n);
++    this->filler_.reset(new BilinearFiller<Dtype>(this->filler_param_));
++    this->filler_->Fill(blob_);
++    EXPECT_TRUE(this->blob_);
++    const int outer_num = this->blob_->count(0, 2);
++    const int inner_num = this->blob_->count(2, 4);
++    const Dtype* data = this->blob_->cpu_data();
++    int f = ceil(this->blob_->width() / 2.);
++    Dtype c = (this->blob_->width() - 1) / (2. * f);
++    for (int i = 0; i < outer_num; ++i) {
++      for (int j = 0; j < inner_num; ++j) {
++        Dtype x = j % this->blob_->width();
++        Dtype y = (j / this->blob_->width()) % this->blob_->height();
++        Dtype expected_value = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
++        const Dtype actual_value = data[i * inner_num + j];
++        EXPECT_NEAR(expected_value, actual_value, 0.01);
++      }
++    }
++  }
++  virtual ~BilinearFillerTest() { delete blob_; }
++  Blob<Dtype>* blob_;
++  FillerParameter filler_param_;
++  shared_ptr<BilinearFiller<Dtype> > filler_;
++};
++
++TYPED_TEST_CASE(BilinearFillerTest, TestDtypes);
++
++TYPED_TEST(BilinearFillerTest, TestFillOdd) {
++  const int n = 7;
++  this->test_params(n);
++}
++TYPED_TEST(BilinearFillerTest, TestFillEven) {
++  const int n = 6;
++  this->test_params(n);
++}
++
+ }  // namespace caffe

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/caffe.git
