[Pkg-ceph-commits] [ceph] 15/59: d/p/0001-CoreLocalArray-class.patch, d/p/0002-core-local-array-type-conversions.patch, d/p/0003-Core-local-statistics.patch: Cherry pick rocksdb commits to resolve compatibility with gcc-6 on i386.

James Downing Page jamespage at moszumanska.debian.org
Thu Feb 1 15:27:51 UTC 2018


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch ubuntu/artful
in repository ceph.

commit 54747fad2260bdf30c5d46b058e7285c21ac85d9
Author: James Page <james.page at ubuntu.com>
Date:   Wed Jun 14 10:55:23 2017 +0000

    d/p/0001-CoreLocalArray-class.patch, d/p/0002-core-local-array-type-conversions.patch, d/p/0003-Core-local-statistics.patch: Cherry pick rocksdb commits to resolve compatibility with gcc-6 on i386.
---
 debian/changelog                                   |   4 +
 debian/patches/0001-CoreLocalArray-class.patch     | 246 +++++++++++
 .../0002-core-local-array-type-conversions.patch   |  66 +++
 debian/patches/0003-Core-local-statistics.patch    | 454 +++++++++++++++++++++
 debian/patches/series                              |   3 +
 debian/rules                                       |   8 +
 6 files changed, 781 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index c16dc5a..cf212e3 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -34,6 +34,10 @@ ceph (12.0.3-0ubuntu1) UNRELEASED; urgency=medium
     - d/p/i386-build-fixes.patch: Misc patches to ensure that only
       a minimal set of SIMD instructions are used to i386, inline
       with previous autotools configuration.
+    - d/p/0001-CoreLocalArray-class.patch,
+      d/p/0002-core-local-array-type-conversions.patch,
+      d/p/0003-Core-local-statistics.patch: Cherry pick rocksdb commits
+      to resolve compatibility with gcc-6 on i386.
 
  -- James Page <james.page at ubuntu.com>  Fri, 26 May 2017 09:42:42 +0000
 
diff --git a/debian/patches/0001-CoreLocalArray-class.patch b/debian/patches/0001-CoreLocalArray-class.patch
new file mode 100644
index 0000000..4695a88
--- /dev/null
+++ b/debian/patches/0001-CoreLocalArray-class.patch
@@ -0,0 +1,246 @@
+From 8b2811f9871029023cd6b798f753743df9be36e7 Mon Sep 17 00:00:00 2001
+From: Andrew Kryczka <andrewkr at fb.com>
+Date: Wed, 10 May 2017 18:16:31 -0700
+Subject: [PATCH 1/3] CoreLocalArray class
+
+Summary:
+Moved the logic for core-local array out of ConcurrentArena and into a separate class because I want to reuse it for core-local stats.
+Closes https://github.com/facebook/rocksdb/pull/2256
+
+Differential Revision: D5011518
+
+Pulled By: ajkr
+
+fbshipit-source-id: a75a7b8f7b7a42fd6273489ada405f14c6be196a
+(cherry picked from commit cda5fde2d96624df38afc7f02b6b3e699648c62d)
+---
+ util/concurrent_arena.cc | 25 ++++----------
+ util/concurrent_arena.h  | 25 +++++++-------
+ util/core_local.h        | 84 ++++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 103 insertions(+), 31 deletions(-)
+ create mode 100644 util/core_local.h
+
+diff --git a/src/rocksdb/util/concurrent_arena.cc b/src/rocksdb/util/concurrent_arena.cc
+index df87011..a0feb93 100644
+--- a/src/rocksdb/util/concurrent_arena.cc
++++ b/src/rocksdb/util/concurrent_arena.cc
+@@ -16,35 +16,24 @@
+ namespace rocksdb {
+ 
+ #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+-__thread uint32_t ConcurrentArena::tls_cpuid = 0;
++__thread size_t ConcurrentArena::tls_cpuid = 0;
+ #endif
+ 
+ ConcurrentArena::ConcurrentArena(size_t block_size, size_t huge_page_size)
+-    : shard_block_size_(block_size / 8), arena_(block_size, huge_page_size) {
+-  // find a power of two >= num_cpus and >= 8
+-  auto num_cpus = std::thread::hardware_concurrency();
+-  index_mask_ = 7;
+-  while (index_mask_ + 1 < num_cpus) {
+-    index_mask_ = index_mask_ * 2 + 1;
+-  }
+-
+-  shards_.reset(new Shard[index_mask_ + 1]);
++    : shard_block_size_(block_size / 8),
++      shards_(),
++      arena_(block_size, huge_page_size) {
+   Fixup();
+ }
+ 
+ ConcurrentArena::Shard* ConcurrentArena::Repick() {
+-  int cpuid = port::PhysicalCoreID();
+-  if (UNLIKELY(cpuid < 0)) {
+-    // cpu id unavailable, just pick randomly
+-    cpuid =
+-        Random::GetTLSInstance()->Uniform(static_cast<int>(index_mask_) + 1);
+-  }
++  auto shard_and_index = shards_.AccessElementAndIndex();
+ #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+   // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we
+   // have repicked
+-  tls_cpuid = cpuid | (static_cast<int>(index_mask_) + 1);
++  tls_cpuid = shard_and_index.second | shards_.Size();
+ #endif
+-  return &shards_[cpuid & index_mask_];
++  return shard_and_index.first;
+ }
+ 
+ }  // namespace rocksdb
+diff --git a/src/rocksdb/util/concurrent_arena.h b/src/rocksdb/util/concurrent_arena.h
+index 3a20bb6..a6db1e9 100644
+--- a/src/rocksdb/util/concurrent_arena.h
++++ b/src/rocksdb/util/concurrent_arena.h
+@@ -14,6 +14,7 @@
+ #include "port/likely.h"
+ #include "util/allocator.h"
+ #include "util/arena.h"
++#include "util/core_local.h"
+ #include "util/mutexlock.h"
+ #include "util/thread_local.h"
+ 
+@@ -63,9 +64,7 @@ class ConcurrentArena : public Allocator {
+ 
+   size_t ApproximateMemoryUsage() const {
+     std::unique_lock<SpinMutex> lock(arena_mutex_, std::defer_lock);
+-    if (index_mask_ != 0) {
+-      lock.lock();
+-    }
++    lock.lock();
+     return arena_.ApproximateMemoryUsage() - ShardAllocatedAndUnused();
+   }
+ 
+@@ -95,18 +94,16 @@ class ConcurrentArena : public Allocator {
+   };
+ 
+ #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL
+-  static __thread uint32_t tls_cpuid;
++  static __thread size_t tls_cpuid;
+ #else
+-  enum ZeroFirstEnum : uint32_t { tls_cpuid = 0 };
++  enum ZeroFirstEnum : size_t { tls_cpuid = 0 };
+ #endif
+ 
+   char padding0[56] ROCKSDB_FIELD_UNUSED;
+ 
+   size_t shard_block_size_;
+ 
+-  // shards_[i & index_mask_] is valid
+-  size_t index_mask_;
+-  std::unique_ptr<Shard[]> shards_;
++  CoreLocalArray<Shard> shards_;
+ 
+   Arena arena_;
+   mutable SpinMutex arena_mutex_;
+@@ -120,15 +117,16 @@ class ConcurrentArena : public Allocator {
+ 
+   size_t ShardAllocatedAndUnused() const {
+     size_t total = 0;
+-    for (size_t i = 0; i <= index_mask_; ++i) {
+-      total += shards_[i].allocated_and_unused_.load(std::memory_order_relaxed);
++    for (size_t i = 0; i < shards_.Size(); ++i) {
++      total += shards_.AccessAtCore(i)->allocated_and_unused_.load(
++          std::memory_order_relaxed);
+     }
+     return total;
+   }
+ 
+   template <typename Func>
+   char* AllocateImpl(size_t bytes, bool force_arena, const Func& func) {
+-    uint32_t cpu;
++    size_t cpu;
+ 
+     // Go directly to the arena if the allocation is too large, or if
+     // we've never needed to Repick() and the arena mutex is available
+@@ -137,7 +135,8 @@ class ConcurrentArena : public Allocator {
+     std::unique_lock<SpinMutex> arena_lock(arena_mutex_, std::defer_lock);
+     if (bytes > shard_block_size_ / 4 || force_arena ||
+         ((cpu = tls_cpuid) == 0 &&
+-         !shards_[0].allocated_and_unused_.load(std::memory_order_relaxed) &&
++         !shards_.AccessAtCore(0)->allocated_and_unused_.load(
++             std::memory_order_relaxed) &&
+          arena_lock.try_lock())) {
+       if (!arena_lock.owns_lock()) {
+         arena_lock.lock();
+@@ -148,7 +147,7 @@ class ConcurrentArena : public Allocator {
+     }
+ 
+     // pick a shard from which to allocate
+-    Shard* s = &shards_[cpu & index_mask_];
++    Shard* s = shards_.AccessAtCore(cpu & (shards_.Size() - 1));
+     if (!s->mutex.try_lock()) {
+       s = Repick();
+       s->mutex.lock();
+diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
+new file mode 100644
+index 0000000..806584d
+--- /dev/null
++++ b/src/rocksdb/util/core_local.h
+@@ -0,0 +1,84 @@
++//  Copyright (c) 2017-present, Facebook, Inc.  All rights reserved.
++//  This source code is licensed under the BSD-style license found in the
++//  LICENSE file in the root directory of this source tree. An additional grant
++//  of patent rights can be found in the PATENTS file in the same directory.
++//  This source code is also licensed under the GPLv2 license found in the
++//  COPYING file in the root directory of this source tree.
++
++#pragma once
++
++#include "port/likely.h"
++#include "port/port.h"
++#include "util/random.h"
++
++#include <cstddef>
++#include <thread>
++#include <vector>
++
++namespace rocksdb {
++
++// An array of core-local values. Ideally the value type, T, is cache aligned to
++// prevent false sharing.
++template<typename T>
++class CoreLocalArray {
++ public:
++  CoreLocalArray();
++
++  size_t Size() const;
++  // returns pointer to the element corresponding to the core that the thread
++  // currently runs on.
++  T* Access() const;
++  // same as above, but also returns the core index, which the client can cache
++  // to reduce how often core ID needs to be retrieved. Only do this if some
++  // inaccuracy is tolerable, as the thread may migrate to a different core.
++  std::pair<T*, size_t> AccessElementAndIndex() const;
++  // returns pointer to element for the specified core index. This can be used,
++  // e.g., for aggregation, or if the client caches core index.
++  T* AccessAtCore(size_t core_idx) const;
++
++ private:
++  std::unique_ptr<T[]> data_;
++  size_t size_shift_;
++};
++
++template<typename T>
++CoreLocalArray<T>::CoreLocalArray() {
++  unsigned int num_cpus = std::thread::hardware_concurrency();
++  // find a power of two >= num_cpus and >= 8
++  size_shift_ = 3;
++  while (1u << size_shift_ < num_cpus) {
++    ++size_shift_;
++  }
++  data_.reset(new T[1 << size_shift_]);
++}
++
++template<typename T>
++size_t CoreLocalArray<T>::Size() const {
++  return 1u << size_shift_;
++}
++
++template<typename T>
++T* CoreLocalArray<T>::Access() const {
++  return AccessElementAndIndex().first;
++}
++
++template<typename T>
++std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
++  int cpuid = port::PhysicalCoreID();
++  size_t core_idx;
++  if (UNLIKELY(cpuid < 0)) {
++    // cpu id unavailable, just pick randomly
++    core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_);
++  } else {
++    core_idx = static_cast<size_t>(cpuid & ((1 << size_shift_) - 1));
++  }
++  return {AccessAtCore(core_idx), core_idx};
++}
++
++template<typename T>
++T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
++  assert(core_idx < 1u << size_shift_);
++  return &data_[core_idx];
++}
++
++}  // namespace rocksdb
+-- 
+2.7.4
+
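
For readers following the first patch: CoreLocalArray is the piece that replaces ConcurrentArena's hand-rolled index_mask_/shards_ pair. A minimal, self-contained sketch of the same pattern (illustrative only; sched_getcpu() stands in for rocksdb's port::PhysicalCoreID(), and the slot count is rounded up to a power of two >= 8 exactly as the patch's constructor does):

    // Sketch only; not part of the patch. Assumes Linux for sched_getcpu().
    #include <cassert>
    #include <cstddef>
    #include <memory>
    #include <random>
    #include <thread>
    #include <utility>

    #include <sched.h>

    template <typename T>
    class CoreLocal {
     public:
      CoreLocal() {
        int num_cpus = static_cast<int>(std::thread::hardware_concurrency());
        size_shift_ = 3;  // power of two >= num_cpus and >= 8, as in the patch
        while ((1 << size_shift_) < num_cpus) {
          ++size_shift_;
        }
        data_.reset(new T[static_cast<std::size_t>(1) << size_shift_]());
      }

      std::size_t Size() const {
        return static_cast<std::size_t>(1) << size_shift_;
      }

      // Element for the core this thread is running on, plus its index
      // (callers may cache the index if slight staleness is tolerable).
      std::pair<T*, std::size_t> AccessElementAndIndex() const {
        int cpuid = sched_getcpu();
        std::size_t idx;
        if (cpuid < 0) {
          // Core id unavailable: fall back to a random slot, as the patch does.
          thread_local std::mt19937 rng{std::random_device{}()};
          idx = rng() & (Size() - 1);
        } else {
          idx = static_cast<std::size_t>(cpuid) & (Size() - 1);
        }
        return {&data_[idx], idx};
      }

      T* AccessAtCore(std::size_t idx) const {
        assert(idx < Size());
        return &data_[idx];
      }

     private:
      std::unique_ptr<T[]> data_;
      int size_shift_;
    };

Sizing to a power of two lets both paths mask with Size() - 1 instead of taking a modulo, and a thread that migrates cores simply lands on another valid slot; correctness never depends on the core id staying current.
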
diff --git a/debian/patches/0002-core-local-array-type-conversions.patch b/debian/patches/0002-core-local-array-type-conversions.patch
new file mode 100644
index 0000000..7537bf1
--- /dev/null
+++ b/debian/patches/0002-core-local-array-type-conversions.patch
@@ -0,0 +1,66 @@
+From 26d95dc4fdd72bbc52ae21ff962e15c53a6ffbec Mon Sep 17 00:00:00 2001
+From: Andrew Kryczka <andrewkr at fb.com>
+Date: Fri, 12 May 2017 09:26:40 -0700
+Subject: [PATCH 2/3] core-local array type conversions
+
+Summary:
+try to clean up the type conversions and hope it passes on windows.
+
+one interesting thing I learned is that bitshift operations are special: in `x << y`, the result type depends only on the type of `x`, unlike most arithmetic operations where the result type depends on both operands' types.
+Closes https://github.com/facebook/rocksdb/pull/2277
+
+Differential Revision: D5050145
+
+Pulled By: ajkr
+
+fbshipit-source-id: f3309e77526ac9612c632bf93a62d99757af9a29
+(cherry picked from commit bbe9ee7dd4a542b191ace521ca13b4bdb063008b)
+---
+ util/core_local.h | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
+index 806584d..7515c54 100644
+--- a/src/rocksdb/util/core_local.h
++++ b/src/rocksdb/util/core_local.h
+@@ -38,23 +38,23 @@ class CoreLocalArray {
+ 
+  private:
+   std::unique_ptr<T[]> data_;
+-  size_t size_shift_;
++  int size_shift_;
+ };
+ 
+ template<typename T>
+ CoreLocalArray<T>::CoreLocalArray() {
+-  unsigned int num_cpus = std::thread::hardware_concurrency();
++  int num_cpus = static_cast<int>(std::thread::hardware_concurrency());
+   // find a power of two >= num_cpus and >= 8
+   size_shift_ = 3;
+-  while (1u << size_shift_ < num_cpus) {
++  while (1 << size_shift_ < num_cpus) {
+     ++size_shift_;
+   }
+-  data_.reset(new T[1 << size_shift_]);
++  data_.reset(new T[static_cast<size_t>(1) << size_shift_]);
+ }
+ 
+ template<typename T>
+ size_t CoreLocalArray<T>::Size() const {
+-  return 1u << size_shift_;
++  return static_cast<size_t>(1) << size_shift_;
+ }
+ 
+ template<typename T>
+@@ -77,7 +77,7 @@ std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+ 
+ template<typename T>
+ T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
+-  assert(core_idx < 1u << size_shift_);
++  assert(core_idx < static_cast<size_t>(1) << size_shift_);
+   return &data_[core_idx];
+ }
+ 
+-- 
+2.7.4
+
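
The "interesting thing" in the summary above is easy to verify: the usual arithmetic conversions that balance the operand types of x + y do not apply to x << y. Each shift operand is promoted on its own, and the result takes the (promoted) type of the left operand only. A small illustration (assuming a platform with 32-bit int and 64-bit size_t):

    // Illustrative only.
    #include <cstddef>
    #include <type_traits>

    int main() {
      int i = 1;
      std::size_t s = 40;

      // Addition converts both operands to a common type, so int + size_t
      // yields size_t...
      static_assert(std::is_same<decltype(i + s), std::size_t>::value, "");

      // ...but a shift's result type comes from the left operand alone;
      // the size_t on the right widens nothing.
      static_assert(std::is_same<decltype(i << s), int>::value, "");

      // Hence the patch's casts: with a 32-bit int, (1 << 40) is undefined
      // behavior, while the following is a well-defined 64-bit shift.
      std::size_t ok = static_cast<std::size_t>(1) << s;
      (void)ok;
      return 0;
    }

This is also why the patch can keep size_shift_ as a plain int: it only ever appears as a shift count, and the value being shifted carries the result type.
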
diff --git a/debian/patches/0003-Core-local-statistics.patch b/debian/patches/0003-Core-local-statistics.patch
new file mode 100644
index 0000000..80ef9a2
--- /dev/null
+++ b/debian/patches/0003-Core-local-statistics.patch
@@ -0,0 +1,454 @@
+From 30c2cdc833a62953f380580b1769fec63f770b21 Mon Sep 17 00:00:00 2001
+From: Andrew Kryczka <andrewkr at fb.com>
+Date: Tue, 23 May 2017 10:29:14 -0700
+Subject: [PATCH 3/3] Core-local statistics
+
+Summary:
+This diff changes `StatisticsImpl` from a thread-local approach to a core-local one. The goal is to perform faster aggregations, particularly for applications that have many threads. There should be no behavior change.
+Closes https://github.com/facebook/rocksdb/pull/2258
+
+Differential Revision: D5016258
+
+Pulled By: ajkr
+
+fbshipit-source-id: 7d4d165b4a91d8110f0409d113d1be91f22d31a9
+(cherry picked from commit ac39d6bec5b2c23a2c3fd0f0e61d468be4f3e803)
+---
+ HISTORY.md               |   4 ++
+ monitoring/statistics.cc | 129 +++++++++++++----------------------------------
+ monitoring/statistics.h  | 111 +++++++++++-----------------------------
+ util/core_local.h        |  21 ++++----
+ 4 files changed, 78 insertions(+), 187 deletions(-)
+
+diff --git a/src/rocksdb/HISTORY.md b/src/rocksdb/HISTORY.md
+index 7b51d37..4cde9e2 100644
+--- a/src/rocksdb/HISTORY.md
++++ b/src/rocksdb/HISTORY.md
+@@ -1,6 +1,10 @@
+ # Rocksdb Change Log
+ ## Unreleased
+ ### New Features
++* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads.
++
++## 5.5.0 (05/17/2017)
++### New Features
+ * DB::ResetStats() to reset internal stats.
+ * Statistics::Reset() to reset user stats.
+ * ldb add option --try_load_options, which will open DB with its own option file.
+diff --git a/src/rocksdb/monitoring/statistics.cc b/src/rocksdb/monitoring/statistics.cc
+index fb5634f..3a69a13 100644
+--- a/src/rocksdb/monitoring/statistics.cc
++++ b/src/rocksdb/monitoring/statistics.cc
+@@ -21,13 +21,9 @@ std::shared_ptr<Statistics> CreateDBStatistics() {
+   return std::make_shared<StatisticsImpl>(nullptr, false);
+ }
+ 
+-StatisticsImpl::StatisticsImpl(
+-    std::shared_ptr<Statistics> stats,
+-    bool enable_internal_stats)
+-  : stats_shared_(stats),
+-    stats_(stats.get()),
+-    enable_internal_stats_(enable_internal_stats) {
+-}
++StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats,
++                               bool enable_internal_stats)
++    : stats_(std::move(stats)), enable_internal_stats_(enable_internal_stats) {}
+ 
+ StatisticsImpl::~StatisticsImpl() {}
+ 
+@@ -41,79 +37,36 @@ uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const {
+     enable_internal_stats_ ?
+       tickerType < INTERNAL_TICKER_ENUM_MAX :
+       tickerType < TICKER_ENUM_MAX);
+-  uint64_t thread_local_sum = 0;
+-  tickers_[tickerType].thread_value->Fold(
+-      [](void* curr_ptr, void* res) {
+-        auto* sum_ptr = static_cast<uint64_t*>(res);
+-        *sum_ptr += static_cast<std::atomic_uint_fast64_t*>(curr_ptr)->load(
+-            std::memory_order_relaxed);
+-      },
+-      &thread_local_sum);
+-  return thread_local_sum +
+-         tickers_[tickerType].merged_sum.load(std::memory_order_relaxed);
+-}
+-
+-std::unique_ptr<HistogramImpl>
+-StatisticsImpl::HistogramInfo::getMergedHistogram() const {
+-  std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
+-  {
+-    MutexLock lock(&merge_lock);
+-    res_hist->Merge(merged_hist);
++  uint64_t res = 0;
++  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
++    res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType];
+   }
+-  thread_value->Fold(
+-      [](void* curr_ptr, void* res) {
+-        auto tmp_res_hist = static_cast<HistogramImpl*>(res);
+-        auto curr_hist = static_cast<HistogramImpl*>(curr_ptr);
+-        tmp_res_hist->Merge(*curr_hist);
+-      },
+-      res_hist.get());
+-  return res_hist;
++  return res;
+ }
+ 
+ void StatisticsImpl::histogramData(uint32_t histogramType,
+                                    HistogramData* const data) const {
+   MutexLock lock(&aggregate_lock_);
+-  histogramDataLocked(histogramType, data);
++  getHistogramImplLocked(histogramType)->Data(data);
+ }
+ 
+-void StatisticsImpl::histogramDataLocked(uint32_t histogramType,
+-                                         HistogramData* const data) const {
++std::unique_ptr<HistogramImpl> StatisticsImpl::getHistogramImplLocked(
++    uint32_t histogramType) const {
+   assert(
+     enable_internal_stats_ ?
+       histogramType < INTERNAL_HISTOGRAM_ENUM_MAX :
+       histogramType < HISTOGRAM_ENUM_MAX);
+-  histograms_[histogramType].getMergedHistogram()->Data(data);
++  std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
++  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
++    res_hist->Merge(
++        per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]);
++  }
++  return res_hist;
+ }
+ 
+ std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const {
+   MutexLock lock(&aggregate_lock_);
+-  assert(enable_internal_stats_ ? histogramType < INTERNAL_HISTOGRAM_ENUM_MAX
+-                                : histogramType < HISTOGRAM_ENUM_MAX);
+-  return histograms_[histogramType].getMergedHistogram()->ToString();
+-}
+-
+-StatisticsImpl::ThreadTickerInfo* StatisticsImpl::getThreadTickerInfo(
+-    uint32_t tickerType) {
+-  auto info_ptr =
+-      static_cast<ThreadTickerInfo*>(tickers_[tickerType].thread_value->Get());
+-  if (info_ptr == nullptr) {
+-    info_ptr =
+-        new ThreadTickerInfo(0 /* value */, &tickers_[tickerType].merged_sum);
+-    tickers_[tickerType].thread_value->Reset(info_ptr);
+-  }
+-  return info_ptr;
+-}
+-
+-StatisticsImpl::ThreadHistogramInfo* StatisticsImpl::getThreadHistogramInfo(
+-    uint32_t histogram_type) {
+-  auto info_ptr = static_cast<ThreadHistogramInfo*>(
+-      histograms_[histogram_type].thread_value->Get());
+-  if (info_ptr == nullptr) {
+-    info_ptr = new ThreadHistogramInfo(&histograms_[histogram_type].merged_hist,
+-                                       &histograms_[histogram_type].merge_lock);
+-    histograms_[histogram_type].thread_value->Reset(info_ptr);
+-  }
+-  return info_ptr;
++  return getHistogramImplLocked(histogramType)->ToString();
+ }
+ 
+ void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
+@@ -129,14 +82,12 @@ void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
+ void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) {
+   assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX
+                                 : tickerType < TICKER_ENUM_MAX);
+-  if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
+-    tickers_[tickerType].thread_value->Fold(
+-        [](void* curr_ptr, void* res) {
+-          static_cast<std::atomic<uint64_t>*>(curr_ptr)->store(
+-              0, std::memory_order_relaxed);
+-        },
+-        nullptr /* res */);
+-    tickers_[tickerType].merged_sum.store(count, std::memory_order_relaxed);
++  for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
++    if (core_idx == 0) {
++      per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count;
++    } else {
++      per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = 0;
++    }
+   }
+ }
+ 
+@@ -146,16 +97,10 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) {
+     MutexLock lock(&aggregate_lock_);
+     assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX
+                                   : tickerType < TICKER_ENUM_MAX);
+-    if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
+-      tickers_[tickerType].thread_value->Fold(
+-          [](void* curr_ptr, void* res) {
+-            auto* sum_ptr = static_cast<uint64_t*>(res);
+-            *sum_ptr += static_cast<std::atomic<uint64_t>*>(curr_ptr)->exchange(
+-                0, std::memory_order_relaxed);
+-          },
+-          &sum);
+-      sum += tickers_[tickerType].merged_sum.exchange(
+-          0, std::memory_order_relaxed);
++    for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
++      sum +=
++          per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange(
++              0, std::memory_order_relaxed);
+     }
+   }
+   if (stats_ && tickerType < TICKER_ENUM_MAX) {
+@@ -169,10 +114,8 @@ void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
+     enable_internal_stats_ ?
+       tickerType < INTERNAL_TICKER_ENUM_MAX :
+       tickerType < TICKER_ENUM_MAX);
+-  if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
+-    auto info_ptr = getThreadTickerInfo(tickerType);
+-    info_ptr->value.fetch_add(count, std::memory_order_relaxed);
+-  }
++  per_core_stats_.Access()->tickers_[tickerType].fetch_add(
++      count, std::memory_order_relaxed);
+   if (stats_ && tickerType < TICKER_ENUM_MAX) {
+     stats_->recordTick(tickerType, count);
+   }
+@@ -183,9 +126,7 @@ void StatisticsImpl::measureTime(uint32_t histogramType, uint64_t value) {
+     enable_internal_stats_ ?
+       histogramType < INTERNAL_HISTOGRAM_ENUM_MAX :
+       histogramType < HISTOGRAM_ENUM_MAX);
+-  if (histogramType < HISTOGRAM_ENUM_MAX || enable_internal_stats_) {
+-    getThreadHistogramInfo(histogramType)->value.Add(value);
+-  }
++  per_core_stats_.Access()->histograms_[histogramType].Add(value);
+   if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) {
+     stats_->measureTime(histogramType, value);
+   }
+@@ -197,11 +138,9 @@ Status StatisticsImpl::Reset() {
+     setTickerCountLocked(i, 0);
+   }
+   for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) {
+-    histograms_[i].thread_value->Fold(
+-        [](void* curr_ptr, void* res) {
+-          static_cast<HistogramImpl*>(curr_ptr)->Clear();
+-        },
+-        nullptr /* res */);
++    for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
++      per_core_stats_.AccessAtCore(core_idx)->histograms_[i].Clear();
++    }
+   }
+   return Status::OK();
+ }
+@@ -229,7 +168,7 @@ std::string StatisticsImpl::ToString() const {
+     if (h.first < HISTOGRAM_ENUM_MAX || enable_internal_stats_) {
+       char buffer[kTmpStrBufferSize];
+       HistogramData hData;
+-      histogramDataLocked(h.first, &hData);
++      getHistogramImplLocked(h.first)->Data(&hData);
+       snprintf(
+           buffer, kTmpStrBufferSize,
+           "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f 100 : %f\n",
+diff --git a/src/rocksdb/monitoring/statistics.h b/src/rocksdb/monitoring/statistics.h
+index 32b7036..96b31a3 100644
+--- a/src/rocksdb/monitoring/statistics.h
++++ b/src/rocksdb/monitoring/statistics.h
+@@ -13,8 +13,14 @@
+ #include "monitoring/histogram.h"
+ #include "port/likely.h"
+ #include "port/port.h"
++#include "util/core_local.h"
+ #include "util/mutexlock.h"
+-#include "util/thread_local.h"
++
++#ifdef __clang__
++#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__))
++#else
++#define ROCKSDB_FIELD_UNUSED
++#endif  // __clang__
+ 
+ namespace rocksdb {
+ 
+@@ -50,97 +56,38 @@ class StatisticsImpl : public Statistics {
+   virtual bool HistEnabledForType(uint32_t type) const override;
+ 
+  private:
+-  std::shared_ptr<Statistics> stats_shared_;
+-  Statistics* stats_;
++  // If non-nullptr, forwards updates to the object pointed to by `stats_`.
++  std::shared_ptr<Statistics> stats_;
++  // TODO(ajkr): clean this up since there are no internal stats anymore
+   bool enable_internal_stats_;
+-  // Synchronizes anything that operates on other threads' thread-specific data
++  // Synchronizes anything that operates across other cores' local data,
+   // such that operations like Reset() can be performed atomically.
+   mutable port::Mutex aggregate_lock_;
+ 
+-  // Holds data maintained by each thread for implementing tickers.
+-  struct ThreadTickerInfo {
+-    std::atomic_uint_fast64_t value;
+-    // During teardown, value will be summed into *merged_sum.
+-    std::atomic_uint_fast64_t* merged_sum;
+-
+-    ThreadTickerInfo(uint_fast64_t _value,
+-                     std::atomic_uint_fast64_t* _merged_sum)
+-        : value(_value), merged_sum(_merged_sum) {}
++  // The ticker/histogram data are stored in this structure, which we will store
++  // per-core. It is cache-aligned, so tickers/histograms belonging to different
++  // cores can never share the same cache line.
++  //
++  // Alignment attributes expand to nothing depending on the platform
++  struct StatisticsData {
++    std::atomic_uint_fast64_t tickers_[INTERNAL_TICKER_ENUM_MAX] = {{0}};
++    HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX];
++    char
++        padding[(CACHE_LINE_SIZE -
++                 (INTERNAL_TICKER_ENUM_MAX * sizeof(std::atomic_uint_fast64_t) +
++                  INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) %
++                     CACHE_LINE_SIZE) %
++                CACHE_LINE_SIZE] ROCKSDB_FIELD_UNUSED;
+   };
+ 
+-  // Holds data maintained by each thread for implementing histograms.
+-  struct ThreadHistogramInfo {
+-    HistogramImpl value;
+-    // During teardown, value will be merged into *merged_hist while holding
+-    // *merge_lock, which also syncs with the merges necessary for reads.
+-    HistogramImpl* merged_hist;
+-    port::Mutex* merge_lock;
++  static_assert(sizeof(StatisticsData) % 64 == 0, "Expected 64-byte aligned");
+ 
+-    ThreadHistogramInfo(HistogramImpl* _merged_hist, port::Mutex* _merge_lock)
+-        : value(), merged_hist(_merged_hist), merge_lock(_merge_lock) {}
+-  };
+-
+-  // Holds global data for implementing tickers.
+-  struct TickerInfo {
+-    TickerInfo()
+-        : thread_value(new ThreadLocalPtr(&mergeThreadValue)), merged_sum(0) {}
+-    // Holds thread-specific pointer to ThreadTickerInfo
+-    std::unique_ptr<ThreadLocalPtr> thread_value;
+-    // Sum of thread-specific values for tickers that have been reset due to
+-    // thread termination or ThreadLocalPtr destruction. Also, this is used by
+-    // setTickerCount() to conveniently change the global value by setting this
+-    // while simultaneously zeroing all thread-local values.
+-    std::atomic_uint_fast64_t merged_sum;
+-
+-    static void mergeThreadValue(void* ptr) {
+-      auto info_ptr = static_cast<ThreadTickerInfo*>(ptr);
+-      *info_ptr->merged_sum += info_ptr->value;
+-      delete info_ptr;
+-    }
+-  };
+-
+-  // Holds global data for implementing histograms.
+-  struct HistogramInfo {
+-    HistogramInfo()
+-        : merged_hist(),
+-          merge_lock(),
+-          thread_value(new ThreadLocalPtr(&mergeThreadValue)) {}
+-    // Merged thread-specific values for histograms that have been reset due to
+-    // thread termination or ThreadLocalPtr destruction. Note these must be
+-    // destroyed after thread_value since its destructor accesses them.
+-    HistogramImpl merged_hist;
+-    mutable port::Mutex merge_lock;
+-    // Holds thread-specific pointer to ThreadHistogramInfo
+-    std::unique_ptr<ThreadLocalPtr> thread_value;
+-
+-    static void mergeThreadValue(void* ptr) {
+-      auto info_ptr = static_cast<ThreadHistogramInfo*>(ptr);
+-      {
+-        MutexLock lock(info_ptr->merge_lock);
+-        info_ptr->merged_hist->Merge(info_ptr->value);
+-      }
+-      delete info_ptr;
+-    }
+-
+-    // Returns a histogram that merges all histograms (thread-specific and
+-    // previously merged ones).
+-    std::unique_ptr<HistogramImpl> getMergedHistogram() const;
+-  };
++  CoreLocalArray<StatisticsData> per_core_stats_;
+ 
+   uint64_t getTickerCountLocked(uint32_t ticker_type) const;
+-  void histogramDataLocked(uint32_t histogram_type,
+-                           HistogramData* const data) const;
++  std::unique_ptr<HistogramImpl> getHistogramImplLocked(
++      uint32_t histogram_type) const;
+   void setTickerCountLocked(uint32_t ticker_type, uint64_t count);
+-
+-  // Returns the info for this tickerType/thread. It sets a new info with zeroed
+-  // counter if none exists.
+-  ThreadTickerInfo* getThreadTickerInfo(uint32_t ticker_type);
+-  // Returns the info for this histogramType/thread. It sets a new histogram
+-  // with zeroed data if none exists.
+-  ThreadHistogramInfo* getThreadHistogramInfo(uint32_t histogram_type);
+-
+-  TickerInfo tickers_[INTERNAL_TICKER_ENUM_MAX];
+-  HistogramInfo histograms_[INTERNAL_HISTOGRAM_ENUM_MAX];
+ };
+ 
+ // Utility functions
+diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h
+index 7515c54..4239df6 100644
+--- a/src/rocksdb/util/core_local.h
++++ b/src/rocksdb/util/core_local.h
+@@ -7,19 +7,20 @@
+ 
+ #pragma once
+ 
+-#include "port/likely.h"
+-#include "port/port.h"
+-#include "util/random.h"
+-
+ #include <cstddef>
+ #include <thread>
++#include <utility>
+ #include <vector>
+ 
++#include "port/likely.h"
++#include "port/port.h"
++#include "util/random.h"
++
+ namespace rocksdb {
+ 
+ // An array of core-local values. Ideally the value type, T, is cache aligned to
+ // prevent false sharing.
+-template<typename T>
++template <typename T>
+ class CoreLocalArray {
+  public:
+   CoreLocalArray();
+@@ -41,7 +42,7 @@ class CoreLocalArray {
+   int size_shift_;
+ };
+ 
+-template<typename T>
++template <typename T>
+ CoreLocalArray<T>::CoreLocalArray() {
+   int num_cpus = static_cast<int>(std::thread::hardware_concurrency());
+   // find a power of two >= num_cpus and >= 8
+@@ -52,17 +53,17 @@ CoreLocalArray<T>::CoreLocalArray() {
+   data_.reset(new T[static_cast<size_t>(1) << size_shift_]);
+ }
+ 
+-template<typename T>
++template <typename T>
+ size_t CoreLocalArray<T>::Size() const {
+   return static_cast<size_t>(1) << size_shift_;
+ }
+ 
+-template<typename T>
++template <typename T>
+ T* CoreLocalArray<T>::Access() const {
+   return AccessElementAndIndex().first;
+ }
+ 
+-template<typename T>
++template <typename T>
+ std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+   int cpuid = port::PhysicalCoreID();
+   size_t core_idx;
+@@ -75,7 +76,7 @@ std::pair<T*, size_t> CoreLocalArray<T>::AccessElementAndIndex() const {
+   return {AccessAtCore(core_idx), core_idx};
+ }
+ 
+-template<typename T>
++template <typename T>
+ T* CoreLocalArray<T>::AccessAtCore(size_t core_idx) const {
+   assert(core_idx < static_cast<size_t>(1) << size_shift_);
+   return &data_[core_idx];
+-- 
+2.7.4
+
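
The shape of the rewritten StatisticsImpl is worth spelling out: the hot path (recordTick) is a single relaxed fetch_add on the caller's core-local slot, while the cold paths (getTickerCount, Reset, histogram reads) aggregate by walking every slot under aggregate_lock_. A stripped-down sketch of one core-local ticker (illustrative only; CACHE_LINE_SIZE is assumed to be 64, where rocksdb takes it from port/port.h, and C++17 is assumed so std::vector honors the over-aligned element type):

    // Sketch only; the real StatisticsData keeps all tickers and
    // histograms per slot rather than one counter.
    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    #ifndef CACHE_LINE_SIZE
    #define CACHE_LINE_SIZE 64  // assumption; rocksdb defines it in port/port.h
    #endif

    // Padding via alignas so slots for different cores never share a line.
    struct alignas(CACHE_LINE_SIZE) Slot {
      std::atomic<std::uint64_t> value{0};
    };

    class CoreLocalCounter {
     public:
      explicit CoreLocalCounter(std::size_t num_slots) : slots_(num_slots) {}

      // Hot path: one relaxed add on the caller's slot; no lock, and no
      // cache line shared with other cores.
      void RecordTick(std::size_t core_idx, std::uint64_t count) {
        slots_[core_idx].value.fetch_add(count, std::memory_order_relaxed);
      }

      // Cold path: sum every slot, as getTickerCountLocked() does above.
      // O(cores) rather than O(threads), which is the point of the scheme.
      std::uint64_t GetTickerCount() const {
        std::uint64_t sum = 0;
        for (const Slot& s : slots_) {
          sum += s.value.load(std::memory_order_relaxed);
        }
        return sum;
      }

      // Reset semantics match setTickerCountLocked(): the new count goes in
      // slot 0, every other slot is zeroed (callers hold an external lock).
      void SetTickerCount(std::uint64_t count) {
        for (std::size_t i = 0; i < slots_.size(); ++i) {
          slots_[i].value.store(i == 0 ? count : 0,
                                std::memory_order_relaxed);
        }
      }

     private:
      std::vector<Slot> slots_;
    };

Aggregation now costs O(number of cores) instead of O(number of live and dead threads), and there is no teardown-time merging, which is where the old ThreadLocalPtr scheme spent most of its complexity.
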
diff --git a/debian/patches/series b/debian/patches/series
index e284a47..41d9bfc 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -5,3 +5,6 @@ conditional-rgw-beast.patch
 
 # Ubuntu: i386 build failure
 i386-build-fixes.patch
+0001-CoreLocalArray-class.patch
+0002-core-local-array-type-conversions.patch
+0003-Core-local-statistics.patch
diff --git a/debian/rules b/debian/rules
index 927ac4f..b9b7664 100755
--- a/debian/rules
+++ b/debian/rules
@@ -2,6 +2,14 @@
 # -*- makefile -*-
 #export DH_VERBOSE=1
 
+# Reduce size of debug symbols to fix FTBFS due to the
+# 2GB/3GB address space limits on 32bit
+DEB_HOST_ARCH_BITS ?= $(shell dpkg-architecture -qDEB_HOST_ARCH_BITS)
+ifeq (32,$(DEB_HOST_ARCH_BITS))
+       export DEB_CFLAGS_MAINT_APPEND = -g1
+       export DEB_CXXFLAGS_MAINT_APPEND = -g1
+endif
+
 # minimise needless linking and link to libatomic
 # The last is needed because long long atomic operations are not directly
 # supported by all processor architectures

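(A note on the -g1 change above: -g1 emits only minimal debug information, roughly function descriptions and line tables with no local-variable data, which keeps the object files and the final link of this heavily templated code within the 2GB/3GB of virtual address space a 32-bit toolchain has available; with full -g the link step is typically what exhausts it.)
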
-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git


