Snapshot high water mark data for computing realized fragmentation.
This approximates "Adaptive Huge-Page Subrelease for Non-moving Memory
Allocators in Warehouse-Scale Computers"'s estimate of realized fragmentation
as T_{schedule} -> infinity.  It captures memory going unused at a peak in
OS-visible demand due to metadata requirements and binpacking losses.

PiperOrigin-RevId: 439611199
Change-Id: Iccc056d7439a0ccaa0e9b92df615f000b8d84ed6
ckennelly authored and copybara-github committed Apr 5, 2022
1 parent 537932e commit e10badd
Showing 6 changed files with 260 additions and 6 deletions.
31 changes: 27 additions & 4 deletions docs/stats.md
@@ -122,6 +122,31 @@ MALLOC: 4067336 ( 3.9 MiB) Pagemap root resident bytes
potentially large array, and it is useful to know how much of it is actually
memory resident.

### Realized Fragmentation

```
MALLOC: 12238113346 (11671.2 MiB) Actual memory used at peak
MALLOC: 11626207678 (11087.6 MiB) Estimated in-use at peak
MALLOC: 5.2632 Realized fragmentation (%)
```
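
The reported percentage relates the two lines above it: bytes of overhead at
the peak, as a fraction of the estimated in-use bytes at that same peak. A
minimal sketch of the arithmetic (variable names here are illustrative, not
from the source):

```cpp
// Realized fragmentation, as a percentage of estimated in-use at peak.
// With the sample values above:
//   100 * (12238113346 - 11626207678) / 11626207678 ≈ 5.2632
double realized_fragmentation_pct =
    100.0 * (actual_peak_bytes - estimated_in_use_peak_bytes) /
    estimated_in_use_peak_bytes;
```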

Memory overhead at peak demand is more important than off-peak, since we need to
provision a process with sufficient memory to run during its peak requirements
without OOM'ing. After a peak in demand, memory may be deallocated and held in
caches in anticipation of future reuse. Overhead as a fraction of the remaining
live allocations rises, but no additional memory is required.

This metric is called "realized fragmentation" and described in ["Adaptive
Hugepage Subrelease for Non-moving Memory Allocators in Warehouse-Scale
Computers"](https://research.google/pubs/pub50436/) (ISMM 2021). The realized
fragmentation metric computed here is a snapshot over the life of the entire
process.

These realized fragmentation stats in the summary table indicate a snapshot of
conditions when TCMalloc reached a peak in its physical memory usage. As of
April 2022, the in-use-at-peak number is estimated from TCMalloc's periodic
allocation sampling.

### Page Sizes

There are three relevant "page" sizes for systems and TCMalloc. It's important
@@ -734,10 +759,8 @@ which is an indication of how much memory could have been "usefully" reclaimed
(i.e., free for long enough that the OS would likely be able to use the memory
for another process). The line shows both the total number of free pages in the
filler (whether or not released to the OS) as well as only those that were
-backed by physical memory for the full 5-min interval. This metric is called
-"realized fragmentation" and described in ["Adaptive Hugepage Subrelease for
-Non-moving Memory Allocators in Warehouse-Scale
-Computers"](https://research.google/pubs/pub50436/) (ISMM 2021).
+backed by physical memory for the full 5-min interval. The realized
+fragmentation metric computed here uses a bounded window.

The next two sections show the state of the filler at peak demand (i.e., when
the maximum number of pages was in use) and at peak hps (i.e., when the maximum
23 changes: 21 additions & 2 deletions tcmalloc/page_allocator.cc
@@ -131,11 +131,30 @@ PageAllocator::PageAllocator() {
}

void PageAllocator::ShrinkToUsageLimit() {
BackingStats s = stats();
const size_t backed =
s.system_bytes - s.unmapped_bytes + Static::metadata_bytes();
// New high water marks should be rare.
if (ABSL_PREDICT_FALSE(backed > peak_backed_bytes_)) {
peak_backed_bytes_ = backed;
// This estimate may skew slightly low (and overestimate realized
// fragmentation), as we allocate successfully from the page heap before
// updating the sampled object list.
//
// TODO(ckennelly): Evaluate passing the current allocation size to the page
// heap to adjust this. This correction would overestimate for many-object
// spans from the CentralFreeList, but those are typically a single page so
// the error in absolute terms is minimal.
peak_sampled_application_bytes_ = Static::sampled_objects_size_.value();
}
// TODO(ckennelly): Consider updating peak_sampled_application_bytes_ if
// backed == peak_backed_bytes_ but application usage has gone up. This can
// occur if we allocate space for many objects preemptively and only later
// sample them (incrementing sampled_objects_size_).

if (limit_ == std::numeric_limits<size_t>::max()) {
return;
}
-BackingStats s = stats();
-size_t backed = s.system_bytes - s.unmapped_bytes + Static::metadata_bytes();
if (backed <= limit_) {
// We're already fine.
return;
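
The application estimate is only re-snapshotted when backed bytes reach a new
high water mark, so the recorded pair always describes a single instant. A
minimal sketch of this update rule (a hypothetical free function, not this
commit's API):

```cpp
#include <cstddef>

// On each check, snapshot both values together iff backed bytes set a new
// high water mark; otherwise keep the previously recorded pair.
void UpdatePeak(size_t backed_bytes, size_t sampled_app_bytes,
                size_t& peak_backed, size_t& peak_app) {
  if (backed_bytes > peak_backed) {
    peak_backed = backed_bytes;
    peak_app = sampled_app_bytes;  // pair stays mutually consistent
  }
}
```
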
20 changes: 20 additions & 0 deletions tcmalloc/page_allocator.h
@@ -101,6 +101,15 @@ class PageAllocator {
// Returns the main hugepage-aware heap, or nullptr if not using HPAA.
HugePageAwareAllocator* default_hpaa() const { return default_hpaa_; }

struct PeakStats {
size_t backed_bytes;
size_t sampled_application_bytes;
};

PeakStats peak_stats() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock) {
return PeakStats{peak_backed_bytes_, peak_sampled_application_bytes_};
}

private:
bool ShrinkHardBy(Length pages) ABSL_EXCLUSIVE_LOCKS_REQUIRED(pageheap_lock);

@@ -130,6 +139,17 @@ class PageAllocator {
// The number of times the limit has been hit.
int64_t limit_hits_{0};

// peak_backed_bytes_ tracks the maximum number of bytes backed (with physical
// memory) in the page heap and metadata.
//
// peak_sampled_application_bytes_ is a snapshot of
// Static::sampled_objects_size_ at the time of the most recent
// peak_backed_bytes_ high water mark. While this is only an estimate of the
// application's true in-use demand, it is generally accurate at scale and
// requires minimal work to compute.
size_t peak_backed_bytes_{0};
size_t peak_sampled_application_bytes_{0};

HugePageAwareAllocator* default_hpaa_{nullptr};
};

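
Because the accessor requires pageheap_lock, callers read both fields as one
consistent pair. A minimal usage sketch, mirroring what ExtractStats() in
tcmalloc.cc does (assuming the SpinLockHolder idiom used elsewhere in
tcmalloc's internals):

```cpp
// Sketch: assumes tcmalloc-internal context (pageheap_lock, Static).
PageAllocator::PeakStats peak;
{
  absl::base_internal::SpinLockHolder h(&pageheap_lock);
  peak = Static::page_allocator().peak_stats();
}
// peak.backed_bytes and peak.sampled_application_bytes describe the same
// instant: the most recent backed-bytes high water mark.
```
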
13 changes: 13 additions & 0 deletions tcmalloc/tcmalloc.cc
@@ -143,6 +143,7 @@ struct TCMallocStats {
size_t pagemap_bytes; // included in metadata bytes
size_t percpu_metadata_bytes; // included in metadata bytes
BackingStats pageheap; // Stats from page heap
PageAllocator::PeakStats peak_stats;

ArenaStats arena; // Stats from the metadata Arena

@@ -202,6 +203,7 @@ static void ExtractStats(TCMallocStats* r, uint64_t* class_count,
r->metadata_bytes = Static::metadata_bytes();
r->pagemap_bytes = Static::pagemap().bytes();
r->pageheap = Static::page_allocator().stats();
r->peak_stats = Static::page_allocator().peak_stats();
if (small_spans != nullptr) {
Static::page_allocator().GetSmallSpanStats(small_spans);
}
@@ -378,6 +380,9 @@ static void DumpStats(Printer* out, int level) {
"MALLOC: %12" PRIu64 " (%7.1f MiB) Pagemap root resident bytes\n"
"MALLOC: %12" PRIu64 " (%7.1f MiB) per-CPU slab bytes used\n"
"MALLOC: %12" PRIu64 " (%7.1f MiB) per-CPU slab resident bytes\n"
"MALLOC: %12" PRIu64 " (%7.1f MiB) Actual memory used at peak\n"
"MALLOC: %12" PRIu64 " (%7.1f MiB) Estimated in-use at peak\n"
"MALLOC: %12.4f Realized fragmentation (%%)\n"
"MALLOC: %12" PRIu64 " Tcmalloc page size\n"
"MALLOC: %12" PRIu64 " Tcmalloc hugepage size\n"
"MALLOC: %12" PRIu64 " CPUs Allowed in Mask\n"
@@ -414,6 +419,11 @@ static void DumpStats(Printer* out, int level) {
uint64_t(stats.percpu_metadata_bytes),
stats.percpu_metadata_bytes / MiB,
stats.percpu_metadata_bytes_res, stats.percpu_metadata_bytes_res / MiB,
uint64_t(stats.peak_stats.backed_bytes),
stats.peak_stats.backed_bytes / MiB,
uint64_t(stats.peak_stats.sampled_application_bytes),
stats.peak_stats.sampled_application_bytes / MiB,
100. * safe_div(stats.peak_stats.backed_bytes -
                    stats.peak_stats.sampled_application_bytes,
                stats.peak_stats.sampled_application_bytes),
uint64_t(kPageSize),
uint64_t(kHugePageSize),
CountAllowedCpus(),
@@ -594,6 +604,9 @@ namespace {
region.PrintI64("pagemap_root_residence", stats.pagemap_root_bytes_res);
region.PrintI64("percpu_slab_size", stats.percpu_metadata_bytes);
region.PrintI64("percpu_slab_residence", stats.percpu_metadata_bytes_res);
region.PrintI64("peak_backed", stats.peak_stats.backed_bytes);
region.PrintI64("peak_application_demand",
stats.peak_stats.sampled_application_bytes);
region.PrintI64("tcmalloc_page_size", uint64_t(kPageSize));
region.PrintI64("tcmalloc_huge_page_size", uint64_t(kHugePageSize));
region.PrintI64("cpus_allowed", CountAllowedCpus());
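
In the pbtxt output these two PrintI64 calls emit `key: value` lines; the new
test below locates them by searching for the key names. A sample of the shape
(values hypothetical):

```
peak_backed: 12238113346
peak_application_demand: 11626207678
```
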
18 changes: 18 additions & 0 deletions tcmalloc/testing/BUILD
@@ -958,6 +958,24 @@ cc_library(
],
)

create_tcmalloc_testsuite(
name = "realized_fragmentation_test",
timeout = "long",
srcs = ["realized_fragmentation_test.cc"],
copts = TCMALLOC_DEFAULT_COPTS,
tags = [
"nosan",
],
deps = [
":testutil",
"//tcmalloc/internal:linked_list",
"@com_github_google_benchmark//:benchmark",
"@com_google_absl//absl/random",
"@com_google_absl//absl/strings",
"@com_google_googletest//:gtest_main",
],
)

create_tcmalloc_testsuite(
name = "reclaim_test",
srcs = ["reclaim_test.cc"],
161 changes: 161 additions & 0 deletions tcmalloc/testing/realized_fragmentation_test.cc
@@ -0,0 +1,161 @@
// Copyright 2022 The TCMalloc Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/random/random.h"
#include "absl/strings/numbers.h"
#include "tcmalloc/internal/linked_list.h"
#include "tcmalloc/testing/testutil.h"

namespace tcmalloc {
namespace {

using tcmalloc_internal::SLL_Pop;
using tcmalloc_internal::SLL_Push;

struct PeakStats {
size_t backing;
size_t application;
};

PeakStats GetPeakStats() {
// TODO(ckennelly): Parse this with protobuf directly
PeakStats ret;
const std::string buf = GetStatsInPbTxt();
constexpr absl::string_view backing_needle = "peak_backed: ";
constexpr absl::string_view application_needle = "peak_application_demand: ";

auto parse = [](absl::string_view buf, absl::string_view needle) {
auto pos = buf.find(needle);
EXPECT_NE(pos, absl::string_view::npos);
pos += needle.size();
auto stop = buf.find(' ', pos);
if (stop != absl::string_view::npos) {
stop -= pos;
}
size_t ret;
EXPECT_TRUE(absl::SimpleAtoi(buf.substr(pos, stop), &ret))
<< buf.substr(pos, stop);
return ret;
};

ret.backing = parse(buf, backing_needle);
ret.application = parse(buf, application_needle);

return ret;
}

TEST(RealizedFragmentation, Accuracy) {
#ifndef NDEBUG
GTEST_SKIP() << "Skipping test under debug build for performance";
#endif

const PeakStats starting = GetPeakStats();
// We have allocated at least once up to this point.
ASSERT_GT(starting.backing, 0);

// Since application data is sampled, allow wider error bars.
constexpr double kBackingTolerance = 0.20;
constexpr double kApplicationTolerance = 0.25;
absl::BitGen rng;

// Allocate many 2MB allocations so as to trigger a new high water mark, then
// deallocate.
constexpr size_t kLargeTarget = 1 << 29;
constexpr size_t kLargeSize = 2 << 20;
void* large_list = nullptr;

for (size_t total = 0; total < kLargeTarget; total += kLargeSize) {
SLL_Push(&large_list, ::operator new(kLargeSize));
}

const PeakStats peak0 = GetPeakStats();

EXPECT_NEAR(peak0.backing, starting.backing + kLargeTarget,
(starting.backing + kLargeTarget) * kBackingTolerance);
EXPECT_NEAR(peak0.application, starting.application + kLargeTarget,
(starting.application + kLargeTarget) * kApplicationTolerance);

while (large_list != nullptr) {
void* object = SLL_Pop(&large_list);
sized_delete(object, kLargeSize);
}

// Allocate many small allocations so as to trigger another high water mark.
// Deallocate half of these allocations, but fragmentation should remain high.
constexpr size_t kSmallTarget = kLargeTarget * 2;
constexpr size_t kSmallSize = 1024;
void* small_list_keep = nullptr;
int kept = 0;
void* small_list_free = nullptr;
int freed = 0;

for (size_t total = 0; total < kSmallTarget; total += kSmallSize) {
void* object = ::operator new(kSmallSize);
if (absl::Bernoulli(rng, 0.5)) {
SLL_Push(&small_list_keep, object);
kept++;
} else {
SLL_Push(&small_list_free, object);
freed++;
}
}

const PeakStats peak1 = GetPeakStats();

EXPECT_NEAR(peak1.backing, starting.backing + kSmallTarget,
(starting.backing + kSmallTarget) * kBackingTolerance);
EXPECT_NEAR(peak1.application, starting.application + kSmallTarget,
(starting.application + kSmallTarget) * kApplicationTolerance);

while (small_list_free != nullptr) {
void* object = SLL_Pop(&small_list_free);
sized_delete(object, kSmallSize);
}

// Allocate many 2MB allocations so as to trigger another high water mark.
// Fragmentation should continue to be high due to partial spans from the
// previous round.
for (size_t total = 0; total < 2 * kLargeTarget; total += kLargeSize) {
SLL_Push(&large_list, ::operator new(kLargeSize));
}

const PeakStats peak2 = GetPeakStats();

const double expected_backing =
starting.backing + kSmallTarget + 2 * kLargeTarget;
const double expected_application =
starting.application + kSmallSize * kept + 2 * kLargeTarget;

EXPECT_NEAR(peak2.backing, expected_backing,
expected_backing * kBackingTolerance);
EXPECT_NEAR(peak2.application, expected_application,
expected_application * kApplicationTolerance);

while (large_list != nullptr) {
void* object = SLL_Pop(&large_list);
sized_delete(object, kLargeSize);
}

while (small_list_keep != nullptr) {
void* object = SLL_Pop(&small_list_keep);
sized_delete(object, kSmallSize);
}
}

} // namespace
} // namespace tcmalloc
