
Commit 6e3e8f0

peterfu0 authored and facebook-github-bot committed
Reduce bulk init time and fix OOM (pytorch#3828)
Summary: X-link: facebookresearch/FBGEMM#911 X-link: facebookresearch/FBGEMM#909 X-link: facebookresearch/FBGEMM#908

Disable RocksDB compaction while bulk-initializing the TBE on SSD; this reduces the initialization time from over 5 minutes to 2-3 minutes. Also, express the chunk size in bytes rather than rows: since rows may have different dimensions in different TBEs, a fixed row count can blow up memory, so a byte budget avoids the OOM issue.

Differential Revision: D70921864
1 parent b7a4e51 commit 6e3e8f0
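
The byte-based chunking converts the configured byte budget into a row count sized for the widest row in the table. A minimal sketch of that arithmetic, assuming a hypothetical `rows_per_chunk` helper (not part of this commit) and torch dtypes exposing `itemsize`:

```python
# Hypothetical helper illustrating the byte-based chunk sizing above;
# `rows_per_chunk` is not an FBGEMM API, just a sketch of the arithmetic.
from math import floor

import torch

def rows_per_chunk(chunk_bytes: int, max_d: int, dtype: torch.dtype) -> int:
    """Turn a byte budget into a per-chunk row count, sized for the widest row."""
    bytes_per_row = max_d * dtype.itemsize  # worst-case row footprint in bytes
    return floor(chunk_bytes / bytes_per_row)

# e.g. a 1 GiB budget with 256-wide fp16 rows -> 2_097_152 rows per chunk
print(rows_per_chunk(1 << 30, 256, torch.float16))
```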

6 files changed (+276, -51 lines)

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 15 additions & 8 deletions
@@ -16,7 +16,7 @@
 import tempfile
 import threading
 import time
-from math import log2
+from math import floor, log2
 from typing import Any, Callable, List, Optional, Tuple, Type, Union
 import torch  # usort:skip

@@ -148,7 +148,8 @@ def __init__(
         # Set to False to use cudaMallocManaged
         uvm_host_mapped: bool = False,
         enable_async_update: bool = True,  # whether enable L2/rocksdb write to async background thread
-        # if > 0, insert all kv pairs to rocksdb at init time, in chunks of *bulk_init_chunk_size* rows
+        # if > 0, insert all kv pairs to rocksdb at init time, in chunks of *bulk_init_chunk_size* bytes
+        # number of rows will be decided by bulk_init_chunk_size / size_of_each_row
         bulk_init_chunk_size: int = 0,
         lazy_bulk_init_enabled: bool = False,
     ) -> None:
@@ -245,7 +246,7 @@ def __init__(
             f"{cache_size / 1024.0 / 1024.0 / 1024.0 : .2f}GB, "
             f"weights precision: {weights_precision}, "
             f"output dtype: {output_dtype}, "
-            f"chunk size in bulk init: {bulk_init_chunk_size} rows"
+            f"chunk size in bulk init: {bulk_init_chunk_size} bytes"
         )
         self.register_buffer(
             "lxu_cache_state",
@@ -766,21 +767,24 @@ def _insert_all_kv(self) -> None:
         initailization time.
         """
         row_offset = 0
-        chunk_size = self.bulk_init_chunk_size
+        row_count = floor(
+            self.bulk_init_chunk_size
+            / (self.max_D * self.weights_precision.as_dtype().itemsize)
+        )
         total_dim0 = 0
         for dim0, _ in self.embedding_specs:
             total_dim0 += dim0

         start_ts = time.time()
         chunk_tensor = torch.empty(
-            chunk_size,
+            row_count,
             self.max_D,
             dtype=self.weights_precision.as_dtype(),
             device="cuda",
         )
         cpu_tensor = torch.empty_like(chunk_tensor, device="cpu")
-        for row_offset in range(0, total_dim0, chunk_size):
-            actual_dim0 = min(total_dim0 - row_offset, chunk_size)
+        for row_offset in range(0, total_dim0, row_count):
+            actual_dim0 = min(total_dim0 - row_offset, row_count)
             chunk_tensor.uniform_(
                 self.ssd_uniform_init_lower, self.ssd_uniform_init_upper
             )
@@ -789,9 +793,12 @@ def _insert_all_kv(self) -> None:
             # This code is intentionally not calling through the getter property
             # to avoid the lazy initialization thread from joining with itself.
             self._ssd_db.set_range_to_storage(rand_val, row_offset, actual_dim0)
+        self.ssd_db.toggle_compaction(True)
         end_ts = time.time()
         elapsed = int((end_ts - start_ts) * 1e6)
-        logging.info(f"TBE bulk initialization took {elapsed:_} us")
+        logging.info(
+            f"TBE bulk initialization took {elapsed:_} us, bulk_init_chunk_size={self.bulk_init_chunk_size}, each batch of {row_count} rows, total rows of {total_dim0}"
+        )

     @torch.jit.ignore
     def _report_duration(
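
Taken together, the Python-side change amounts to the chunked loop below. This is a hedged sketch under a simplified storage interface: `store.set_range` is a hypothetical stand-in for `set_range_to_storage`, and tensors stay on CPU for brevity.

```python
# Sketch of the chunked bulk-init loop; `store` is a hypothetical stand-in
# for the RocksDB wrapper, not the real FBGEMM API.
import torch

def bulk_init(store, total_rows: int, max_d: int, chunk_bytes: int,
              dtype: torch.dtype, low: float, high: float) -> None:
    row_count = chunk_bytes // (max_d * dtype.itemsize)  # bytes -> rows
    chunk = torch.empty(row_count, max_d, dtype=dtype)   # one reusable buffer
    for row_offset in range(0, total_rows, row_count):
        actual = min(total_rows - row_offset, row_count)
        chunk.uniform_(low, high)                        # random init values
        store.set_range(chunk[:actual], row_offset, actual)
```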

fbgemm_gpu/src/ssd_split_embeddings_cache/embedding_rocksdb_wrapper.h

Lines changed: 4 additions & 0 deletions
@@ -82,6 +82,10 @@ class EmbeddingRocksDBWrapper : public torch::jit::CustomClassHolder {
     return impl_->set_range_to_storage(weights, start, length);
   }

+  void toggle_compaction(bool enable) {
+    impl_->toggle_compaction(enable);
+  }
+
   void get(
       at::Tensor indices,
       at::Tensor weights,

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp

Lines changed: 1 addition & 0 deletions
@@ -460,6 +460,7 @@ static auto embedding_rocks_db_wrapper =
         .def(
             "set_range_to_storage",
             &EmbeddingRocksDBWrapper::set_range_to_storage)
+        .def("toggle_compaction", &EmbeddingRocksDBWrapper::toggle_compaction)
         .def(
             "get",
             &EmbeddingRocksDBWrapper::get,
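
Once registered on the TorchScript custom class, the method is callable from Python on the wrapper instance, which is how `_insert_all_kv` re-enables compaction above. A hedged usage sketch, assuming `ssd_db` is an already-constructed wrapper:

```python
# Usage sketch; `ssd_db` stands for an already-constructed
# EmbeddingRocksDBWrapper instance, obtained elsewhere.
ssd_db.toggle_compaction(False)  # pause auto compactions before bulk writes
# ... bulk set_range_to_storage() writes ...
ssd_db.toggle_compaction(True)   # re-enable compactions once init is done
```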

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h

Lines changed: 109 additions & 42 deletions
@@ -290,12 +290,67 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
     options.memtable_prefix_bloom_size_ratio = 0.05;
     options.memtable_whole_key_filtering = true;
     options.max_background_jobs = num_threads;
+    // disable auto compactions during bulk init, re-enable once done
+    // maximum number of concurrent flush operations
+    options.max_background_flushes = num_threads;
+    options.disable_auto_compactions = true;
     options.env->SetBackgroundThreads(4, rocksdb::Env::HIGH);
     options.env->SetBackgroundThreads(1, rocksdb::Env::LOW);
-
     options.max_open_files = -1;

+    initialize_dbs(num_shards, path, options, use_passed_in_path);
+    initialize_initializers(
+        num_shards,
+        max_D,
+        uniform_init_lower,
+        uniform_init_upper,
+        row_storage_bitwidth);
+    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(num_shards);
+    ro_.verify_checksums = false;
+    ro_.async_io = true;
+    wo_.disableWAL = true;
+    wo_.sync = false;
+
+    // Setup staggered manual compaction data members
+    memtable_flush_period_ = memtable_flush_period;
+    if (memtable_flush_period_ > 0) {
+      done_staggered_flushes_ = false;
+      memtable_flush_offset_ = memtable_flush_offset;
+      l0_files_per_compact_ = l0_files_per_compact;
+      compaction_period_ = memtable_flush_period_ * l0_files_per_compact *
+          options.min_write_buffer_number_to_merge;
+      int64_t period_per_shard = memtable_flush_period_ / num_shards;
+      CHECK_GT(period_per_shard, 0);
+      // We want to stagger memory flushes (and then later
+      // stagger all compactions)
+
+      for (int64_t i = 0; i < num_shards; i++) {
+        shard_flush_compaction_deadlines_.push_back(
+            memtable_flush_offset_ + (i * period_per_shard));
+      }
+    }
+  }
+
+  ~EmbeddingRocksDB() override {
+    // clear all the snapshots if not released
+    if (snapshots_.size() > 0) {
+      LOG(WARNING)
+          << snapshots_.size()
+          << " snapshots have not been released when db is closing. Releasing them now.";
+    }
+    snapshots_.clear();
+    for (auto shard = 0; shard < dbs_.size(); ++shard) {
+      dbs_[shard]->Close();
+    }
+  }
+
+  void initialize_dbs(
+      int64_t num_shards,
+      std::string path,
+      rocksdb::Options& options,
+      bool use_passed_in_path) {
 #ifdef FBGEMM_FBCODE
+    std::string used_path = "";
     auto serviceInfo = std::make_shared<facebook::fb_rocksdb::ServiceInfo>();
     serviceInfo->oncall = "pyper_training";
     serviceInfo->service_name = "ssd_offloading_rocksb";
@@ -307,7 +362,6 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       path = ssd_mount_point;
       tbe_uuid = facebook::strings::generateUUID();
     }
-    std::string used_path = "";
 #endif
     for (auto i = 0; i < num_shards; ++i) {
 #ifdef FBGEMM_FBCODE
@@ -350,6 +404,19 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
       }
       CHECK(s.ok()) << s.ToString();
       dbs_.emplace_back(db);
+    }
+#ifdef FBGEMM_FBCODE
+    LOG(INFO) << "TBE actual used_path: " << used_path;
+#endif
+  }
+
+  void initialize_initializers(
+      int64_t num_shards,
+      int64_t max_D,
+      float uniform_init_lower,
+      float uniform_init_upper,
+      int64_t row_storage_bitwidth) {
+    for (auto i = 0; i < num_shards; ++i) {
       auto* gen = at::check_generator<at::CPUGeneratorImpl>(
           at::detail::getDefaultCPUGenerator());
       {
@@ -362,46 +429,6 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
             row_storage_bitwidth));
       }
     }
-#ifdef FBGEMM_FBCODE
-    LOG(INFO) << "TBE actual used_path: " << used_path;
-#endif
-    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(num_shards);
-    ro_.verify_checksums = false;
-    ro_.async_io = true;
-    wo_.disableWAL = true;
-    wo_.sync = false;
-
-    // Setup staggered manual compaction data members
-    memtable_flush_period_ = memtable_flush_period;
-    if (memtable_flush_period_ > 0) {
-      done_staggered_flushes_ = false;
-      memtable_flush_offset_ = memtable_flush_offset;
-      l0_files_per_compact_ = l0_files_per_compact;
-      compaction_period_ = memtable_flush_period_ * l0_files_per_compact *
-          options.min_write_buffer_number_to_merge;
-      int64_t period_per_shard = memtable_flush_period_ / num_shards;
-      CHECK_GT(period_per_shard, 0);
-      // We want to stagger memory flushes (and then later
-      // stagger all compactions)
-
-      for (int64_t i = 0; i < num_shards; i++) {
-        shard_flush_compaction_deadlines_.push_back(
-            memtable_flush_offset_ + (i * period_per_shard));
-      }
-    }
-  }
-
-  ~EmbeddingRocksDB() override {
-    // clear all the snapshots if not released
-    if (snapshots_.size() > 0) {
-      LOG(WARNING)
-          << snapshots_.size()
-          << " snapshots have not been released when db is closing. Releasing them now.";
-    }
-    snapshots_.clear();
-    for (auto shard = 0; shard < dbs_.size(); ++shard) {
-      dbs_[shard]->Close();
-    }
-  }
   }

   folly::SemiFuture<std::vector<folly::Unit>> get_kv_db_async(
@@ -549,6 +576,46 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
     folly::coro::blockingWait(set_kv_db_async(seq_indices, weights, count));
   }

+  virtual rocksdb::Status set_rocksdb_option(
+      int shard,
+      const std::string& key,
+      const std::string& value) {
+    return dbs_[shard]->SetOptions({{key, value}});
+  }
+
+  void toggle_compaction(bool enable) {
+    int max_retries = 10;
+    std::vector<folly::Future<bool>> futures;
+    for (auto shard = 0; shard < dbs_.size(); ++shard) {
+      auto f = folly::via(executor_.get()).thenValue([=](folly::Unit) -> bool {
+        for (int attempt = 0; attempt < max_retries; ++attempt) {
+          auto val = enable ? "false" : "true";
+          auto s = set_rocksdb_option(shard, "disable_auto_compactions", val);
+          if (s.ok()) {
+            return true;
+          }
+          LOG(WARNING) << "Failed to toggle compaction to " << enable
+                       << " for shard " << shard << ", attempt=" << attempt
+                       << ", max_retries=" << max_retries << std::endl;
+          std::this_thread::sleep_for(std::chrono::milliseconds(100));
+        }
+        return false;
+      });
+      futures.push_back(std::move(f));
+    }
+    auto results = folly::coro::blockingWait(folly::collectAll(futures));
+    for (auto& result : results) {
+      if (result.hasValue()) {
+        CHECK(result.value())
+            << "Failed to toggle compaction to " << enable << std::endl;
+      } else {
+        CHECK(false) << "Failed to toggle compaction to " << enable
+                     << " with exception " << result.exception().what()
+                     << std::endl;
+      }
+    }
+  }
+
   int64_t get_max_D() {
     return max_D_;
   }
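
The `toggle_compaction` above follows a fan-out/retry protocol: one task per shard flips RocksDB's dynamic `disable_auto_compactions` option (note the inversion: enabling compaction sets the option to "false"), retrying up to 10 times with a 100 ms pause, and the caller aborts if any shard never succeeds. A hedged Python rendering of the same pattern, where `set_option` is a hypothetical stand-in for RocksDB's `SetOptions`:

```python
# Hedged sketch of the fan-out/retry protocol used by toggle_compaction();
# `set_option(shard, key, value) -> bool` is a hypothetical stand-in for
# RocksDB's SetOptions, not an FBGEMM or python-rocksdb API.
import time
from concurrent.futures import ThreadPoolExecutor

def toggle_compaction(set_option, num_shards: int, enable: bool,
                      max_retries: int = 10) -> None:
    def toggle_shard(shard: int) -> bool:
        val = "false" if enable else "true"  # option is "disable_...", hence inverted
        for _attempt in range(max_retries):
            if set_option(shard, "disable_auto_compactions", val):
                return True
            time.sleep(0.1)  # brief pause before retrying this shard
        return False

    # One toggle task per shard, run in parallel like the folly executor.
    with ThreadPoolExecutor(max_workers=num_shards) as pool:
        results = list(pool.map(toggle_shard, range(num_shards)))
    assert all(results), f"failed to toggle compaction to {enable}"
```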
