Skip to content

Commit 50b5b37

Browse files
duduyi2013 authored and facebook-github-bot committed
add L2 flush (pytorch#197)
Summary: X-link: pytorch#3110 Pull Request resolved: facebookresearch/FBGEMM#197 add L2 flush support for checkpoint Reviewed By: q10 Differential Revision: D62462352 fbshipit-source-id: dfd59f0ebd43b27b1ce6f8a684b8956bf8672191
1 parent ad0122b commit 50b5b37

File tree

7 files changed

+120
-18
lines changed

7 files changed

+120
-18
lines changed

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,6 +1643,8 @@ def flush(self) -> None:
16431643
False,
16441644
)
16451645

1646+
self.ssd_db.flush()
1647+
16461648
def prepare_inputs(
16471649
self,
16481650
indices: Tensor,

fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache/cachelib_cache.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,13 @@ class CacheLibCache {
3333
public:
3434
using Cache = facebook::cachelib::LruAllocator;
3535
struct CacheConfig {
36-
size_t cacheSizeBytes;
36+
size_t cache_size_bytes;
37+
size_t item_size_bytes;
38+
size_t num_shards;
39+
int64_t max_D_;
3740
};
3841

39-
explicit CacheLibCache(size_t cacheSizeBytes, int64_t num_shards);
42+
explicit CacheLibCache(const CacheConfig& cache_config);
4043

4144
std::unique_ptr<Cache> initializeCacheLib(const CacheConfig& config);
4245

@@ -85,6 +88,20 @@ class CacheLibCache {
8588
/// @note cache_->allocation will trigger eviction callback func
8689
bool put(int64_t key, const at::Tensor& data);
8790

91+
/// iterate through all items in L2 cache, fill them in indices and weights
92+
/// respectively and return indices, weights and count
93+
///
94+
/// @return indices The 1D embedding index tensor, should skip on negative
95+
/// value
96+
/// @return weights The 2D tensor that each row(embeddings) is paired up with
97+
/// relative element in <indices>
98+
/// @return count A single element tensor that contains the number of indices
99+
/// to be processed
100+
///
101+
/// @note this isn't thread safe, caller needs to make sure put isn't called
102+
/// while this is executed.
103+
std::tuple<at::Tensor, at::Tensor, at::Tensor> get_all_items();
104+
88105
/// instantiate eviction related indices and weights tensors(size of <count>)
89106
/// for L2 eviction using the same dtype and device from <indices> and
90107
/// <weights> , managed on the caller side

fbgemm_gpu/src/ps_split_embeddings_cache/ps_table_batched_embeddings.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class EmbeddingParameterServer : public kv_db::EmbeddingKVDB {
5959
RECORD_USER_SCOPE("EmbeddingParameterServer::get");
6060
co_await tps_client_->get(indices, weights, count.item().toLong());
6161
}
62-
void flush() override {}
62+
void flush() {}
6363
void compact() override {}
6464
// cleanup cached results in server side
6565
// This is a test helper, please do not use it in production

fbgemm_gpu/src/split_embeddings_cache/cachelib_cache.cpp

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,32 @@
1313
namespace l2_cache {
1414

1515
using Cache = facebook::cachelib::LruAllocator;
16-
CacheLibCache::CacheLibCache(size_t cacheSizeBytes, int64_t num_shards)
17-
: cache_config_(CacheConfig{.cacheSizeBytes = cacheSizeBytes}),
16+
17+
// this is a general predictor for weights data type, might not be general
18+
// enough for all the cases
19+
at::ScalarType bytes_to_dtype(int num_bytes) {
20+
switch (num_bytes) {
21+
case 1:
22+
return at::kByte;
23+
case 2:
24+
return at::kHalf;
25+
case 4:
26+
return at::kFloat;
27+
case 8:
28+
return at::kDouble;
29+
default:
30+
throw std::runtime_error("Unsupported dtype");
31+
}
32+
}
33+
34+
CacheLibCache::CacheLibCache(const CacheConfig& cache_config)
35+
: cache_config_(cache_config),
1836
cache_(initializeCacheLib(cache_config_)),
1937
admin_(createCacheAdmin(*cache_)) {
20-
for (int i = 0; i < num_shards; i++) {
38+
for (size_t i = 0; i < cache_config_.num_shards; i++) {
2139
pool_ids_.push_back(cache_->addPool(
2240
fmt::format("shard_{}", i),
23-
cache_->getCacheMemoryStats().ramCacheSize / num_shards));
41+
cache_->getCacheMemoryStats().ramCacheSize / cache_config_.num_shards));
2442
}
2543
}
2644

@@ -51,7 +69,7 @@ std::unique_ptr<Cache> CacheLibCache::initializeCacheLib(
5169
});
5270
};
5371
Cache::Config cacheLibConfig;
54-
cacheLibConfig.setCacheSize(static_cast<uint64_t>(config.cacheSizeBytes))
72+
cacheLibConfig.setCacheSize(static_cast<uint64_t>(config.cache_size_bytes))
5573
.setRemoveCallback(eviction_cb)
5674
.setCacheName("TBEL2Cache")
5775
.setAccessConfig({25 /* bucket power */, 10 /* lock power */})
@@ -99,6 +117,44 @@ bool CacheLibCache::put(int64_t key, const at::Tensor& data) {
99117
return true;
100118
}
101119

120+
std::tuple<at::Tensor, at::Tensor, at::Tensor> CacheLibCache::get_all_items() {
121+
int total_num_items = 0;
122+
for (auto& pool_id : pool_ids_) {
123+
total_num_items += cache_->getPoolStats(pool_id).numItems();
124+
}
125+
auto weight_dim = cache_config_.max_D_;
126+
auto weights_dtype =
127+
bytes_to_dtype(cache_config_.item_size_bytes / weight_dim);
128+
auto indices = at::empty(
129+
total_num_items, at::TensorOptions().dtype(at::kLong).device(at::kCPU));
130+
auto weights = at::empty(
131+
{total_num_items, weight_dim},
132+
at::TensorOptions().dtype(weights_dtype).device(at::kCPU));
133+
FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
134+
weights.scalar_type(), "get_all_items", [&] {
135+
auto indices_data_ptr = indices.data_ptr<int64_t>();
136+
auto weights_data_ptr = weights.data_ptr<scalar_t>();
137+
int64_t item_idx = 0;
138+
for (auto itr = cache_->begin(); itr != cache_->end(); ++itr) {
139+
const auto key_ptr =
140+
reinterpret_cast<const int64_t*>(itr->getKey().data());
141+
indices_data_ptr[item_idx] = *key_ptr;
142+
std::copy(
143+
reinterpret_cast<const scalar_t*>(itr->getMemory()),
144+
reinterpret_cast<const scalar_t*>(itr->getMemory()) + weight_dim,
145+
&weights_data_ptr[item_idx * weight_dim]); // dst_start
146+
item_idx++;
147+
}
148+
CHECK_EQ(total_num_items, item_idx);
149+
});
150+
return std::make_tuple(
151+
indices,
152+
weights,
153+
at::tensor(
154+
{total_num_items},
155+
at::TensorOptions().dtype(at::kLong).device(at::kCPU)));
156+
}
157+
102158
void CacheLibCache::init_tensor_for_l2_eviction(
103159
const at::Tensor& indices,
104160
const at::Tensor& weights,
@@ -130,7 +186,7 @@ CacheLibCache::get_evicted_indices_and_weights() {
130186

131187
std::vector<int64_t> CacheLibCache::get_cache_usage() {
132188
std::vector<int64_t> cache_mem_stats(2, 0); // freeBytes, capacity
133-
cache_mem_stats[1] = cache_config_.cacheSizeBytes;
189+
cache_mem_stats[1] = cache_config_.cache_size_bytes;
134190
for (auto& pool_id : pool_ids_) {
135191
auto pool_stats = cache_->getPoolStats(pool_id);
136192
cache_mem_stats[0] += pool_stats.freeMemoryBytes();

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.cpp

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,23 @@ EmbeddingKVDB::EmbeddingKVDB(
6565
int64_t num_shards,
6666
int64_t max_D,
6767
int64_t cache_size_gb,
68-
int64_t unique_id)
68+
int64_t unique_id,
69+
int64_t ele_size_bytes)
6970
: unique_id_(unique_id),
7071
num_shards_(num_shards),
7172
max_D_(max_D),
7273
executor_tp_(std::make_unique<folly::CPUThreadPoolExecutor>(num_shards)) {
7374
assert(num_shards > 0);
74-
l2_cache_ = cache_size_gb > 0
75-
? std::make_unique<l2_cache::CacheLibCache>(
76-
cache_size_gb * 1024 * 1024 * 1024, num_shards_)
77-
: nullptr;
75+
if (cache_size_gb > 0) {
76+
l2_cache::CacheLibCache::CacheConfig cache_config;
77+
cache_config.cache_size_bytes = cache_size_gb * 1024 * 1024 * 1024;
78+
cache_config.num_shards = num_shards_;
79+
cache_config.item_size_bytes = max_D_ * ele_size_bytes;
80+
cache_config.max_D_ = max_D_;
81+
l2_cache_ = std::make_unique<l2_cache::CacheLibCache>(cache_config);
82+
} else {
83+
l2_cache_ = nullptr;
84+
}
7885
cache_filling_thread_ = std::make_unique<std::thread>([=] {
7986
while (!stop_) {
8087
auto filling_item_ptr = weights_to_fill_queue_.try_peek();
@@ -114,6 +121,17 @@ EmbeddingKVDB::~EmbeddingKVDB() {
114121
cache_filling_thread_->join();
115122
}
116123

124+
void EmbeddingKVDB::flush() {
125+
wait_util_filling_work_done();
126+
if (l2_cache_) {
127+
auto tensor_tuple = l2_cache_->get_all_items();
128+
auto& indices = std::get<0>(tensor_tuple);
129+
auto& weights = std::get<1>(tensor_tuple);
130+
auto& count = std::get<2>(tensor_tuple);
131+
folly::coro::blockingWait(set_kv_db_async(indices, weights, count));
132+
}
133+
}
134+
117135
void EmbeddingKVDB::get_cuda(
118136
const at::Tensor& indices,
119137
const at::Tensor& weights,

fbgemm_gpu/src/ssd_split_embeddings_cache/kv_db_table_batched_embeddings.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ class EmbeddingKVDB : public std::enable_shared_from_this<EmbeddingKVDB> {
7676
int64_t num_shards,
7777
int64_t max_D,
7878
int64_t cache_size_gb = 0,
79-
int64_t unique_id = 0);
79+
int64_t unique_id = 0,
80+
int64_t ele_size_bytes = 2 /*assume by default fp16*/);
8081

8182
virtual ~EmbeddingKVDB();
8283

@@ -140,7 +141,13 @@ class EmbeddingKVDB : public std::enable_shared_from_this<EmbeddingKVDB> {
140141

141142
virtual void compact() = 0;
142143

143-
virtual void flush() = 0;
144+
/// Flush L2 cache into backend storage
145+
/// @return None
146+
/// @note caller side should manage the timing to make sure flush doesn't
147+
/// happen at the same time as get/set
148+
/// @note flush only flushes L2 cache, if there is cache on the backend
149+
/// storage, that flush should be called as well
150+
void flush();
144151

145152
// The function attaches the CUDA callback logic to the compute
146153
// stream to ensure that the data retrieval is carried out properly.

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
151151
num_shards,
152152
max_D,
153153
l2_cache_size_gb,
154-
tbe_unqiue_id) {
154+
tbe_unqiue_id,
155+
row_storage_bitwidth / 8) {
155156
// TODO: lots of tunables. NNI or something for this?
156157
rocksdb::Options options;
157158
options.create_if_missing = true;
@@ -580,7 +581,8 @@ class EmbeddingRocksDB : public kv_db::EmbeddingKVDB {
580581
}
581582
}
582583

583-
void flush() override {
584+
void flush() {
585+
kv_db::EmbeddingKVDB::flush();
584586
for (auto& db : dbs_) {
585587
db->Flush(rocksdb::FlushOptions());
586588
}

0 commit comments

Comments
 (0)