Follow up on the backward-compatibility (BC) issue for open-sourcing the TBE inplace update op (pytorch#1492)

jianyuh · facebook-github-bot · commit 457bac034e4a · 2022-12-04T12:02:36.000-08:00
Summary: Pull Request resolved: pytorch#1492 Reviewed By: jspark1105 Differential Revision: D41717190 fbshipit-source-id: 818c54fb236e72b5816921e7d2c579843d346d47
diff --git a/.github/workflows/fbgemmci.yml b/.github/workflows/fbgemmci.yml
@@ -197,7 +197,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-20.04]
         config: [[pip, 11.3], [pip, 11.5], [pip, 11.6], [pip, 11.7], [conda, 11.7]]
 
     steps:
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -161,6 +161,7 @@ set(codegen_dependencies
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/dispatch_macros.h
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/embedding_backward_template_helpers.cuh
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/embedding_common.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/embedding_inplace_update.h
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h
     ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh
diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template
@@ -18,13 +18,13 @@ torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
 torch.ops.load_library(
     "//deeplearning/fbgemm/fbgemm_gpu:split_table_batched_embeddings"
 )
-try:
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update")
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update_cpu")
-except OSError:
-    # Keep for BC: will be deprecated soon.
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/fb:embedding_inplace_update")
-    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/fb:embedding_inplace_update_cpu")
+# try:
+#     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update")
+#     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:embedding_inplace_update_cpu")
+# except OSError:
+#     # Keep for BC: will be deprecated soon.
+#     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/fb:embedding_inplace_update")
+#     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu/fb:embedding_inplace_update_cpu")
 
 {% else %}
 #import os
diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h
@@ -71,4 +71,22 @@ void embedding_inplace_update_cuda(
     c10::optional<Tensor> lxu_cache_weights = c10::nullopt,
     c10::optional<Tensor> lxu_cache_locations = c10::nullopt);
 
+void embedding_inplace_update_cpu(
+    Tensor dev_weights,
+    Tensor uvm_weights,
+    Tensor weights_placements,
+    Tensor weights_offsets,
+    Tensor weights_tys,
+    Tensor D_offsets,
+    Tensor update_weights,
+    Tensor update_table_idx,
+    Tensor update_row_idx,
+    Tensor update_offsets,
+    const int64_t row_alignment,
+    c10::optional<Tensor> lxu_cache_weights =
+        c10::nullopt, // Not used by the CPU op; kept only to match the CUDA op's cache interface
+    c10::optional<Tensor> lxu_cache_locations =
+        c10::nullopt // Not used by the CPU op; kept only to match the CUDA op's cache interface
+);
+
 } // namespace fbgemm_gpu
diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu
@@ -10,7 +10,7 @@
 
 #include <c10/cuda/CUDAGuard.h>
 
-#include "embedding_inplace_update.h"
+#include "fbgemm_gpu/embedding_inplace_update.h"
 #include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
 
 using Tensor = at::Tensor;
diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp
@@ -12,7 +12,7 @@
 #include <ATen/ATen.h>
 #include <torch/library.h>
 
-#include "embedding_inplace_update.h"
+#include "fbgemm_gpu/embedding_inplace_update.h"
 
 using Tensor = at::Tensor;
 
@@ -72,11 +72,8 @@ void embedding_inplace_update_cpu(
     Tensor update_row_idx,
     Tensor update_offsets,
     const int64_t row_alignment,
-    c10::optional<Tensor> lxu_cache_weights =
-        c10::nullopt, // Not used, to match cache interface for CUDA op
-    c10::optional<Tensor> lxu_cache_locations =
-        c10::nullopt // Not used, to match cache interface for CUDA op
-) {
+    c10::optional<Tensor> lxu_cache_weights,
+    c10::optional<Tensor> lxu_cache_locations) {
   TENSOR_ON_CPU(dev_weights);
   TENSOR_ON_CPU(uvm_weights);
   TENSOR_ON_CPU(weights_placements);
diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp
@@ -8,7 +8,7 @@
 #include <ATen/core/op_registration/op_registration.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/library.h>
-#include "embedding_inplace_update.h"
+#include "fbgemm_gpu/embedding_inplace_update.h"
 
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   DISPATCH_TO_CUDA(
diff --git a/fbgemm_gpu/test/embedding_inplace_update_test.cpp b/fbgemm_gpu/test/embedding_inplace_update_test.cpp
@@ -6,7 +6,7 @@
  */
 #include <folly/Random.h>
 #include <gtest/gtest.h>
-#include "deeplearning/fbgemm/fbgemm_gpu/src/embedding_inplace_update.h"
+#include "fbgemm_gpu/embedding_inplace_update.h"
 
 using namespace ::testing;
 using namespace fbgemm_gpu;
diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py
@@ -4685,8 +4685,9 @@ def test_embedding_inplace_update(
         )
 
         weights_ty_list = [weights_ty] * T
-        if open_source:
-            test_internal = False
+        # if open_source:
+        #     test_internal = False
+        test_internal = False
 
         # create two embedding bag op with random weights
         locations = [location] * T