Skip to content

Commit 736737e

Browse files
q10 authored and facebook-github-bot committed
Updates and fixes to tensor_accessor.h (pytorch#3571)
Summary: X-link: facebookresearch/FBGEMM#656. - Fix the `TensorAccessorBase` constructor to work with empty tensors, which are used in FBGEMM code. - Add better logging for errors. Differential Revision: D68048640
1 parent 310982f commit 736737e

File tree

7 files changed

+368
-81
lines changed

7 files changed

+368
-81
lines changed

fbgemm_gpu/codegen/training/backward/embedding_backward_split_cpu_template.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,16 @@ for (const auto t : c10::irange(num_tables)) {
8787
int feature_begin = table_to_feature_offset[t];
8888
int64_t hash_size = get_hash_size(feature_begin);
8989

90+
#ifdef FBGEMM_GPU_MEMCHECK
91+
const auto func_name = "::internal::csr2csc";
92+
#endif
93+
using weight_t = at::acc_type<scalar_t, true>;
9094
::internal::csr2csc(
9195
cscs[t],
9296
B,
93-
offsets.accessor<int64_t, 1>(),
94-
indices.accessor<int64_t, 1>(),
95-
indice_weights.defined()
96-
? indice_weights.accessor<at::acc_type<scalar_t, true>, 1>()
97-
: at::TensorAccessor<at::acc_type<scalar_t, true>, 1>(nullptr, nullptr, nullptr),
97+
MAKE_TA_WITH_NAME(func_name, offsets, int64_t, 1),
98+
MAKE_TA_WITH_NAME(func_name, indices, int64_t, 1),
99+
MAKE_TA_WITH_NAME(func_name, indice_weights, weight_t, 1),
98100
pooling_mode,
99101
table_to_feature_offset + t,
100102
hash_size);

fbgemm_gpu/codegen/training/forward/embedding_forward_split_cpu.cpp

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "fbgemm_gpu/utils/cpu_utils.h"
1515
#include "fbgemm_gpu/utils/dispatch_macros.h"
1616
#include "fbgemm_gpu/utils/ops_utils.h"
17+
#include "fbgemm_gpu/utils/tensor_accessor.h"
1718
#ifdef FBCODE_CAFFE2
1819
#include <libdivide.h>
1920
#else
@@ -384,9 +385,9 @@ template <typename index_t, typename scalar_t, bool IS_VALUE_PAIR>
384385
void csr2csc_template_(
385386
HyperCompressedSparseColumn& csc,
386387
int B,
387-
const at::TensorAccessor<index_t, 1>& csr_offsets,
388-
const at::TensorAccessor<index_t, 1>& csr_indices,
389-
const at::TensorAccessor<scalar_t, 1>& csr_weights,
388+
const pta::TensorAccessor<index_t, 1>& csr_offsets,
389+
const pta::TensorAccessor<index_t, 1>& csr_indices,
390+
const pta::TensorAccessor<scalar_t, 1>& csr_weights,
390391
int64_t pooling_mode,
391392
const int* table_to_feature_offset,
392393
int64_t num_embeddings) {
@@ -585,9 +586,9 @@ void csr2csc_template_(
585586
template void csr2csc_template_<index_t, scalar_t, is_value_pair>( \
586587
HyperCompressedSparseColumn & csc, \
587588
int B, \
588-
const at::TensorAccessor<index_t, 1>& csr_offsets, \
589-
const at::TensorAccessor<index_t, 1>& csr_indices, \
590-
const at::TensorAccessor<scalar_t, 1>& csr_weights, \
589+
const pta::TensorAccessor<index_t, 1>& csr_offsets, \
590+
const pta::TensorAccessor<index_t, 1>& csr_indices, \
591+
const pta::TensorAccessor<scalar_t, 1>& csr_weights, \
591592
int64_t pooling_mode, \
592593
const int* table_to_feature_offset, \
593594
int64_t num_embeddings);
@@ -613,9 +614,9 @@ template <typename index_t, typename scalar_t>
613614
void csr2csc(
614615
HyperCompressedSparseColumn& csc,
615616
int B,
616-
const at::TensorAccessor<index_t, 1>& csr_offsets,
617-
const at::TensorAccessor<index_t, 1>& csr_indices,
618-
const at::TensorAccessor<scalar_t, 1>& csr_weights,
617+
const pta::TensorAccessor<index_t, 1>& csr_offsets,
618+
const pta::TensorAccessor<index_t, 1>& csr_indices,
619+
const pta::TensorAccessor<scalar_t, 1>& csr_weights,
619620
int64_t pooling_mode,
620621
const int* table_to_feature_offset,
621622
int64_t num_embeddings) {
@@ -644,15 +645,15 @@ void csr2csc(
644645
}
645646
}
646647

647-
#define INSTANTIATE_CSR2CSC_0(index_t, scalar_t) \
648-
template void csr2csc<index_t, scalar_t>( \
649-
HyperCompressedSparseColumn & csc, \
650-
int B, \
651-
const at::TensorAccessor<index_t, 1>& csr_offsets, \
652-
const at::TensorAccessor<index_t, 1>& csr_indices, \
653-
const at::TensorAccessor<scalar_t, 1>& csr_weights, \
654-
int64_t pooling_mode, \
655-
const int* table_to_feature_offset, \
648+
#define INSTANTIATE_CSR2CSC_0(index_t, scalar_t) \
649+
template void csr2csc<index_t, scalar_t>( \
650+
HyperCompressedSparseColumn & csc, \
651+
int B, \
652+
const pta::TensorAccessor<index_t, 1>& csr_offsets, \
653+
const pta::TensorAccessor<index_t, 1>& csr_indices, \
654+
const pta::TensorAccessor<scalar_t, 1>& csr_weights, \
655+
int64_t pooling_mode, \
656+
const int* table_to_feature_offset, \
656657
int64_t num_embeddings);
657658

658659
#define INSTANTIATE_CSR2CSC_1(index_t) \

fbgemm_gpu/include/fbgemm_gpu/embedding_forward_split_cpu.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <ATen/ATen.h>
1212
#include <ATen/Parallel.h>
1313
#include "fbgemm/Utils.h"
14+
#include "fbgemm_gpu/utils/tensor_accessor.h"
1415

1516
at::Tensor split_embedding_codegen_forward_cpu(
1617
at::Tensor weights,
@@ -120,9 +121,9 @@ template <typename index_t, typename scalar_t>
120121
void csr2csc(
121122
HyperCompressedSparseColumn& csc,
122123
int B,
123-
const at::TensorAccessor<index_t, 1>& csr_offsets,
124-
const at::TensorAccessor<index_t, 1>& csr_indices,
125-
const at::TensorAccessor<scalar_t, 1>& csr_weights,
124+
const pta::TensorAccessor<index_t, 1>& csr_offsets,
125+
const pta::TensorAccessor<index_t, 1>& csr_indices,
126+
const pta::TensorAccessor<scalar_t, 1>& csr_weights,
126127
int64_t pooling_mode,
127128
const int* table_to_feature_offset,
128129
int64_t num_embeddings);

0 commit comments

Comments (0)