From 623619f1de3034a6a00725157b5c1143dee7f451 Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Mon, 7 Oct 2024 15:15:05 -0700 Subject: [PATCH 1/2] move to top level config --- recipes/full_finetune_distributed.py | 6 ++++-- recipes/full_finetune_single_device.py | 6 ++++-- recipes/knowledge_distillation_single_device.py | 6 ++++-- recipes/lora_dpo_distributed.py | 6 ++++-- recipes/lora_dpo_single_device.py | 6 ++++-- recipes/lora_finetune_distributed.py | 6 ++++-- recipes/lora_finetune_single_device.py | 6 ++++-- recipes/ppo_full_finetune_single_device.py | 7 ++++--- recipes/qat_distributed.py | 6 ++++-- 9 files changed, 36 insertions(+), 19 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index af50516b74..b7c79c58e8 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -248,6 +248,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -476,6 +478,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -507,8 +510,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index fa1b7b14ff..a6850cdb7f 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -248,6 +248,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -482,6 +484,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -515,8 +518,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 833c9aec56..007b9d015c 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -277,6 +277,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -497,6 +499,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -525,8 +528,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=( partial( padded_collate_sft, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index d655889305..1037ee49ad 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -256,6 +256,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -449,6 +451,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -474,8 +477,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index edd2d10427..9031b649db 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -214,6 +214,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -337,6 +339,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports @@ -363,8 +366,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 2be9aa94a2..d722b63786 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -297,6 +297,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -590,6 +592,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -622,8 +625,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 6cc57d7bcd..4b632739b0 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -284,6 +284,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -501,6 +503,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -534,8 +537,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=( partial( collate_fn, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 9f645b5fdd..15b4d26106 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -212,6 +212,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) self._setup_training_parameters(cfg) @@ -554,7 +556,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, drop_last: bool ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
@@ -579,8 +581,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate, pad_direction="left", diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index c6a7ec0ed1..f281f1fdca 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -264,6 +264,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -497,6 +499,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -523,8 +526,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_sft, padding_idx=self._tokenizer.pad_id, From 0b9a3ec102d7fbb1accd187923b4b09848ab5347 Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Tue, 8 Oct 2024 07:17:31 -0700 Subject: [PATCH 2/2] default to true --- recipes/full_finetune_distributed.py | 6 ++---- recipes/full_finetune_single_device.py | 6 ++---- recipes/knowledge_distillation_single_device.py | 6 ++---- recipes/lora_dpo_distributed.py | 6 ++---- recipes/lora_dpo_single_device.py | 6 ++---- recipes/lora_finetune_distributed.py | 6 ++---- recipes/lora_finetune_single_device.py | 6 ++---- recipes/ppo_full_finetune_single_device.py | 7 +++---- recipes/qat_distributed.py | 6 ++---- 9 files changed, 19 insertions(+), 36 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index b7c79c58e8..6e83e575f9 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -248,8 +248,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -478,7 +476,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -510,7 +507,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index a6850cdb7f..2addd92944 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -248,8 +248,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -484,7 +482,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -518,7 +515,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 007b9d015c..c2ee8c7cc4 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -277,8 +277,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -499,7 +497,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -528,7 +525,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=( partial( padded_collate_sft, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index 1037ee49ad..e903ab274a 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -256,8 +256,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -451,6 +449,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -477,7 +474,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 9031b649db..bd7dd77fba 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -214,8 +214,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -339,7 +337,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -366,7 +363,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index d722b63786..1569dfee63 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -297,8 +297,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -592,7 +590,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -625,7 +622,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 4b632739b0..e4dd826c42 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -284,8 +284,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -503,7 +501,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports @@ -537,7 +534,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=( partial( collate_fn, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 15b4d26106..7679af3fd3 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -212,8 +212,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) self._setup_training_parameters(cfg) @@ -556,7 +554,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, drop_last: bool + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. @@ -581,7 +579,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate, pad_direction="left", diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index f281f1fdca..eb2e44fae2 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -264,8 +264,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -499,7 +497,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -526,7 +523,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_sft, padding_idx=self._tokenizer.pad_id,