From 623619f1de3034a6a00725157b5c1143dee7f451 Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Mon, 7 Oct 2024 15:15:05 -0700 Subject: [PATCH 1/2] move to top level config --- recipes/full_finetune_distributed.py | 6 ++++-- recipes/full_finetune_single_device.py | 6 ++++-- recipes/knowledge_distillation_single_device.py | 6 ++++-- recipes/lora_dpo_distributed.py | 6 ++++-- recipes/lora_dpo_single_device.py | 6 ++++-- recipes/lora_finetune_distributed.py | 6 ++++-- recipes/lora_finetune_single_device.py | 6 ++++-- recipes/ppo_full_finetune_single_device.py | 7 ++++--- recipes/qat_distributed.py | 6 ++++-- 9 files changed, 36 insertions(+), 19 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index af50516b74..b7c79c58e8 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -248,6 +248,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -476,6 +478,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -507,8 +510,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index fa1b7b14ff..a6850cdb7f 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -248,6 +248,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -482,6 +484,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -515,8 +518,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 833c9aec56..007b9d015c 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -277,6 +277,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -497,6 +499,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -525,8 +528,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=( partial( padded_collate_sft, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index d655889305..1037ee49ad 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -256,6 +256,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -449,6 +451,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -474,8 +477,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index edd2d10427..9031b649db 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -214,6 +214,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -337,6 +339,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports @@ -363,8 +366,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 2be9aa94a2..d722b63786 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -297,6 +297,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -590,6 +592,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -622,8 +625,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 6cc57d7bcd..4b632739b0 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -284,6 +284,8 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -501,6 +503,7 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -534,8 +537,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=( partial( collate_fn, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 9f645b5fdd..15b4d26106 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -212,6 +212,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) self._setup_training_parameters(cfg) @@ -554,7 +556,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, drop_last: bool ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
@@ -579,8 +581,7 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate, pad_direction="left", diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index c6a7ec0ed1..f281f1fdca 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -264,6 +264,8 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, + # dropping last avoids shape issues with compile + flex attention + drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -497,6 +499,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, + drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -523,8 +526,7 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg_dataset.get("drop_last", True), + drop_last=drop_last, collate_fn=partial( padded_collate_sft, padding_idx=self._tokenizer.pad_id, From 0b9a3ec102d7fbb1accd187923b4b09848ab5347 Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Tue, 8 Oct 2024 07:17:31 -0700 Subject: [PATCH 2/2] default to true --- recipes/full_finetune_distributed.py | 6 ++---- recipes/full_finetune_single_device.py | 6 ++---- recipes/knowledge_distillation_single_device.py | 6 ++---- recipes/lora_dpo_distributed.py | 6 ++---- recipes/lora_dpo_single_device.py | 6 ++---- recipes/lora_finetune_distributed.py | 6 ++---- recipes/lora_finetune_single_device.py | 6 ++---- recipes/ppo_full_finetune_single_device.py | 7 +++---- recipes/qat_distributed.py | 6 ++---- 9 files changed, 19 insertions(+), 36 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index b7c79c58e8..6e83e575f9 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -248,8 +248,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -478,7 +476,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -510,7 +507,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index a6850cdb7f..2addd92944 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -248,8 +248,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -484,7 +482,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -518,7 +515,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 007b9d015c..c2ee8c7cc4 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -277,8 +277,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -499,7 +497,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -528,7 +525,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=( partial( padded_collate_sft, diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index 1037ee49ad..e903ab274a 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -256,8 +256,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -451,6 +449,7 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports the @@ -477,7 +474,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 9031b649db..bd7dd77fba 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -214,8 +214,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -339,7 +337,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports @@ -366,7 +363,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_dpo, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index d722b63786..1569dfee63 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -297,8 +297,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -592,7 +590,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -625,7 +622,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( collate_fn, padding_idx=self._tokenizer.pad_id, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 4b632739b0..e4dd826c42 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -284,8 +284,6 @@ def setup(self, cfg: DictConfig) -> None: shuffle=cfg.shuffle, batch_size=cfg.batch_size, collate_fn=collate_name, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -503,7 +501,6 @@ def _setup_data( shuffle: bool, batch_size: int, collate_fn: str, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here.
Currently this recipe only supports @@ -537,7 +534,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=( partial( collate_fn, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 15b4d26106..7679af3fd3 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -212,8 +212,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) self._setup_training_parameters(cfg) @@ -556,7 +554,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, drop_last: bool + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. @@ -581,7 +579,8 @@ def _setup_data( dataset=ds, sampler=sampler, batch_size=batch_size, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate, pad_direction="left", diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index f281f1fdca..eb2e44fae2 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -264,8 +264,6 @@ def setup(self, cfg: DictConfig) -> None: cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - # dropping last avoids shape issues with compile + flex attention - drop_last=cfg.get("drop_last", True), ) # Finally update the recipe state which can only be correctly set after all of the @@ -499,7 +497,6 @@ def _setup_data( cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - drop_last: bool, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -526,7 +523,8 @@ def _setup_data( dataset=ds, batch_size=batch_size, sampler=sampler, - drop_last=drop_last, + # dropping last avoids shape issues with compile + flex attention + drop_last=True, collate_fn=partial( padded_collate_sft, padding_idx=self._tokenizer.pad_id,