From 6b8aaad6f4bc323f838c6ccf0b97b7e6a8a89b4b Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Mon, 7 Jul 2025 15:40:16 +0530 Subject: [PATCH 1/3] Fixed reporting of single value of loss and ppl across devices. Signed-off-by: meetkuma --- QEfficient/finetune/utils/train_utils.py | 94 ++++++++++++------------ 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index f513ba5c4..b55fe45e3 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,6 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig +from QEfficient.finetune.utils.helper import get_num_ddp_devices try: import torch_qaic # noqa: F401 @@ -353,35 +354,42 @@ def train( else total_loss / (step + 1 - (num_dummy_samples / train_config.train_batch_size)) ) if train_config.task_type == "seq_classification": - metric_val = acc_helper.compute() + train_epoch_metric = acc_helper.compute() acc_helper.reset() else: - metric_val = torch.exp(train_epoch_loss) + train_epoch_metric = torch.exp(train_epoch_loss) - train_metric.append(float(metric_val)) + train_metric.append(float(train_epoch_metric)) train_loss.append(float(train_epoch_loss)) + if train_config.enable_ddp: + dist.all_reduce(train_epoch_loss, op=dist.ReduceOp.SUM) + train_epoch_loss /= get_num_ddp_devices() + dist.all_reduce(train_epoch_metric, op=dist.ReduceOp.SUM) + train_epoch_metric /= get_num_ddp_devices() + # Update the learning rate as needed lr_scheduler.step() if train_config.run_validation: - if train_config.enable_ddp: - dist.barrier() - eval_epoch_loss, eval_metric, temp_val_loss, temp_step_metric = evaluation_helper( - model, train_config, eval_dataloader, device - ) - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"eval": eval_epoch_loss}, total_train_steps) - - else: - eval_epoch_loss, eval_metric, temp_val_loss, temp_step_metric = evaluation_helper( - model, train_config, eval_dataloader, device - ) - tensorboard_updates.add_scalars("loss", {"eval": eval_epoch_loss}, total_train_steps) + eval_loss, eval_metric, step_loss, step_metric = evaluation_helper( + model, train_config, eval_dataloader, device + ) + # Print evaluation metrics + print( + f"Epoch {epoch + 1}: Eval Loss: {eval_loss.detach().cpu():.4f}, Eval metric: {eval_metric.detach().cpu():.4f}" + ) + if eval_loss < best_val_loss: + best_val_loss = eval_loss + print(f"best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}") + if local_rank == 0: + tensorboard_updates.add_scalars("loss", {"eval": eval_loss}, total_train_steps) if train_config.save_metrics: - val_step_loss.extend(temp_val_loss) - val_step_metric.extend(temp_step_metric) + val_step_loss.extend(step_loss) + val_step_metric.extend(step_metric) + val_loss.append(float(eval_loss)) + val_metric.append(float(eval_metric)) # saving the adapters after completion of each epoch if train_config.save_model: @@ -391,20 +399,9 @@ def train( else: model.save_pretrained(train_config.output_dir + f"/complete_epoch_{epoch + 1}") - if train_config.run_validation: - if eval_epoch_loss < best_val_loss: - best_val_loss = eval_epoch_loss - print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}") - val_loss.append(float(eval_epoch_loss)) - val_metric.append(float(eval_metric)) - if train_config.task_type == "seq_classification": - print( - f"Epoch {epoch + 1}: train_acc={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" - 
) - else: - print( - f"Epoch {epoch + 1}: train_metric={metric_val:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s" - ) + print( + f"Epoch {epoch + 1}: Train epoch loss: {train_epoch_loss:.4f}, Train metric: {train_epoch_metric:.4f}, Epoch time {epoch_end_time:.2f} sec" + ) # Saving the results every epoch to plot later if train_config.save_metrics: @@ -421,17 +418,12 @@ def train( ) avg_epoch_time = sum(epoch_times) / len(epoch_times) avg_checkpoint_time = sum(checkpoint_times) / len(checkpoint_times) if len(checkpoint_times) > 0 else 0 - avg_train_metric = sum(train_metric) / len(train_metric) - avg_train_loss = sum(train_loss) / len(train_loss) - if train_config.run_validation: - avg_eval_metric = sum(val_metric) / len(val_metric) - avg_eval_loss = sum(val_loss) / len(val_loss) - results["avg_train_metric"] = avg_train_metric - results["avg_train_loss"] = avg_train_loss + results["last_epoch_train_loss"] = train_epoch_loss + results["last_epoch_train_metric"] = train_epoch_metric if train_config.run_validation: - results["avg_eval_metric"] = avg_eval_metric - results["avg_eval_loss"] = avg_eval_loss + results["last_epoch_eval_loss"] = eval_loss + results["last_epoch_eval_metric"] = eval_metric results["avg_epoch_time"] = avg_epoch_time results["avg_checkpoint_time"] = avg_checkpoint_time if train_config.save_metrics: @@ -449,6 +441,9 @@ def evaluation_helper(model, train_config, eval_dataloader, device): Returns: eval_epoch_loss, eval_metric, eval_step_loss, eval_step_metric """ + if train_config.enable_ddp: + dist.barrier() + model.eval() if train_config.task_type == "seq_classification": @@ -464,7 +459,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device): val_step_loss = [] val_step_metric = [] - eval_loss = 0.0 # Initialize evaluation loss + eval_loss = torch.tensor(0.0, dtype=torch.float32, device=device) # Initialize evaluation loss device_type = torch.device(device).type num_dummy_samples = 0 @@ -512,18 +507,19 @@ def evaluation_helper(model, train_config, eval_dataloader, device): eval_loss += loss.detach().float() # Compute average loss and metric - eval_epoch_loss = ( - 0.0 if eval_loss == 0.0 else eval_loss / (step + 1 - num_dummy_samples / train_config.val_batch_size) - ) + eval_loss = 0.0 if eval_loss == 0.0 else eval_loss / (step + 1 - num_dummy_samples / train_config.val_batch_size) if train_config.task_type == "seq_classification": eval_metric = acc_helper.compute() else: - eval_metric = torch.exp(eval_epoch_loss) + eval_metric = torch.exp(eval_loss) - # Print evaluation metrics - print(f" {eval_metric.detach().cpu()=} {eval_epoch_loss.detach().cpu()=}") + if train_config.enable_ddp: + dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM) + eval_loss /= get_num_ddp_devices() + dist.all_reduce(eval_metric, op=dist.ReduceOp.SUM) + eval_metric /= get_num_ddp_devices() - return eval_epoch_loss, eval_metric, val_step_loss, val_step_metric + return eval_loss, eval_metric, val_step_loss, val_step_metric def get_longest_seq_length(data: List[Dict]) -> Tuple[int, int]: From 76ce0948a791613ec07c4a127f463e02fa572a13 Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Tue, 8 Jul 2025 11:00:32 +0530 Subject: [PATCH 2/3] Updated local_rank usages with is_rank_zero function call. 
Signed-off-by: meetkuma --- QEfficient/finetune/utils/helper.py | 4 ++++ QEfficient/finetune/utils/train_utils.py | 16 +++++----------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py index 8562b2aed..81c84a5cd 100644 --- a/QEfficient/finetune/utils/helper.py +++ b/QEfficient/finetune/utils/helper.py @@ -14,3 +14,7 @@ def get_num_ddp_devices(): return int(os.getenv("WORLD_SIZE", 1)) + + +def is_rank_zero(): + return int(os.getenv("LOCAL_RANK", 0)) == 0 diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index b55fe45e3..5c3e447a9 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -19,7 +19,7 @@ from tqdm import tqdm from QEfficient.finetune.configs.training import TrainConfig -from QEfficient.finetune.utils.helper import get_num_ddp_devices +from QEfficient.finetune.utils.helper import get_num_ddp_devices, is_rank_zero try: import torch_qaic # noqa: F401 @@ -84,10 +84,7 @@ def train( max_steps_reached = False # Flag to indicate max training steps reached tensorboard_updates = None - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates = SummaryWriter() - else: + if is_rank_zero(): tensorboard_updates = SummaryWriter() device_type = torch.device(device).type @@ -233,7 +230,7 @@ def train( total_loss += loss.detach().float() if train_config.enable_ddp: - if local_rank == 0: + if is_rank_zero(): if loss <= train_config.convergence_loss: loss_0_counter += 1 else: @@ -245,10 +242,7 @@ def train( else: loss_0_counter = torch.tensor([0]).to(device) - if train_config.enable_ddp: - if local_rank == 0: - tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) - else: + if is_rank_zero(): tensorboard_updates.add_scalars("loss", {"train": loss}, total_train_steps) if train_config.save_metrics: @@ -383,7 +377,7 @@ def train( best_val_loss = eval_loss print(f"best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}") - if local_rank == 0: + if is_rank_zero(): tensorboard_updates.add_scalars("loss", {"eval": eval_loss}, total_train_steps) if train_config.save_metrics: val_step_loss.extend(step_loss) From e3df77082f02414c3069125dd4420922990d357b Mon Sep 17 00:00:00 2001 From: Meet Patel Date: Wed, 9 Jul 2025 11:22:13 +0530 Subject: [PATCH 3/3] Moved eval all reduce code after including it in json list. 
Signed-off-by: meetkuma --- QEfficient/finetune/utils/train_utils.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py index 5c3e447a9..e344a9ae2 100644 --- a/QEfficient/finetune/utils/train_utils.py +++ b/QEfficient/finetune/utils/train_utils.py @@ -369,13 +369,9 @@ def train( eval_loss, eval_metric, step_loss, step_metric = evaluation_helper( model, train_config, eval_dataloader, device ) - # Print evaluation metrics - print( - f"Epoch {epoch + 1}: Eval Loss: {eval_loss.detach().cpu():.4f}, Eval metric: {eval_metric.detach().cpu():.4f}" - ) if eval_loss < best_val_loss: best_val_loss = eval_loss - print(f"best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}") + print(f"Best eval loss on epoch {epoch + 1} is {best_val_loss:.4f}") if is_rank_zero(): tensorboard_updates.add_scalars("loss", {"eval": eval_loss}, total_train_steps) @@ -385,6 +381,16 @@ def train( val_loss.append(float(eval_loss)) val_metric.append(float(eval_metric)) + if train_config.enable_ddp: + dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM) + eval_loss /= get_num_ddp_devices() + dist.all_reduce(eval_metric, op=dist.ReduceOp.SUM) + eval_metric /= get_num_ddp_devices() + + print( + f"Epoch {epoch + 1}: Eval Loss: {eval_loss.detach().cpu():.4f}, Eval metric: {eval_metric.detach().cpu():.4f}" + ) + # saving the adapters after completion of each epoch if train_config.save_model: if train_config.enable_ddp: @@ -507,12 +513,6 @@ def evaluation_helper(model, train_config, eval_dataloader, device): else: eval_metric = torch.exp(eval_loss) - if train_config.enable_ddp: - dist.all_reduce(eval_loss, op=dist.ReduceOp.SUM) - eval_loss /= get_num_ddp_devices() - dist.all_reduce(eval_metric, op=dist.ReduceOp.SUM) - eval_metric /= get_num_ddp_devices() - return eval_loss, eval_metric, val_step_loss, val_step_metric
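
The pattern these patches converge on is a sum all-reduce followed by division by the DDP world size, so every rank ends up reporting the same averaged loss and perplexity instead of per-device values. Below is a minimal standalone sketch of that pattern; the helper name average_across_devices and the dummy loss value are illustrative assumptions, not code from the patch.

import os

import torch
import torch.distributed as dist


def get_num_ddp_devices():
    # Mirrors the helper added in PATCH 1/3: WORLD_SIZE defaults to 1 outside DDP.
    return int(os.getenv("WORLD_SIZE", 1))


def average_across_devices(value: torch.Tensor) -> torch.Tensor:
    # Sum the scalar tensor across all ranks, then divide by the device count,
    # so every rank holds the same averaged value afterwards.
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(value, op=dist.ReduceOp.SUM)
        value /= get_num_ddp_devices()
    return value


if __name__ == "__main__":
    # Each rank computes its own epoch loss; after averaging, loss and ppl are
    # identical on every device, so only a single value needs to be reported.
    epoch_loss = torch.tensor(1.234, dtype=torch.float32)  # per-rank loss (dummy value)
    epoch_loss = average_across_devices(epoch_loss)
    perplexity = torch.exp(epoch_loss)
    print(f"Train epoch loss: {epoch_loss:.4f}, ppl: {perplexity:.4f}")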
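
PATCH 2/3 replaces the nested enable_ddp/local_rank checks with a single is_rank_zero() guard around TensorBoard logging. The sketch below shows how that guard is meant to be used, assuming the LOCAL_RANK convention set by torchrun; the writer variable and the dummy scalar are illustrative, not part of the patch.

import os

from torch.utils.tensorboard import SummaryWriter


def is_rank_zero() -> bool:
    # Same check as the helper added in PATCH 2/3: LOCAL_RANK defaults to 0
    # when the script runs without torchrun/DDP.
    return int(os.getenv("LOCAL_RANK", 0)) == 0


# Only rank 0 creates a writer and logs; every other rank keeps writer as None.
# This collapses the earlier "if enable_ddp: if local_rank == 0: ... else: ..."
# branching into one guard that works both with and without DDP.
writer = SummaryWriter() if is_rank_zero() else None

if writer is not None:
    total_train_steps = 0  # illustrative global step counter
    writer.add_scalars("loss", {"train": 0.5}, total_train_steps)  # dummy loss value
    writer.close()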