diff --git a/examples/RLHF/run.sh b/examples/RLHF/run.sh
deleted file mode 100644
index a4bdca9e974e..000000000000
--- a/examples/RLHF/run.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-PYTHONPATH=../../ GLOG_minloglevel=2 python3.10 -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ppo_main.py ppo_config.json
diff --git a/llm/.gitignore b/llm/.gitignore
index 709c2cb61539..d81fdef50031 100644
--- a/llm/.gitignore
+++ b/llm/.gitignore
@@ -3,7 +3,6 @@ infer.json
 output.json
 
 # data
-data/
 AdvertiseGen.tar.gz
 
 # checkpoints
diff --git a/examples/RLHF/comm_utils.py b/llm/Alignment/PPO/comm_utils.py
similarity index 100%
rename from examples/RLHF/comm_utils.py
rename to llm/Alignment/PPO/comm_utils.py
diff --git a/examples/RLHF/data/__init__.py b/llm/Alignment/PPO/data/__init__.py
similarity index 100%
rename from examples/RLHF/data/__init__.py
rename to llm/Alignment/PPO/data/__init__.py
diff --git a/examples/RLHF/data/alpaca.py b/llm/Alignment/PPO/data/alpaca.py
similarity index 100%
rename from examples/RLHF/data/alpaca.py
rename to llm/Alignment/PPO/data/alpaca.py
diff --git a/examples/RLHF/data/base.py b/llm/Alignment/PPO/data/base.py
similarity index 100%
rename from examples/RLHF/data/base.py
rename to llm/Alignment/PPO/data/base.py
diff --git a/examples/RLHF/data/preference.py b/llm/Alignment/PPO/data/preference.py
similarity index 100%
rename from examples/RLHF/data/preference.py
rename to llm/Alignment/PPO/data/preference.py
diff --git a/examples/RLHF/data/prompt_only.py b/llm/Alignment/PPO/data/prompt_only.py
similarity index 100%
rename from examples/RLHF/data/prompt_only.py
rename to llm/Alignment/PPO/data/prompt_only.py
diff --git a/examples/RLHF/data/safe_rlhf.py b/llm/Alignment/PPO/data/safe_rlhf.py
similarity index 100%
rename from examples/RLHF/data/safe_rlhf.py
rename to llm/Alignment/PPO/data/safe_rlhf.py
diff --git a/examples/RLHF/data/supervised.py b/llm/Alignment/PPO/data/supervised.py
similarity index 100%
rename from examples/RLHF/data/supervised.py
rename to llm/Alignment/PPO/data/supervised.py
diff --git a/examples/RLHF/infer_utils.py b/llm/Alignment/PPO/infer_utils.py
similarity index 100%
rename from examples/RLHF/infer_utils.py
rename to llm/Alignment/PPO/infer_utils.py
diff --git a/examples/RLHF/models/__init__.py b/llm/Alignment/PPO/models/__init__.py
similarity index 100%
rename from examples/RLHF/models/__init__.py
rename to llm/Alignment/PPO/models/__init__.py
diff --git a/examples/RLHF/models/infer_model_utils.py b/llm/Alignment/PPO/models/infer_model_utils.py
similarity index 100%
rename from examples/RLHF/models/infer_model_utils.py
rename to llm/Alignment/PPO/models/infer_model_utils.py
diff --git a/examples/RLHF/models/model_pp.py b/llm/Alignment/PPO/models/model_pp.py
similarity index 100%
rename from examples/RLHF/models/model_pp.py
rename to llm/Alignment/PPO/models/model_pp.py
diff --git a/examples/RLHF/models/pp_model_utils.py b/llm/Alignment/PPO/models/pp_model_utils.py
similarity index 100%
rename from examples/RLHF/models/pp_model_utils.py
rename to llm/Alignment/PPO/models/pp_model_utils.py
diff --git a/examples/RLHF/models/ppo_model.py b/llm/Alignment/PPO/models/ppo_model.py
similarity index 100%
rename from examples/RLHF/models/ppo_model.py
rename to llm/Alignment/PPO/models/ppo_model.py
diff --git a/examples/RLHF/models/ppo_model_utils.py b/llm/Alignment/PPO/models/ppo_model_utils.py
similarity index 100%
rename from examples/RLHF/models/ppo_model_utils.py
rename to llm/Alignment/PPO/models/ppo_model_utils.py
diff --git a/examples/RLHF/models/score_model.py b/llm/Alignment/PPO/models/score_model.py
similarity index 100%
rename from examples/RLHF/models/score_model.py
rename to llm/Alignment/PPO/models/score_model.py
diff --git a/examples/RLHF/models/score_model_utils.py b/llm/Alignment/PPO/models/score_model_utils.py
similarity index 100%
rename from examples/RLHF/models/score_model_utils.py
rename to llm/Alignment/PPO/models/score_model_utils.py
diff --git a/examples/RLHF/ppo_main.py b/llm/Alignment/PPO/ppo_main.py
similarity index 100%
rename from examples/RLHF/ppo_main.py
rename to llm/Alignment/PPO/ppo_main.py
diff --git a/examples/RLHF/ppo_trainer.py b/llm/Alignment/PPO/ppo_trainer.py
similarity index 100%
rename from examples/RLHF/ppo_trainer.py
rename to llm/Alignment/PPO/ppo_trainer.py
diff --git a/examples/RLHF/tests/run_model.py b/llm/Alignment/PPO/tests/run_model.py
similarity index 100%
rename from examples/RLHF/tests/run_model.py
rename to llm/Alignment/PPO/tests/run_model.py
diff --git a/examples/RLHF/tests/test_export.py b/llm/Alignment/PPO/tests/test_export.py
similarity index 100%
rename from examples/RLHF/tests/test_export.py
rename to llm/Alignment/PPO/tests/test_export.py
diff --git a/examples/RLHF/trainer_utils.py b/llm/Alignment/PPO/trainer_utils.py
similarity index 100%
rename from examples/RLHF/trainer_utils.py
rename to llm/Alignment/PPO/trainer_utils.py
diff --git a/examples/RLHF/README.md b/llm/Alignment/README.md
similarity index 82%
rename from examples/RLHF/README.md
rename to llm/Alignment/README.md
index 478f8365833b..fbf978dc208c 100644
--- a/examples/RLHF/README.md
+++ b/llm/Alignment/README.md
@@ -1,4 +1,4 @@
-# RLHF PPO
+# Alignment
 
 This directory provides code and a complete usage example for aligning LLMs with human preferences via the reinforcement-learning PPO algorithm, supporting **3D distributed parallel training and generation acceleration with inference optimizations during the rollout stage**. The PPO implementation details draw on the PPO implementation in [PKU-Alignment/safe-rlhf](https://github.com/PKU-Alignment/safe-rlhf) (PKU Beaver) and support common PPO training-stabilization strategies such as reward normalization and pretraining loss; the example uses some of the datasets and models provided by PKU-Alignment/safe-rlhf. Further improvements and extensions are planned to support better-quality, lower-cost, higher-performance, and larger-scale RLHF.
 
@@ -8,30 +8,37 @@
 
 ```
 .
-├── reward_main.py           # reward model training script
-├── reward_config.json       # reward model training config file
-├── reward_trainer.py        # reward model training executor script
-├── ppo_main.py              # RLHF training script
-├── ppo_config.json          # RLHF training config file
-├── ppo_trainer.py           # RLHF training executor script
-├── ppo_config.json          # RLHF training config file
-├── trainer_utils.py         # Trainer patches and utilities
-├── infer_utils.py           # generation acceleration utilities
-├── data                     # dataset directory
-│   └── base.py              # dataset base classes and utilities
-│   └── alpaca.py            # alpaca (raw) dataset
-│   └── safe_rlhf.py         # safe_rlhf (raw) dataset
-│   └── preference.py        # preference dataset
-│   └── prompt_only.py       # prompt-only dataset
-│   └── supervised.py        # supervised dataset
-├── models                   # model directory
-│   └── score_model_utils.py # score model base classes and utilities
-│   └── score_model.py       # score model definition
-│   └── ppo_model_utils.py   # PPO loss and other model strategies
-│   └── pp_model_utils.py    # pipeline-parallel patches and utilities
-│   └── model_pp.py          # pipeline-parallel model
-│   └── infer_model_utils.py # inference acceleration model patches and utilities
-└── README.md
+├── PPO                          # PPO training directory
+│   ├── comm_utils.py            # communication utilities
+│   ├── data                     # dataset directory
+│   │   ├── alpaca.py            # alpaca (raw) dataset
+│   │   ├── base.py              # dataset base classes and utilities
+│   │   ├── __init__.py
+│   │   ├── preference.py        # preference dataset
+│   │   ├── prompt_only.py       # prompt-only dataset
+│   │   ├── safe_rlhf.py         # safe_rlhf (raw) dataset
+│   │   └── supervised.py        # supervised dataset
+│   ├── infer_utils.py           # generation acceleration utilities
+│   ├── models                   # model directory
+│   │   ├── infer_model_utils.py # inference acceleration model patches and utilities
+│   │   ├── __init__.py
+│   │   ├── model_pp.py          # pipeline-parallel model
+│   │   ├── pp_model_utils.py    # pipeline-parallel patches and utilities
+│   │   ├── ppo_model.py         # PPO model implementations
+│   │   ├── ppo_model_utils.py   # PPO loss and other model strategies
+│   │   ├── score_model.py       # score model definition
+│   │   └── score_model_utils.py # score model base classes and utilities
+│   ├── ppo_main.py              # RLHF training script
+│   ├── ppo_trainer.py           # RLHF training executor script
+│   ├── tests                    # tests directory
+│   │   ├── run_model.py
+│   │   └── test_export.py
+│   └── trainer_utils.py         # Trainer patches and utilities
+├── README.md
+└── RM                           # Reward Model training directory
+    ├── models -> ../PPO/models
+    ├── reward_main.py           # reward model training script
+    └── reward_trainer.py        # reward model training executor script
 ```
 
 ### Environment Preparation
@@ -172,13 +179,14 @@ The complete PPO training process consists of the following 3 stages, as shown in the figure below (from [Dee
 
 2. Reward Model Fine-Tuning
 
-Use the `reward_main.py` script to train the reward model according to `reward_config.json`:
+Use the `reward_main.py` script to train the reward model according to `rm.json`:
 
 ```
-python -u -m paddle.distributed.launch reward_main.py ./reward_config.json
+cd RM
+python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json
 ```
 
-Most parameters in `reward_config.json` have the same meaning as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the slight difference is that `train_datasets`/`eval_datasets` specify the training and validation sets via the `NAME` attribute used when registering the dataset definition. In addition, reward model training has the following special parameters (using the defaults from PKU-Alignment/PKU-SafeRLHF):
+Most parameters in `rm.json` have the same meaning as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the slight difference is that `train_datasets`/`eval_datasets` specify the training and validation sets via the `NAME` attribute used when registering the dataset definition. In addition, reward model training has the following special parameters (using the defaults from PKU-Alignment/PKU-SafeRLHF):
 
 - `normalize_score_during_training`: whether to normalize rewards during training; defaults to `False`.
 - `normalizer_type`: how mean and var are computed when using the normalizer; one of `"RunningMeanStd", "ExponentialMovingAverage"`.
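An aside on the reward-model options above: the two `normalizer_type` choices, `"RunningMeanStd"` and `"ExponentialMovingAverage"`, differ only in how the mean/var statistics used to normalize scores are updated. The sketch below illustrates the two update rules in isolation; the class and method names are illustrative only and are not the ones defined in `reward_trainer.py` or the score model utilities.

```python
import numpy as np

class RunningMeanStd:
    """Exact running mean/variance over every score seen so far."""

    def __init__(self):
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def update(self, scores: np.ndarray) -> None:
        b_mean, b_var, b_count = scores.mean(), scores.var(), scores.size
        delta, total = b_mean - self.mean, self.count + b_count
        self.mean += delta * b_count / total
        # Parallel-variance combination of the old and batch statistics.
        self.var = (self.var * self.count + b_var * b_count
                    + delta ** 2 * self.count * b_count / total) / total
        self.count = total

class ExponentialMovingAverage:
    """Discounts old statistics with a momentum hyperparameter."""

    def __init__(self, momentum: float = 0.9):
        self.momentum, self.mean, self.var = momentum, 0.0, 1.0

    def update(self, scores: np.ndarray) -> None:
        m = self.momentum
        self.mean = m * self.mean + (1 - m) * scores.mean()
        self.var = m * self.var + (1 - m) * scores.var()

def normalize(scores: np.ndarray, stats) -> np.ndarray:
    # Either statistics object above can be plugged in here.
    return (scores - stats.mean) / np.sqrt(stats.var + 1e-8)
```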
@@ -188,14 +196,15 @@ 3. RLHF:
 
-The RLHF stage requires four models: actor model, reference model, critic model, and reward model. The actor model and reference model are initialized from the SFT model (the reference model is kept frozen); the critic model and reward model are initialized from the reward model (the reward model is kept frozen). Note that if the SFT model was trained with LoRA, merge the LoRA weights first. Here we use the SFT model ([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced)) and the reward model ([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward); note that this model only targets helpfulness and does not consider harmlessness) provided by PKU-Alignment/PKU-SafeRLHF as an example, and run RLHF training with the `ppo_main.py` script according to `ppo_config.json`.
+The RLHF stage requires four models: actor model, reference model, critic model, and reward model. The actor model and reference model are initialized from the SFT model (the reference model is kept frozen); the critic model and reward model are initialized from the reward model (the reward model is kept frozen). Note that if the SFT model was trained with LoRA, merge the LoRA weights first. Here we use the SFT model ([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced)) and the reward model ([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward); note that this model only targets helpfulness and does not consider harmlessness) provided by PKU-Alignment/PKU-SafeRLHF as an example, and run RLHF training with the `ppo_main.py` script according to `ppo.json`.
 
 ```
 # Type-promotion warnings are temporarily silenced via loglevel; to be fixed later
-GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ./ppo_config.json
+cd PPO
+PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ../../config/llama/ppo.json
 ```
 
-Most parameters in `ppo_config.json` have the same meaning as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the key parameters and their meanings are listed below (using the defaults from PKU-Alignment/PKU-SafeRLHF):
+Most parameters in `ppo.json` have the same meaning as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the key parameters and their meanings are listed below (using the defaults from PKU-Alignment/PKU-SafeRLHF):
 
 - `train_datasets`: specifies the training set via the `NAME` attribute used when registering the dataset definition.
 - `eval_datasets`: specifies the validation set via the `NAME` attribute used when registering the dataset definition.
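For orientation on the four models described in the README hunk above: in the usual PPO-RLHF formulation, the frozen reward model scores each sampled response, the frozen reference model anchors the trainable actor through a KL penalty, and the trainable critic estimates values for advantage computation. The sketch below shows only that generic formulation, not a transcription of what `ppo_trainer.py` computes, and `kl_coeff` is an illustrative name.

```python
import numpy as np

def shaped_rewards(reward_score: float,
                   actor_logprobs: np.ndarray,
                   ref_logprobs: np.ndarray,
                   kl_coeff: float = 0.1) -> np.ndarray:
    """Per-token rewards for one generated response.

    actor model     -> actor_logprobs (trainable; initialized from the SFT model)
    reference model -> ref_logprobs   (frozen copy of the SFT model)
    reward model    -> reward_score   (frozen; scores the whole response)
    critic model    -> fits value estimates to returns built from these rewards
                       (trainable; initialized from the reward model)
    """
    # Penalize drift of the actor away from the SFT/reference policy.
    rewards = -kl_coeff * (actor_logprobs - ref_logprobs)
    # Credit the scalar reward-model score at the final generated token.
    rewards[-1] += reward_score
    return rewards
```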
"/root/paddlejob/workspace/guosheng/checkpoints/llama_sft_ckpts-test", + "output_dir": "checkpoints/llama_rm", "per_device_train_batch_size": 16, "gradient_accumulation_steps": 1, "per_device_eval_batch_size": 16,