[PaddleNLP 3.0] Refactor RLHF #8617

Merged

merged 5 commits on Jun 19, 2024

Changes from all commits
15 changes: 0 additions & 15 deletions examples/RLHF/run.sh

This file was deleted.

1 change: 0 additions & 1 deletion llm/.gitignore
@@ -3,7 +3,6 @@ infer.json
output.json

# data
data/
AdvertiseGen.tar.gz

# checkpoints
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
71 changes: 40 additions & 31 deletions examples/RLHF/README.md → llm/Alignment/README.md
@@ -1,4 +1,4 @@
# RLHF PPO
# Alignment

This directory provides code and a complete usage example for aligning LLMs with human preferences using the reinforcement-learning PPO algorithm, with support for **3D distributed parallel training and accelerated generation via inference optimizations during the rollout stage**. The PPO implementation details draw on the PPO implementation in [PKU-Alignment/safe-rlhf](https://github.com/PKU-Alignment/safe-rlhf) (PKU Beaver), and common PPO training-stabilization strategies such as reward normalization and pretraining loss are supported; the example uses some of the datasets and models provided by PKU-Alignment/safe-rlhf. The code will continue to be extended to support RLHF with better results, lower cost, higher performance, and larger scale.
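
To make the PPO stabilization strategies mentioned above concrete, here is a minimal sketch of a clipped PPO policy loss combined with an auxiliary pretraining (ptx) language-modeling loss. It is an illustration rather than the repository's implementation; tensor shapes, the `ptx_coeff` value, and the function names are assumptions.

```python
import paddle
import paddle.nn.functional as F


def ppo_policy_loss(log_probs, old_log_probs, advantages, clip_ratio=0.2):
    """Clipped PPO surrogate loss over the generated response tokens."""
    ratio = paddle.exp(log_probs - old_log_probs)  # pi_new / pi_old per token
    clipped = paddle.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    return -paddle.mean(paddle.minimum(ratio * advantages, clipped * advantages))


def actor_loss_with_ptx(log_probs, old_log_probs, advantages,
                        ptx_logits, ptx_labels, ptx_coeff=16.0):
    """PPO loss plus a language-modeling loss on pretraining/SFT data,
    which helps keep the actor close to its supervised behavior."""
    pg_loss = ppo_policy_loss(log_probs, old_log_probs, advantages)
    lm_loss = F.cross_entropy(
        ptx_logits.reshape([-1, ptx_logits.shape[-1]]), ptx_labels.reshape([-1])
    )
    return pg_loss + ptx_coeff * lm_loss
```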

@@ -8,30 +8,37 @@

```
.
├── reward_main.py                  # reward model training script
├── reward_config.json              # reward model training config file
├── reward_trainer.py               # reward training Trainer script
├── ppo_main.py                     # RLHF training script
├── ppo_config.json                 # RLHF training config file
├── ppo_trainer.py                  # RLHF training Trainer script
├── trainer_utils.py                # Trainer patches and utilities
├── infer_utils.py                  # generation acceleration utilities
├── data                            # dataset-related directory
│   └── base.py                     # dataset base class and utilities
│   └── alpaca.py                   # alpaca (raw) dataset
│   └── safe_rlhf.py                # safe_rlhf (raw) dataset
│   └── preference.py               # preference dataset
│   └── prompt_only.py              # prompt-only dataset
│   └── supervised.py               # supervised dataset
├── models                          # model-related directory
│   └── score_model_utils.py        # score model base class and utilities
│   └── score_model.py              # score model definition
│   └── ppo_model_utils.py          # PPO loss and other model strategies
│   └── pp_model_utils.py           # pipeline-parallel patches and utilities
│   └── model_pp.py                 # pipeline-parallel model
│   └── infer_model_utils.py        # inference-acceleration model patches and utilities
└── README.md
├── PPO                              # PPO training directory
│   ├── comm_utils.py                # communication utilities
│   ├── data                         # dataset-related directory
│   │   ├── alpaca.py                # alpaca (raw) dataset
│   │   ├── base.py                  # dataset base class and utilities
│   │   ├── __init__.py
│   │   ├── preference.py            # preference dataset
│   │   ├── prompt_only.py           # prompt-only dataset
│   │   ├── safe_rlhf.py             # safe_rlhf (raw) dataset
│   │   └── supervised.py            # supervised dataset
│   ├── infer_utils.py               # generation acceleration utilities
│   ├── models                       # model-related directory
│   │   ├── infer_model_utils.py     # inference-acceleration model patches and utilities
│   │   ├── __init__.py
│   │   ├── model_pp.py              # pipeline-parallel model
│   │   ├── pp_model_utils.py        # pipeline-parallel patches and utilities
│   │   ├── ppo_model.py             # PPO model implementations
│   │   ├── ppo_model_utils.py       # PPO loss and other model strategies
│   │   ├── score_model.py           # score model definition
│   │   └── score_model_utils.py     # score model base class and utilities
│   ├── ppo_main.py                  # RLHF training script
│   ├── ppo_trainer.py               # RLHF training Trainer script
│   ├── tests                        # test-related directory
│   │   ├── run_model.py
│   │   └── test_export.py
│   └── trainer_utils.py             # Trainer patches and utilities
├── README.md
└── RM                               # Reward Model training directory
    ├── models -> ../PPO/models
    ├── reward_main.py               # reward model training script
    └── reward_trainer.py            # reward training Trainer script
```

### Environment Setup
@@ -172,13 +179,14 @@ The complete PPO training process consists of the following 3 stages, as shown in the figure below (from [Dee

2. Reward Model Fine-Tuning

Use the `reward_main.py` script to train the reward model according to `reward_config.json`
Use the `reward_main.py` script to train the reward model according to `rm.json`

```
python -u -m paddle.distributed.launch reward_main.py ./reward_config.json
cd RM
python -u -m paddle.distributed.launch reward_main.py ../../config/llama/rm.json
```

The meanings of most parameters in `reward_config.json` are the same as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here. One slight difference is that `train_datasets`/`eval_datasets` specify the training and validation sets via the `NAME` attribute given when the dataset is registered. In addition, reward model training has the following special parameters (default values follow PKU-Alignment/PKU-SafeRLHF):
The meanings of most parameters in `rm.json` are the same as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here. One slight difference is that `train_datasets`/`eval_datasets` specify the training and validation sets via the `NAME` attribute given when the dataset is registered. In addition, reward model training has the following special parameters (default values follow PKU-Alignment/PKU-SafeRLHF):

- `normalize_score_during_training`: whether to normalize rewards during training; defaults to `False`.
- `normalizer_type`: how mean and var are computed when a normalizer is used; one of `"RunningMeanStd", "ExponentialMovingAverage"` (see the sketch below).
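
The two normalizer choices can be pictured with the following minimal sketch (an assumed interface for illustration, not the repository's implementation): both keep running statistics of reward scores and use them to normalize new scores, differing only in how the statistics are updated.

```python
import numpy as np


class RunningMeanStd:
    """Mean/variance over all scores seen so far (Welford-style batch merge)."""

    def __init__(self):
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def update(self, scores):
        scores = np.asarray(scores, dtype=np.float64)
        batch_mean, batch_var, n = scores.mean(), scores.var(), scores.size
        delta, total = batch_mean - self.mean, self.count + n
        self.mean += delta * n / total
        self.var = (self.var * self.count + batch_var * n
                    + delta ** 2 * self.count * n / total) / total
        self.count = total

    def normalize(self, scores):
        return (scores - self.mean) / (self.var ** 0.5 + 1e-8)


class ExponentialMovingAverage(RunningMeanStd):
    """Exponentially weighted mean/variance; recent batches dominate."""

    def __init__(self, momentum=0.9):  # momentum value is an assumption
        super().__init__()
        self.momentum = momentum

    def update(self, scores):
        scores = np.asarray(scores, dtype=np.float64)
        self.mean = self.momentum * self.mean + (1 - self.momentum) * scores.mean()
        self.var = self.momentum * self.var + (1 - self.momentum) * scores.var()
```
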
@@ -188,14 +196,15 @@

3. RLHF:

The RLHF stage requires four models: the actor model, reference model, critic model, and reward model. The actor model and reference model are initialized from the SFT model (the reference model is frozen), while the critic model and reward model are initialized from the reward model (the reward model is frozen); also note that if the SFT model uses LoRA, merge the LoRA weights first. As an example, we use the SFT model ([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced)) and the reward model ([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward); note that this model only considers helpfulness, not harmlessness) provided by PKU-Alignment/PKU-SafeRLHF, and run RLHF training with the `ppo_main.py` script according to `ppo_config.json`.
The RLHF stage requires four models: the actor model, reference model, critic model, and reward model. The actor model and reference model are initialized from the SFT model (the reference model is frozen), while the critic model and reward model are initialized from the reward model (the reward model is frozen); also note that if the SFT model uses LoRA, merge the LoRA weights first. As an example, we use the SFT model ([PKU-Alignment/alpaca-7b-reproduced](https://huggingface.co/PKU-Alignment/alpaca-7b-reproduced)) and the reward model ([PKU-Alignment/beaver-7b-v1.0-reward](https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward); note that this model only considers helpfulness, not harmlessness) provided by PKU-Alignment/PKU-SafeRLHF, and run RLHF training with the `ppo_main.py` script according to `ppo.json`.

```
# Type-promotion warnings are temporarily silenced via the log level; to be fixed later
GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ./ppo_config.json
cd PPO
PYTHONPATH=../../ GLOG_minloglevel=2 python -u -m paddle.distributed.launch ppo_main.py ../../config/llama/ppo.json
```
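
As a rough illustration of how the four models in this stage relate to the two checkpoints above, here is a schematic sketch; the variable names, the freezing code, and the commented-out score-model loader are assumptions rather than the exact `ppo_main.py` code.

```python
from paddlenlp.transformers import AutoModelForCausalLM

sft_ckpt = "PKU-Alignment/alpaca-7b-reproduced"
reward_ckpt = "PKU-Alignment/beaver-7b-v1.0-reward"

actor = AutoModelForCausalLM.from_pretrained(sft_ckpt)      # updated by PPO
reference = AutoModelForCausalLM.from_pretrained(sft_ckpt)  # frozen KL anchor
for p in reference.parameters():
    p.stop_gradient = True  # freeze the reference model

# The critic and reward models are score models (see PPO/models/score_model.py);
# a score-model loader class is assumed here:
# critic = ScoreModel.from_pretrained(reward_ckpt)   # updated by PPO
# reward = ScoreModel.from_pretrained(reward_ckpt)   # frozen scorer
```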

The meanings of most parameters in `ppo_config.json` are the same as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the key parameters are explained below (default values follow PKU-Alignment/PKU-SafeRLHF):
The meanings of most parameters in `ppo.json` are the same as in [LLM fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm#2-%E7%B2%BE%E8%B0%83) and are not repeated here; the key parameters are explained below (default values follow PKU-Alignment/PKU-SafeRLHF):

- `train_datasets`: specifies the training set via the `NAME` attribute given when the dataset is registered.
- `eval_datasets`: specifies the validation set via the `NAME` attribute given when the dataset is registered (see the registration sketch below).
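
The `NAME`-based lookup used by `train_datasets`/`eval_datasets` can be pictured with the following schematic sketch; the class and attribute names are modeled on the PKU-Alignment/safe-rlhf data interface and are assumptions, not the exact code under `PPO/data/`.

```python
class RawDataset:
    """Raw datasets register themselves under their NAME class attribute."""

    NAME = None
    _REGISTRY = {}

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        if cls.NAME is not None:
            RawDataset._REGISTRY[cls.NAME] = cls

    @classmethod
    def load(cls, name):
        # `name` is the value carried by train_datasets / eval_datasets in the
        # json config, e.g. "PKU-SafeRLHF-30K/train".
        return RawDataset._REGISTRY[name]()


class SafeRLHFTrain30K(RawDataset):
    NAME = "PKU-SafeRLHF-30K/train"
```
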
1 change: 1 addition & 0 deletions llm/Alignment/RM/models
File renamed without changes.
File renamed without changes.
@@ -4,7 +4,7 @@
"ptx_datasets": "alpaca",
"actor_model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"reward_model_name_or_path": "PKU-Alignment/beaver-7b-v1.0-reward",
"output_dir": "/root/paddlejob/workspace/guosheng/ckpts/ppo-reshard-sd38",
"output_dir": "checkpoints/llm_ppo",
"max_length": 512,
"top_p": 0.8,
"temperature": 1.0,
@@ -2,7 +2,7 @@
"model_name_or_path": "PKU-Alignment/alpaca-7b-reproduced",
"train_datasets": "PKU-SafeRLHF-30K/train",
"eval_datasets": "PKU-SafeRLHF-30K/test",
"output_dir": "/root/paddlejob/workspace/guosheng/checkpoints/llama_sft_ckpts-test",
"output_dir": "checkpoints/llama_rm",
"per_device_train_batch_size": 16,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 16,