Update README.md
Browse files
README.md
CHANGED
@@ -1,14 +1,17 @@
|
|
1 |
The reward model may be used for iterative SFT/DPO
|
2 |
|
|
|
3 |
@article{dong2023raft,
|
4 |
title={{RAFT}: Reward ranked finetuning for generative foundation model alignment},
|
5 |
author={Dong, Hanze and Xiong, Wei and Goyal, Deepanshu and Pan, Rui and Diao, Shizhe and Zhang, Jipeng and Shum, Kashun and Zhang, Tong},
|
6 |
journal={arXiv preprint arXiv:2304.06767},
|
7 |
year={2023}
|
8 |
}
|
|
|
9 |
@article{xiong2023gibbs,
|
10 |
title={Gibbs sampling from human feedback: A provable {KL}-constrained framework for {RLHF}},
|
11 |
author={Xiong, Wei and Dong, Hanze and Ye, Chenlu and Zhong, Han and Jiang, Nan and Zhang, Tong},
|
12 |
journal={arXiv preprint arXiv:2312.11456},
|
13 |
year={2023}
|
14 |
-
}
|
|
|
|
1 |
The reward model may be used for iterative SFT/DPO
|
2 |
|
3 |
+
```
|
4 |
@article{dong2023raft,
|
5 |
title={{RAFT}: Reward ranked finetuning for generative foundation model alignment},
|
6 |
author={Dong, Hanze and Xiong, Wei and Goyal, Deepanshu and Pan, Rui and Diao, Shizhe and Zhang, Jipeng and Shum, Kashun and Zhang, Tong},
|
7 |
journal={arXiv preprint arXiv:2304.06767},
|
8 |
year={2023}
|
9 |
}
|
10 |
+
|
11 |
@article{xiong2023gibbs,
|
12 |
title={Gibbs sampling from human feedback: A provable {KL}-constrained framework for {RLHF}},
|
13 |
author={Xiong, Wei and Dong, Hanze and Ye, Chenlu and Zhong, Han and Jiang, Nan and Zhang, Tong},
|
14 |
journal={arXiv preprint arXiv:2312.11456},
|
15 |
year={2023}
|
16 |
+
}
|
17 |
+
```
|