% Exported BibTeX citation (stray web-page navigation text removed).
% arXiv preprint: the identifier is stored in eprint/archiveprefix rather than a journal field, and GRPO is braced so styles keep its capitals.
@misc{xu2026beyond,
  author        = {Xu, Yuanda and Sang, Hejian and Zhou, Zhengze and He, Ran and Wang, Zhipeng and Geramifard, Alborz},
  title         = {Beyond {GRPO} and On-Policy Distillation: An Empirical Sparse-to-Dense Reward Principle for Language-Model Post-Training},
  year          = {2026},
  eprint        = {2605.12483},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2605.12483},
}