|
|
\begin{thebibliography}{90} |
|
|
\providecommand{\natexlab}[1]{#1} |
|
|
\providecommand{\url}[1]{\texttt{#1}} |
|
|
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi
|
|
|
|
|
\bibitem[Achiam et~al.(2017)Achiam, Held, Tamar, and Abbeel]{achiam2017constrained} |
|
|
Joshua Achiam, David Held, Aviv Tamar, and Pieter Abbeel. |
|
|
\newblock Constrained policy optimization. |
|
|
\newblock In \emph{International Conference on Machine Learning (ICML)}, pp.\ 22--31. PMLR, 2017. |
|
|
|
|
|
\bibitem[Altman(1999)]{altman1999constrained} |
|
|
Eitan Altman. |
|
|
\newblock \emph{Constrained Markov decision processes: stochastic modeling}. |
|
|
\newblock Routledge, 1999. |
|
|
|
|
|
\bibitem[Arora et~al.(2019)Arora, Du, Hu, Li, and Wang]{arora2019fine} |
|
|
Sanjeev Arora, Simon Du, Wei Hu, Zhiyuan Li, and Ruosong Wang. |
|
|
\newblock Fine-grained analysis of optimization and generalization for overparameterized two-layer neural networks. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 322--332. PMLR, 2019. |
|
|
|
|
|
\bibitem[Auer et~al.(2002)Auer, Cesa-Bianchi, and Fischer]{auer2002finite} |
|
|
Peter Auer, Nicolo Cesa-Bianchi, and Paul Fischer. |
|
|
\newblock Finite-time analysis of the multiarmed bandit problem. |
|
|
\newblock \emph{Machine learning}, 47\penalty0 (2):\penalty0 235--256, 2002. |
|
|
|
|
|
\bibitem[Bai et~al.(2021)Bai, Bedi, Agarwal, Koppel, and Aggarwal]{bai2021achieving} |
|
|
Qinbo Bai, Amrit~Singh Bedi, Mridul Agarwal, Alec Koppel, and Vaneet Aggarwal. |
|
|
\newblock Achieving zero constraint violation for constrained reinforcement learning via primal-dual approach. |
|
|
\newblock \emph{arXiv preprint arXiv:2109.06332}, 2021. |
|
|
|
|
|
\bibitem[Balcan et~al.(2015)Balcan, Blum, and Vempala]{balcan2015efficient} |
|
|
Maria-Florina Balcan, Avrim Blum, and Santosh Vempala. |
|
|
\newblock Efficient representations for lifelong learning and autoencoding. |
|
|
\newblock In \emph{Conference on Learning Theory}, pp.\ 191--210. PMLR, 2015. |
|
|
|
|
|
\bibitem[Balcan et~al.(2019)Balcan, Khodak, and Talwalkar]{balcan2019provable} |
|
|
Maria-Florina Balcan, Mikhail Khodak, and Ameet Talwalkar. |
|
|
\newblock Provable guarantees for gradient-based meta-learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 424--433. PMLR, 2019. |
|
|
|
|
|
\bibitem[Balcan et~al.(2021)Balcan, Khodak, Sharma, and Talwalkar]{balcan2021learning} |
|
|
Maria-Florina~F Balcan, Mikhail Khodak, Dravyansh Sharma, and Ameet Talwalkar. |
|
|
\newblock Learning-to-learn non-convex piecewise-{L}ipschitz functions.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Bedi et~al.(2018)Bedi, Sarma, and Rajawat]{bedi2018tracking} |
|
|
Amrit~Singh Bedi, Paban Sarma, and Ketan Rajawat. |
|
|
\newblock Tracking moving agents via inexact online gradient descent algorithm. |
|
|
\newblock \emph{IEEE Journal of Selected Topics in Signal Processing}, 12\penalty0 (1):\penalty0 202--217, 2018. |
|
|
|
|
|
\bibitem[Besbes et~al.(2015)Besbes, Gur, and Zeevi]{besbes2015non} |
|
|
Omar Besbes, Yonatan Gur, and Assaf Zeevi. |
|
|
\newblock Non-stationary stochastic optimization. |
|
|
\newblock \emph{Operations research}, 63\penalty0 (5):\penalty0 1227--1244, 2015. |
|
|
|
|
|
\bibitem[Bhandari et~al.(2018)Bhandari, Russo, and Singal]{bhandari2018finite} |
|
|
Jalaj Bhandari, Daniel Russo, and Raghav Singal. |
|
|
\newblock A finite time analysis of temporal difference learning with linear function approximation. |
|
|
\newblock In \emph{Conference on learning theory}, pp.\ 1691--1692. PMLR, 2018. |
|
|
|
|
|
\bibitem[Bhatnagar \& Lakshmanan(2012)Bhatnagar and Lakshmanan]{bhatnagar2012online} |
|
|
Shalabh Bhatnagar and K~Lakshmanan. |
|
|
\newblock An online actor--critic algorithm with function approximation for constrained {M}arkov decision processes.
|
|
\newblock \emph{Journal of Optimization Theory and Applications}, 153\penalty0 (3):\penalty0 688--708, 2012. |
|
|
|
|
|
\bibitem[Bolte et~al.(2007)Bolte, Daniilidis, Lewis, and Shiota]{bolte2007clarke} |
|
|
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Adrian Lewis, and Masahiro Shiota. |
|
|
\newblock Clarke subgradients of stratifiable functions. |
|
|
\newblock \emph{SIAM Journal on Optimization}, 18\penalty0 (2):\penalty0 556--572, 2007. |
|
|
|
|
|
\bibitem[Bolte et~al.(2010)Bolte, Daniilidis, Ley, and Mazet]{bolte2010characterizations} |
|
|
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Olivier Ley, and Laurent Mazet. |
|
|
\newblock Characterizations of {\l}ojasiewicz inequalities: subgradient flows, talweg, convexity. |
|
|
\newblock \emph{Transactions of the American Mathematical Society}, 362\penalty0 (6):\penalty0 3319--3363, 2010. |
|
|
|
|
|
\bibitem[Borkar(2005)]{borkar2005actor} |
|
|
Vivek~S Borkar. |
|
|
\newblock An actor-critic algorithm for constrained {M}arkov decision processes.
|
|
\newblock \emph{Systems \& control letters}, 54\penalty0 (3):\penalty0 207--213, 2005. |
|
|
|
|
|
\bibitem[Brockman et~al.(2016)Brockman, Cheung, Pettersson, Schneider, Schulman, Tang, and Zaremba]{brockman2016openai} |
|
|
Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba. |
|
|
\newblock {OpenAI} gym. |
|
|
\newblock \emph{arXiv preprint arXiv:1606.01540}, 2016. |
|
|
|
|
|
\bibitem[Cesa-Bianchi et~al.(2011)Cesa-Bianchi, Shalev-Shwartz, and Shamir]{cesa2011online} |
|
|
Nicolo Cesa-Bianchi, Shai Shalev-Shwartz, and Ohad Shamir. |
|
|
\newblock Online learning of noisy data. |
|
|
\newblock \emph{IEEE Transactions on Information Theory}, 57\penalty0 (12):\penalty0 7907--7931, 2011. |
|
|
|
|
|
\bibitem[Chen et~al.(2021{\natexlab{a}})Chen, Hu, Jin, Li, and Wang]{chen2021understanding} |
|
|
Xiaoyu Chen, Jiachen Hu, Chi Jin, Lihong Li, and Liwei Wang. |
|
|
\newblock Understanding domain randomization for sim-to-real transfer. |
|
|
\newblock \emph{arXiv preprint arXiv:2110.03239}, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Chen et~al.(2021{\natexlab{b}})Chen, Dong, and Wang]{chen2021primal} |
|
|
Yi~Chen, Jing Dong, and Zhaoran Wang. |
|
|
\newblock A primal-dual approach to constrained {M}arkov decision processes.
|
|
\newblock \emph{arXiv preprint arXiv:2101.10895}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Chow et~al.(2017)Chow, Ghavamzadeh, Janson, and Pavone]{chow2017risk} |
|
|
Yinlam Chow, Mohammad Ghavamzadeh, Lucas Janson, and Marco Pavone. |
|
|
\newblock Risk-constrained reinforcement learning with percentile risk criteria. |
|
|
\newblock \emph{The Journal of Machine Learning Research}, 18\penalty0 (1):\penalty0 6070--6120, 2017. |
|
|
|
|
|
\bibitem[Chow et~al.(2018)Chow, Nachum, Du{\'e}{\~n}ez-Guzm{\'a}n, and Ghavamzadeh]{chow2018lyapunov} |
|
|
Yinlam Chow, Ofir Nachum, Edgar~A Du{\'e}{\~n}ez-Guzm{\'a}n, and Mohammad Ghavamzadeh. |
|
|
\newblock A {Lyapunov}-based approach to safe reinforcement learning. |
|
|
\newblock In \emph{Advances in Neural Information Processing Systems}, 2018. |
|
|
|
|
|
\bibitem[Davis et~al.(2020)Davis, Drusvyatskiy, Kakade, and Lee]{davis2020stochastic} |
|
|
Damek Davis, Dmitriy Drusvyatskiy, Sham Kakade, and Jason~D Lee. |
|
|
\newblock Stochastic subgradient method converges on tame functions. |
|
|
\newblock \emph{Foundations of computational mathematics}, 20\penalty0 (1):\penalty0 119--154, 2020. |
|
|
|
|
|
\bibitem[De~Nijs et~al.(2021)De~Nijs, Walraven, De~Weerdt, and Spaan]{de2021constrained} |
|
|
Frits De~Nijs, Erwin Walraven, Mathijs De~Weerdt, and Matthijs Spaan. |
|
|
\newblock Constrained multiagent {M}arkov decision processes: A taxonomy of problems and algorithms.
|
|
\newblock \emph{Journal of Artificial Intelligence Research}, 70:\penalty0 955--1001, 2021. |
|
|
|
|
|
\bibitem[Denevi et~al.(2019)Denevi, Ciliberto, Grazzi, and Pontil]{denevi2019learning} |
|
|
Giulia Denevi, Carlo Ciliberto, Riccardo Grazzi, and Massimiliano Pontil. |
|
|
\newblock Learning-to-learn stochastic gradient descent with biased regularization. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1566--1575. PMLR, 2019. |
|
|
|
|
|
\bibitem[Ding et~al.(2021{\natexlab{a}})Ding, Wei, Yang, Wang, and Jovanovic]{ding2021provably} |
|
|
Dongsheng Ding, Xiaohan Wei, Zhuoran Yang, Zhaoran Wang, and Mihailo Jovanovic. |
|
|
\newblock Provably efficient safe exploration via primal-dual policy optimization. |
|
|
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, pp.\ 3304--3312. PMLR, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Ding \& Lavaei(2022)Ding and Lavaei]{ding2022provably} |
|
|
Yuhao Ding and Javad Lavaei. |
|
|
\newblock Provably efficient primal-dual reinforcement learning for {CMDP}s with non-stationary objectives and constraints. |
|
|
\newblock \emph{arXiv preprint arXiv:2201.11965}, 2022. |
|
|
|
|
|
\bibitem[Ding et~al.(2021{\natexlab{b}})Ding, Zhang, and Lavaei]{ding2021beyond} |
|
|
Yuhao Ding, Junzi Zhang, and Javad Lavaei. |
|
|
\newblock Beyond exact gradients: Convergence of stochastic soft-max policy gradient methods with entropy regularization. |
|
|
\newblock \emph{arXiv preprint arXiv:2110.10117}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Dixit et~al.(2019)Dixit, Bedi, Tripathi, and Rajawat]{dixit2019online} |
|
|
Rishabh Dixit, Amrit~Singh Bedi, Ruchi Tripathi, and Ketan Rajawat. |
|
|
\newblock Online learning with inexact proximal online gradient descent algorithms. |
|
|
\newblock \emph{IEEE Transactions on Signal Processing}, 67\penalty0 (5):\penalty0 1338--1352, 2019. |
|
|
|
|
|
\bibitem[Drusvyatskiy \& Lewis(2018)Drusvyatskiy and Lewis]{drusvyatskiy2018error} |
|
|
Dmitriy Drusvyatskiy and Adrian~S Lewis. |
|
|
\newblock Error bounds, quadratic growth, and linear convergence of proximal methods. |
|
|
\newblock \emph{Mathematics of Operations Research}, 43\penalty0 (3):\penalty0 919--948, 2018. |
|
|
|
|
|
\bibitem[Du et~al.(2020)Du, Hu, Kakade, Lee, and Lei]{du2020few} |
|
|
Simon~Shaolei Du, Wei Hu, Sham~M Kakade, Jason~D Lee, and Qi~Lei. |
|
|
\newblock Few-shot learning via learning the representation, provably. |
|
|
\newblock In \emph{International Conference on Learning Representations}, 2020. |
|
|
|
|
|
\bibitem[Duan et~al.(2016)Duan, Schulman, Chen, Bartlett, Sutskever, and Abbeel]{duan2016rl} |
|
|
Yan Duan, John Schulman, Xi~Chen, Peter~L Bartlett, Ilya Sutskever, and Pieter Abbeel. |
|
|
\newblock $\text{RL}^2$: Fast reinforcement learning via slow reinforcement learning. |
|
|
\newblock \emph{arXiv preprint arXiv:1611.02779}, 2016. |
|
|
|
|
|
\bibitem[Duan et~al.(2020)Duan, Jia, and Wang]{duan2020minimax} |
|
|
Yaqi Duan, Zeyu Jia, and Mengdi Wang. |
|
|
\newblock Minimax-optimal off-policy evaluation with linear function approximation. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2701--2709. PMLR, 2020. |
|
|
|
|
|
\bibitem[Efroni et~al.(2020)Efroni, Mannor, and Pirotta]{efroni2020exploration} |
|
|
Yonathan Efroni, Shie Mannor, and Matteo Pirotta. |
|
|
\newblock Exploration-exploitation in constrained {MDP}s. |
|
|
\newblock \emph{arXiv preprint arXiv:2003.02189}, 2020. |
|
|
|
|
|
\bibitem[Fallah et~al.(2021)Fallah, Georgiev, Mokhtari, and Ozdaglar]{fallah2021convergence} |
|
|
Alireza Fallah, Kristian Georgiev, Aryan Mokhtari, and Asuman Ozdaglar. |
|
|
\newblock On the convergence theory of debiased model-agnostic meta-reinforcement learning. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Fan et~al.(2021)Fan, Ma, and Zhong]{fan2021selective} |
|
|
Jianqing Fan, Cong Ma, and Yiqiao Zhong. |
|
|
\newblock A selective overview of deep learning. |
|
|
\newblock \emph{Statistical science: a review journal of the Institute of Mathematical Statistics}, 36\penalty0 (2):\penalty0 264, 2021. |
|
|
|
|
|
\bibitem[Finn et~al.(2017)Finn, Abbeel, and Levine]{finn2017model} |
|
|
Chelsea Finn, Pieter Abbeel, and Sergey Levine. |
|
|
\newblock Model-agnostic meta-learning for fast adaptation of deep networks. |
|
|
\newblock In \emph{International conference on machine learning}, pp.\ 1126--1135. PMLR, 2017. |
|
|
|
|
|
\bibitem[Finn et~al.(2019)Finn, Rajeswaran, Kakade, and Levine]{finn2019online} |
|
|
Chelsea Finn, Aravind Rajeswaran, Sham Kakade, and Sergey Levine. |
|
|
\newblock Online meta-learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1920--1930. PMLR, 2019. |
|
|
|
|
|
\bibitem[Garc{\i}a \& Fern{\'a}ndez(2015)Garc{\i}a and Fern{\'a}ndez]{garcia2015comprehensive} |
|
|
Javier Garc{\i}a and Fernando Fern{\'a}ndez. |
|
|
\newblock A comprehensive survey on safe reinforcement learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 16\penalty0 (1):\penalty0 1437--1480, 2015. |
|
|
|
|
|
\bibitem[Geist et~al.(2019)Geist, Scherrer, and Pietquin]{geist2019theory} |
|
|
Matthieu Geist, Bruno Scherrer, and Olivier Pietquin. |
|
|
\newblock A theory of regularized {M}arkov decision processes.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2160--2169. PMLR, 2019. |
|
|
|
|
|
\bibitem[Gelada \& Bellemare(2019)Gelada and Bellemare]{gelada2019off} |
|
|
Carles Gelada and Marc~G Bellemare. |
|
|
\newblock Off-policy deep reinforcement learning by bootstrapping the covariate shift. |
|
|
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~33, pp.\ 3647--3655, 2019. |
|
|
|
|
|
\bibitem[Hazan et~al.(2016)]{hazan2016introduction} |
|
|
Elad Hazan et~al. |
|
|
\newblock Introduction to online convex optimization. |
|
|
\newblock \emph{Foundations and Trends{\textregistered} in Optimization}, 2\penalty0 (3-4):\penalty0 157--325, 2016. |
|
|
|
|
|
\bibitem[Hospedales et~al.(2020)Hospedales, Antoniou, Micaelli, and Storkey]{hospedales2020meta} |
|
|
Timothy Hospedales, Antreas Antoniou, Paul Micaelli, and Amos Storkey. |
|
|
\newblock Meta-learning in neural networks: A survey. |
|
|
\newblock \emph{arXiv preprint arXiv:2004.05439}, 2020. |
|
|
|
|
|
\bibitem[Ioffe(2009)]{ioffe2009invitation} |
|
|
Alexander~D Ioffe. |
|
|
\newblock An invitation to tame optimization. |
|
|
\newblock \emph{SIAM Journal on Optimization}, 19\penalty0 (4):\penalty0 1894--1917, 2009. |
|
|
|
|
|
\bibitem[Jadbabaie et~al.(2015)Jadbabaie, Rakhlin, Shahrampour, and Sridharan]{jadbabaie2015online} |
|
|
Ali Jadbabaie, Alexander Rakhlin, Shahin Shahrampour, and Karthik Sridharan. |
|
|
\newblock Online optimization: Competing with dynamic comparators. |
|
|
\newblock In \emph{Artificial Intelligence and Statistics}, pp.\ 398--406. PMLR, 2015. |
|
|
|
|
|
\bibitem[Jaderberg et~al.(2019)Jaderberg, Czarnecki, Dunning, Marris, Lever, Castaneda, Beattie, Rabinowitz, Morcos, Ruderman, et~al.]{jaderberg2019human} |
|
|
Max Jaderberg, Wojciech~M Czarnecki, Iain Dunning, Luke Marris, Guy Lever, Antonio~Garcia Castaneda, Charles Beattie, Neil~C Rabinowitz, Ari~S Morcos, Avraham Ruderman, et~al. |
|
|
\newblock Human-level performance in 3d multiplayer games with population-based reinforcement learning. |
|
|
\newblock \emph{Science}, 364\penalty0 (6443):\penalty0 859--865, 2019. |
|
|
|
|
|
\bibitem[Jean-Baptiste(2010)]{jean2010convex} |
|
|
HU~Jean-Baptiste. |
|
|
\newblock Convex analysis and minimization algorithms: advanced theory and bundle methods, 2010. |
|
|
|
|
|
\bibitem[Ji et~al.(2022)Ji, Yang, and Liang]{ji2022theoretical} |
|
|
Kaiyi Ji, Junjie Yang, and Yingbin Liang. |
|
|
\newblock Theoretical convergence of multi-step model-agnostic meta-learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 23\penalty0 (29):\penalty0 1--41, 2022. |
|
|
|
|
|
\bibitem[Johnstone \& Moulin(2020)Johnstone and Moulin]{johnstone2020faster} |
|
|
Patrick~R Johnstone and Pierre Moulin. |
|
|
\newblock Faster subgradient methods for functions with {H}{\"o}lderian growth.
|
|
\newblock \emph{Mathematical Programming}, 180\penalty0 (1):\penalty0 417--450, 2020. |
|
|
|
|
|
\bibitem[Khodak et~al.(2019)Khodak, Balcan, and Talwalkar]{khodak2019adaptive} |
|
|
Mikhail Khodak, Maria-Florina~F Balcan, and Ameet~S Talwalkar. |
|
|
\newblock Adaptive gradient-based meta-learning methods. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Kwon et~al.(2021)Kwon, Efroni, Caramanis, and Mannor]{kwon2021rl} |
|
|
Jeongyeol Kwon, Yonathan Efroni, Constantine Caramanis, and Shie Mannor. |
|
|
\newblock {RL} for latent {MDP}s: Regret guarantees and a lower bound. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Le et~al.(2019)Le, Voloshin, and Yue]{le2019batch} |
|
|
Hoang Le, Cameron Voloshin, and Yisong Yue. |
|
|
\newblock Batch policy learning under constraints. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 3703--3712. PMLR, 2019. |
|
|
|
|
|
\bibitem[Lee et~al.(2021)Lee, Jeon, Lee, Pineau, and Kim]{lee2021optidice} |
|
|
Jongmin Lee, Wonseok Jeon, Byungjun Lee, Joelle Pineau, and Kee-Eung Kim. |
|
|
\newblock {OptiDICE}: Offline policy optimization via stationary distribution correction estimation.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6120--6130. PMLR, 2021. |
|
|
|
|
|
\bibitem[Levin \& Peres(2017)Levin and Peres]{levin2017markov} |
|
|
David~A Levin and Yuval Peres. |
|
|
\newblock \emph{Markov chains and mixing times}, volume 107. |
|
|
\newblock American Mathematical Soc., 2017. |
|
|
|
|
|
\bibitem[Li \& Liang(2018)Li and Liang]{li2018learning} |
|
|
Yuanzhi Li and Yingyu Liang. |
|
|
\newblock Learning overparameterized neural networks via stochastic gradient descent on structured data. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018. |
|
|
|
|
|
\bibitem[Li et~al.(2017)Li, Zhou, Chen, and Li]{li2017meta} |
|
|
Zhenguo Li, Fengwei Zhou, Fei Chen, and Hang Li. |
|
|
\newblock Meta-{SGD}: Learning to learn quickly for few-shot learning. |
|
|
\newblock \emph{arXiv preprint arXiv:1707.09835}, 2017. |
|
|
|
|
|
\bibitem[Liu et~al.(2019)Liu, Socher, and Xiong]{liu2019taming} |
|
|
Hao Liu, Richard Socher, and Caiming Xiong. |
|
|
\newblock Taming {MAML}: Efficient unbiased meta-reinforcement learning.
|
|
\newblock In \emph{International conference on machine learning}, pp.\ 4061--4071. PMLR, 2019. |
|
|
|
|
|
\bibitem[Liu et~al.(2018)Liu, Li, Tang, and Zhou]{liu2018breaking} |
|
|
Qiang Liu, Lihong Li, Ziyang Tang, and Dengyong Zhou. |
|
|
\newblock Breaking the curse of horizon: Infinite-horizon off-policy estimation. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018. |
|
|
|
|
|
\bibitem[Liu et~al.(2021{\natexlab{a}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021learning} |
|
|
Tao Liu, Ruida Zhou, Dileep Kalathil, Panganamala Kumar, and Chao Tian. |
|
|
\newblock Learning policies with zero or bounded constraint violation for constrained {MDP}s. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Liu et~al.(2021{\natexlab{b}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021fast} |
|
|
Tao Liu, Ruida Zhou, Dileep Kalathil, PR~Kumar, and Chao Tian. |
|
|
\newblock Fast global convergence of policy optimization for constrained {MDP}s. |
|
|
\newblock \emph{arXiv preprint arXiv:2111.00552}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Maurer et~al.(2016)Maurer, Pontil, and Romera-Paredes]{maurer2016benefit} |
|
|
Andreas Maurer, Massimiliano Pontil, and Bernardino Romera-Paredes. |
|
|
\newblock The benefit of multitask representation learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 17\penalty0 (81):\penalty0 1--32, 2016. |
|
|
|
|
|
\bibitem[Mei et~al.(2020)Mei, Xiao, Szepesvari, and Schuurmans]{mei2020global} |
|
|
Jincheng Mei, Chenjun Xiao, Csaba Szepesvari, and Dale Schuurmans. |
|
|
\newblock On the global convergence rates of softmax policy gradient methods. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6820--6829. PMLR, 2020. |
|
|
|
|
|
\bibitem[Mitchell et~al.(2021)Mitchell, Rafailov, Peng, Levine, and Finn]{mitchell2021offline} |
|
|
Eric Mitchell, Rafael Rafailov, Xue~Bin Peng, Sergey Levine, and Chelsea Finn. |
|
|
\newblock Offline meta-reinforcement learning with advantage weighting. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 7780--7791. PMLR, 2021. |
|
|
|
|
|
\bibitem[Mokhtari et~al.(2016)Mokhtari, Shahrampour, Jadbabaie, and Ribeiro]{mokhtari2016online} |
|
|
Aryan Mokhtari, Shahin Shahrampour, Ali Jadbabaie, and Alejandro Ribeiro. |
|
|
\newblock Online optimization in dynamic environments: Improved regret rates for strongly convex problems. |
|
|
\newblock In \emph{2016 IEEE 55th Conference on Decision and Control}, pp.\ 7195--7201. IEEE, 2016. |
|
|
|
|
|
\bibitem[Nachum et~al.(2019)Nachum, Chow, Dai, and Li]{nachum2019dualdice} |
|
|
Ofir Nachum, Yinlam Chow, Bo~Dai, and Lihong Li. |
|
|
\newblock {DualDICE}: Behavior-agnostic estimation of discounted stationary distribution corrections.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Neyshabur et~al.(2019)Neyshabur, Li, Bhojanapalli, LeCun, and Srebro]{neyshabur2019towards} |
|
|
Behnam Neyshabur, Zhiyuan Li, Srinadh Bhojanapalli, Yann LeCun, and Nathan Srebro. |
|
|
\newblock Towards understanding the role of over-parametrization in generalization of neural networks. |
|
|
\newblock In \emph{International Conference on Learning Representations (ICLR)}, 2019. |
|
|
|
|
|
\bibitem[Paternain et~al.(2022)Paternain, Calvo-Fullana, Chamon, and Ribeiro]{paternain2022safe} |
|
|
Santiago Paternain, Miguel Calvo-Fullana, Luiz~FO Chamon, and Alejandro Ribeiro. |
|
|
\newblock Safe policies for reinforcement learning via primal-dual methods. |
|
|
\newblock \emph{IEEE Transactions on Automatic Control}, 2022. |
|
|
|
|
|
\bibitem[Resler \& Mansour(2019)Resler and Mansour]{resler2019adversarial} |
|
|
Alon Resler and Yishay Mansour. |
|
|
\newblock Adversarial online learning with noise. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 5429--5437. PMLR, 2019. |
|
|
|
|
|
\bibitem[Rothfuss et~al.(2018)Rothfuss, Lee, Clavera, Asfour, and Abbeel]{rothfuss2018promp} |
|
|
Jonas Rothfuss, Dennis Lee, Ignasi Clavera, Tamim Asfour, and Pieter Abbeel. |
|
|
\newblock {ProMP}: Proximal meta-policy search.
|
|
\newblock \emph{arXiv preprint arXiv:1810.06784}, 2018. |
|
|
|
|
|
\bibitem[Song et~al.(2019)Song, Gao, Yang, Choromanski, Pacchiano, and Tang]{song2019maml} |
|
|
Xingyou Song, Wenbo Gao, Yuxiang Yang, Krzysztof Choromanski, Aldo Pacchiano, and Yunhao Tang. |
|
|
\newblock {ES-MAML}: Simple {H}essian-free meta learning.
|
|
\newblock \emph{arXiv preprint arXiv:1910.01215}, 2019. |
|
|
|
|
|
\bibitem[Suilen et~al.(2022)Suilen, Sim{\~a}o, Jansen, and Parker]{suilen2022robust} |
|
|
Marnix Suilen, Thiago~D Sim{\~a}o, Nils Jansen, and David Parker. |
|
|
\newblock Robust anytime learning of {M}arkov decision processes.
|
|
\newblock \emph{arXiv preprint arXiv:2205.15827}, 2022. |
|
|
|
|
|
\bibitem[Tennenholtz et~al.(2020)Tennenholtz, Shalit, and Mannor]{tennenholtz2020off} |
|
|
Guy Tennenholtz, Uri Shalit, and Shie Mannor. |
|
|
\newblock Off-policy evaluation in partially observable environments. |
|
|
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pp.\ 10276--10283, 2020. |
|
|
|
|
|
\bibitem[Thomas et~al.(2021)Thomas, Pineau, Laroche, et~al.]{thomas2021multi} |
|
|
Philip~S Thomas, Joelle Pineau, Romain Laroche, et~al. |
|
|
\newblock Multi-objective {SPIBB}: Seldonian offline policy improvement with safety constraints in finite {MDP}s.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Todorov et~al.(2012)Todorov, Erez, and Tassa]{todorov2012mujoco} |
|
|
Emanuel Todorov, Tom Erez, and Yuval Tassa. |
|
|
\newblock {MuJoCo}: A physics engine for model-based control.
|
|
\newblock In \emph{2012 IEEE/RSJ international conference on intelligent robots and systems}, pp.\ 5026--5033. IEEE, 2012. |
|
|
|
|
|
\bibitem[Tripuraneni et~al.(2020)Tripuraneni, Jordan, and Jin]{tripuraneni2020theory} |
|
|
Nilesh Tripuraneni, Michael Jordan, and Chi Jin. |
|
|
\newblock On the theory of transfer learning: The importance of task diversity. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 7852--7862, 2020. |
|
|
|
|
|
\bibitem[Uchibe \& Doya(2007)Uchibe and Doya]{uchibe2007constrained} |
|
|
Eiji Uchibe and Kenji Doya. |
|
|
\newblock Constrained reinforcement learning from intrinsic and extrinsic rewards. |
|
|
\newblock In \emph{2007 IEEE 6th International Conference on Development and Learning}, pp.\ 163--168. IEEE, 2007. |
|
|
|
|
|
\bibitem[Van~den Dries \& Miller(1996)Van~den Dries and Miller]{van1996geometric} |
|
|
Lou Van~den Dries and Chris Miller. |
|
|
\newblock Geometric categories and o-minimal structures. |
|
|
\newblock \emph{Duke Mathematical Journal}, 84\penalty0 (2):\penalty0 497--540, 1996. |
|
|
|
|
|
\bibitem[Wu et~al.(2021)Wu, Zhang, Yang, and Wang]{wu2021offline} |
|
|
Runzhe Wu, Yufeng Zhang, Zhuoran Yang, and Zhaoran Wang. |
|
|
\newblock Offline constrained multi-objective reinforcement learning via pessimistic dual value iteration. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Xu et~al.(2020)Xu, Wang, and Liang]{xu2020improving} |
|
|
Tengyu Xu, Zhe Wang, and Yingbin Liang. |
|
|
\newblock Improving sample complexity bounds for (natural) actor-critic algorithms. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 4358--4369, 2020. |
|
|
|
|
|
\bibitem[Xu et~al.(2021)Xu, Liang, and Lan]{xu2021crpo} |
|
|
Tengyu Xu, Yingbin Liang, and Guanghui Lan. |
|
|
\newblock {CRPO}: A new approach for safe reinforcement learning with convergence guarantee.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 11480--11491. PMLR, 2021. |
|
|
|
|
|
\bibitem[Yang et~al.(2016)Yang, Zhang, Jin, and Yi]{yang2016tracking} |
|
|
Tianbao Yang, Lijun Zhang, Rong Jin, and Jinfeng Yi. |
|
|
\newblock Tracking slowly moving clairvoyant: Optimal dynamic regret of online learning with true and noisy gradient. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 449--457. PMLR, 2016. |
|
|
|
|
|
\bibitem[Ying et~al.(2022)Ying, Ding, and Lavaei]{ying2021dual} |
|
|
Donghao Ying, Yuhao Ding, and Javad Lavaei. |
|
|
\newblock A dual approach to constrained {M}arkov decision processes with entropy regularization.
|
|
\newblock \emph{25th International Conference on Artificial Intelligence and Statistics (AISTATS)}, 2022. |
|
|
|
|
|
\bibitem[Young et~al.(2018)Young, Wang, and Taylor]{young2018metatrace} |
|
|
Kenny Young, Baoxiang Wang, and Matthew~E Taylor. |
|
|
\newblock Metatrace: Online step-size tuning by meta-gradient descent for reinforcement learning control. |
|
|
\newblock \emph{arXiv preprint arXiv:1805.04514}, 2018. |
|
|
|
|
|
\bibitem[Yu et~al.(2019)Yu, Yang, Kolar, and Wang]{yu2019convergent} |
|
|
Ming Yu, Zhuoran Yang, Mladen Kolar, and Zhaoran Wang. |
|
|
\newblock Convergent policy optimization for safe reinforcement learning. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Zhang et~al.(2021)Zhang, Bengio, Hardt, Recht, and Vinyals]{zhang2021understanding} |
|
|
Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals. |
|
|
\newblock Understanding deep learning (still) requires rethinking generalization. |
|
|
\newblock \emph{Communications of the ACM}, 64\penalty0 (3):\penalty0 107--115, 2021. |
|
|
|
|
|
\bibitem[Zhang et~al.(2017)Zhang, Yang, Yi, Jin, and Zhou]{zhang2017improved} |
|
|
Lijun Zhang, Tianbao Yang, Jinfeng Yi, Rong Jin, and Zhi-Hua Zhou. |
|
|
\newblock Improved dynamic regret for non-degenerate functions. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 30, 2017. |
|
|
|
|
|
\bibitem[Zhao et~al.(2021)Zhao, Chen, and Thuraisingham]{zhao2021fairness} |
|
|
Chen Zhao, Feng Chen, and Bhavani Thuraisingham. |
|
|
\newblock Fairness-aware online meta-learning. |
|
|
\newblock In \emph{Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \& Data Mining}, pp.\ 2294--2304, 2021. |
|
|
|
|
|
\bibitem[Zhao et~al.(2020)Zhao, Zhang, Zhang, and Zhou]{zhao2020dynamic} |
|
|
Peng Zhao, Yu-Jie Zhang, Lijun Zhang, and Zhi-Hua Zhou. |
|
|
\newblock Dynamic regret of convex and smooth functions. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 12510--12520, 2020. |
|
|
|
|
|
\bibitem[Zinkevich(2003)]{zinkevich2003online} |
|
|
Martin Zinkevich. |
|
|
\newblock Online convex programming and generalized infinitesimal gradient ascent. |
|
|
\newblock In \emph{Proceedings of the 20th international conference on machine learning (icml-03)}, pp.\ 928--936, 2003. |
|
|
|
|
|
\bibitem[Zintgraf et~al.(2021)Zintgraf, Feng, Lu, Igl, Hartikainen, Hofmann, and Whiteson]{zintgraf2021exploration} |
|
|
Luisa~M Zintgraf, Leo Feng, Cong Lu, Maximilian Igl, Kristian Hartikainen, Katja Hofmann, and Shimon Whiteson. |
|
|
\newblock Exploration in approximate hyper-state space for meta reinforcement learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 12991--13001. PMLR, 2021. |
|
|
|
|
|
\bibitem[Zou et~al.(2018)Zou, Cao, Zhou, and Gu]{zou2018stochastic} |
|
|
Difan Zou, Yuan Cao, Dongruo Zhou, and Quanquan Gu. |
|
|
\newblock Stochastic gradient descent optimizes over-parameterized deep relu networks. |
|
|
\newblock \emph{arXiv preprint arXiv:1811.08888}, 2018. |
|
|
|
|
|
\end{thebibliography} |
|
|
|