% PaperShow / Paper2Video /src /latex_proj /iclr2023_conference.bbl
% ZaynZhu
% Clean version without large assets
% 7c08dc3
\begin{thebibliography}{90}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
\providecommand{\doi}[1]{doi: #1}\else
\providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi
\bibitem[Achiam et~al.(2017)Achiam, Held, Tamar, and Abbeel]{achiam2017constrained}
Joshua Achiam, David Held, Aviv Tamar, and Pieter Abbeel.
\newblock Constrained policy optimization.
\newblock In \emph{International Conference on Machine Learning (ICML)}, pp.\ 22--31. PMLR, 2017.
\bibitem[Altman(1999)]{altman1999constrained}
Eitan Altman.
\newblock \emph{Constrained Markov decision processes: stochastic modeling}.
\newblock Routledge, 1999.
\bibitem[Arora et~al.(2019)Arora, Du, Hu, Li, and Wang]{arora2019fine}
Sanjeev Arora, Simon Du, Wei Hu, Zhiyuan Li, and Ruosong Wang.
\newblock Fine-grained analysis of optimization and generalization for overparameterized two-layer neural networks.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 322--332. PMLR, 2019.
\bibitem[Auer et~al.(2002)Auer, Cesa-Bianchi, and Fischer]{auer2002finite}
Peter Auer, Nicolo Cesa-Bianchi, and Paul Fischer.
\newblock Finite-time analysis of the multiarmed bandit problem.
\newblock \emph{Machine learning}, 47\penalty0 (2):\penalty0 235--256, 2002.
\bibitem[Bai et~al.(2021)Bai, Bedi, Agarwal, Koppel, and Aggarwal]{bai2021achieving}
Qinbo Bai, Amrit~Singh Bedi, Mridul Agarwal, Alec Koppel, and Vaneet Aggarwal.
\newblock Achieving zero constraint violation for constrained reinforcement learning via primal-dual approach.
\newblock \emph{arXiv preprint arXiv:2109.06332}, 2021.
\bibitem[Balcan et~al.(2015)Balcan, Blum, and Vempala]{balcan2015efficient}
Maria-Florina Balcan, Avrim Blum, and Santosh Vempala.
\newblock Efficient representations for lifelong learning and autoencoding.
\newblock In \emph{Conference on Learning Theory}, pp.\ 191--210. PMLR, 2015.
\bibitem[Balcan et~al.(2019)Balcan, Khodak, and Talwalkar]{balcan2019provable}
Maria-Florina Balcan, Mikhail Khodak, and Ameet Talwalkar.
\newblock Provable guarantees for gradient-based meta-learning.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 424--433. PMLR, 2019.
\bibitem[Balcan et~al.(2021)Balcan, Khodak, Sharma, and Talwalkar]{balcan2021learning}
Maria-Florina~F Balcan, Mikhail Khodak, Dravyansh Sharma, and Ameet Talwalkar.
\newblock Learning-to-learn non-convex piecewise-Lipschitz functions.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021.
\bibitem[Bedi et~al.(2018)Bedi, Sarma, and Rajawat]{bedi2018tracking}
Amrit~Singh Bedi, Paban Sarma, and Ketan Rajawat.
\newblock Tracking moving agents via inexact online gradient descent algorithm.
\newblock \emph{IEEE Journal of Selected Topics in Signal Processing}, 12\penalty0 (1):\penalty0 202--217, 2018.
\bibitem[Besbes et~al.(2015)Besbes, Gur, and Zeevi]{besbes2015non}
Omar Besbes, Yonatan Gur, and Assaf Zeevi.
\newblock Non-stationary stochastic optimization.
\newblock \emph{Operations research}, 63\penalty0 (5):\penalty0 1227--1244, 2015.
\bibitem[Bhandari et~al.(2018)Bhandari, Russo, and Singal]{bhandari2018finite}
Jalaj Bhandari, Daniel Russo, and Raghav Singal.
\newblock A finite time analysis of temporal difference learning with linear function approximation.
\newblock In \emph{Conference on learning theory}, pp.\ 1691--1692. PMLR, 2018.
\bibitem[Bhatnagar \& Lakshmanan(2012)Bhatnagar and Lakshmanan]{bhatnagar2012online}
Shalabh Bhatnagar and K~Lakshmanan.
\newblock An online actor--critic algorithm with function approximation for constrained Markov decision processes.
\newblock \emph{Journal of Optimization Theory and Applications}, 153\penalty0 (3):\penalty0 688--708, 2012.
\bibitem[Bolte et~al.(2007)Bolte, Daniilidis, Lewis, and Shiota]{bolte2007clarke}
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Adrian Lewis, and Masahiro Shiota.
\newblock Clarke subgradients of stratifiable functions.
\newblock \emph{SIAM Journal on Optimization}, 18\penalty0 (2):\penalty0 556--572, 2007.
\bibitem[Bolte et~al.(2010)Bolte, Daniilidis, Ley, and Mazet]{bolte2010characterizations}
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Olivier Ley, and Laurent Mazet.
\newblock Characterizations of {\L}ojasiewicz inequalities: subgradient flows, talweg, convexity.
\newblock \emph{Transactions of the American Mathematical Society}, 362\penalty0 (6):\penalty0 3319--3363, 2010.
\bibitem[Borkar(2005)]{borkar2005actor}
Vivek~S Borkar.
\newblock An actor-critic algorithm for constrained Markov decision processes.
\newblock \emph{Systems \& control letters}, 54\penalty0 (3):\penalty0 207--213, 2005.
\bibitem[Brockman et~al.(2016)Brockman, Cheung, Pettersson, Schneider, Schulman, Tang, and Zaremba]{brockman2016openai}
Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba.
\newblock {OpenAI} gym.
\newblock \emph{arXiv preprint arXiv:1606.01540}, 2016.
\bibitem[Cesa-Bianchi et~al.(2011)Cesa-Bianchi, Shalev-Shwartz, and Shamir]{cesa2011online}
Nicolo Cesa-Bianchi, Shai Shalev-Shwartz, and Ohad Shamir.
\newblock Online learning of noisy data.
\newblock \emph{IEEE Transactions on Information Theory}, 57\penalty0 (12):\penalty0 7907--7931, 2011.
\bibitem[Chen et~al.(2021{\natexlab{a}})Chen, Hu, Jin, Li, and Wang]{chen2021understanding}
Xiaoyu Chen, Jiachen Hu, Chi Jin, Lihong Li, and Liwei Wang.
\newblock Understanding domain randomization for sim-to-real transfer.
\newblock \emph{arXiv preprint arXiv:2110.03239}, 2021{\natexlab{a}}.
\bibitem[Chen et~al.(2021{\natexlab{b}})Chen, Dong, and Wang]{chen2021primal}
Yi~Chen, Jing Dong, and Zhaoran Wang.
\newblock A primal-dual approach to constrained Markov decision processes.
\newblock \emph{arXiv preprint arXiv:2101.10895}, 2021{\natexlab{b}}.
\bibitem[Chow et~al.(2017)Chow, Ghavamzadeh, Janson, and Pavone]{chow2017risk}
Yinlam Chow, Mohammad Ghavamzadeh, Lucas Janson, and Marco Pavone.
\newblock Risk-constrained reinforcement learning with percentile risk criteria.
\newblock \emph{The Journal of Machine Learning Research}, 18\penalty0 (1):\penalty0 6070--6120, 2017.
\bibitem[Chow et~al.(2018)Chow, Nachum, Du{\'e}{\~n}ez-Guzm{\'a}n, and Ghavamzadeh]{chow2018lyapunov}
Yinlam Chow, Ofir Nachum, Edgar~A Du{\'e}{\~n}ez-Guzm{\'a}n, and Mohammad Ghavamzadeh.
\newblock A {Lyapunov}-based approach to safe reinforcement learning.
\newblock In \emph{Advances in Neural Information Processing Systems}, 2018.
\bibitem[Davis et~al.(2020)Davis, Drusvyatskiy, Kakade, and Lee]{davis2020stochastic}
Damek Davis, Dmitriy Drusvyatskiy, Sham Kakade, and Jason~D Lee.
\newblock Stochastic subgradient method converges on tame functions.
\newblock \emph{Foundations of computational mathematics}, 20\penalty0 (1):\penalty0 119--154, 2020.
\bibitem[De~Nijs et~al.(2021)De~Nijs, Walraven, De~Weerdt, and Spaan]{de2021constrained}
Frits De~Nijs, Erwin Walraven, Mathijs De~Weerdt, and Matthijs Spaan.
\newblock Constrained multiagent Markov decision processes: A taxonomy of problems and algorithms.
\newblock \emph{Journal of Artificial Intelligence Research}, 70:\penalty0 955--1001, 2021.
\bibitem[Denevi et~al.(2019)Denevi, Ciliberto, Grazzi, and Pontil]{denevi2019learning}
Giulia Denevi, Carlo Ciliberto, Riccardo Grazzi, and Massimiliano Pontil.
\newblock Learning-to-learn stochastic gradient descent with biased regularization.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1566--1575. PMLR, 2019.
\bibitem[Ding et~al.(2021{\natexlab{a}})Ding, Wei, Yang, Wang, and Jovanovic]{ding2021provably}
Dongsheng Ding, Xiaohan Wei, Zhuoran Yang, Zhaoran Wang, and Mihailo Jovanovic.
\newblock Provably efficient safe exploration via primal-dual policy optimization.
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, pp.\ 3304--3312. PMLR, 2021{\natexlab{a}}.
\bibitem[Ding \& Lavaei(2022)Ding and Lavaei]{ding2022provably}
Yuhao Ding and Javad Lavaei.
\newblock Provably efficient primal-dual reinforcement learning for {CMDP}s with non-stationary objectives and constraints.
\newblock \emph{arXiv preprint arXiv:2201.11965}, 2022.
\bibitem[Ding et~al.(2021{\natexlab{b}})Ding, Zhang, and Lavaei]{ding2021beyond}
Yuhao Ding, Junzi Zhang, and Javad Lavaei.
\newblock Beyond exact gradients: Convergence of stochastic soft-max policy gradient methods with entropy regularization.
\newblock \emph{arXiv preprint arXiv:2110.10117}, 2021{\natexlab{b}}.
\bibitem[Dixit et~al.(2019)Dixit, Bedi, Tripathi, and Rajawat]{dixit2019online}
Rishabh Dixit, Amrit~Singh Bedi, Ruchi Tripathi, and Ketan Rajawat.
\newblock Online learning with inexact proximal online gradient descent algorithms.
\newblock \emph{IEEE Transactions on Signal Processing}, 67\penalty0 (5):\penalty0 1338--1352, 2019.
\bibitem[Drusvyatskiy \& Lewis(2018)Drusvyatskiy and Lewis]{drusvyatskiy2018error}
Dmitriy Drusvyatskiy and Adrian~S Lewis.
\newblock Error bounds, quadratic growth, and linear convergence of proximal methods.
\newblock \emph{Mathematics of Operations Research}, 43\penalty0 (3):\penalty0 919--948, 2018.
\bibitem[Du et~al.(2020)Du, Hu, Kakade, Lee, and Lei]{du2020few}
Simon~Shaolei Du, Wei Hu, Sham~M Kakade, Jason~D Lee, and Qi~Lei.
\newblock Few-shot learning via learning the representation, provably.
\newblock In \emph{International Conference on Learning Representations}, 2020.
\bibitem[Duan et~al.(2016)Duan, Schulman, Chen, Bartlett, Sutskever, and Abbeel]{duan2016rl}
Yan Duan, John Schulman, Xi~Chen, Peter~L Bartlett, Ilya Sutskever, and Pieter Abbeel.
\newblock $\text{RL}^2$: Fast reinforcement learning via slow reinforcement learning.
\newblock \emph{arXiv preprint arXiv:1611.02779}, 2016.
\bibitem[Duan et~al.(2020)Duan, Jia, and Wang]{duan2020minimax}
Yaqi Duan, Zeyu Jia, and Mengdi Wang.
\newblock Minimax-optimal off-policy evaluation with linear function approximation.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2701--2709. PMLR, 2020.
\bibitem[Efroni et~al.(2020)Efroni, Mannor, and Pirotta]{efroni2020exploration}
Yonathan Efroni, Shie Mannor, and Matteo Pirotta.
\newblock Exploration-exploitation in constrained {MDP}s.
\newblock \emph{arXiv preprint arXiv:2003.02189}, 2020.
\bibitem[Fallah et~al.(2021)Fallah, Georgiev, Mokhtari, and Ozdaglar]{fallah2021convergence}
Alireza Fallah, Kristian Georgiev, Aryan Mokhtari, and Asuman Ozdaglar.
\newblock On the convergence theory of debiased model-agnostic meta-reinforcement learning.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021.
\bibitem[Fan et~al.(2021)Fan, Ma, and Zhong]{fan2021selective}
Jianqing Fan, Cong Ma, and Yiqiao Zhong.
\newblock A selective overview of deep learning.
\newblock \emph{Statistical science: a review journal of the Institute of Mathematical Statistics}, 36\penalty0 (2):\penalty0 264, 2021.
\bibitem[Finn et~al.(2017)Finn, Abbeel, and Levine]{finn2017model}
Chelsea Finn, Pieter Abbeel, and Sergey Levine.
\newblock Model-agnostic meta-learning for fast adaptation of deep networks.
\newblock In \emph{International conference on machine learning}, pp.\ 1126--1135. PMLR, 2017.
\bibitem[Finn et~al.(2019)Finn, Rajeswaran, Kakade, and Levine]{finn2019online}
Chelsea Finn, Aravind Rajeswaran, Sham Kakade, and Sergey Levine.
\newblock Online meta-learning.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1920--1930. PMLR, 2019.
\bibitem[Garc{\'\i}a \& Fern{\'a}ndez(2015)Garc{\'\i}a and Fern{\'a}ndez]{garcia2015comprehensive}
Javier Garc{\'\i}a and Fernando Fern{\'a}ndez.
\newblock A comprehensive survey on safe reinforcement learning.
\newblock \emph{Journal of Machine Learning Research}, 16\penalty0 (1):\penalty0 1437--1480, 2015.
\bibitem[Geist et~al.(2019)Geist, Scherrer, and Pietquin]{geist2019theory}
Matthieu Geist, Bruno Scherrer, and Olivier Pietquin.
\newblock A theory of regularized Markov decision processes.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2160--2169. PMLR, 2019.
\bibitem[Gelada \& Bellemare(2019)Gelada and Bellemare]{gelada2019off}
Carles Gelada and Marc~G Bellemare.
\newblock Off-policy deep reinforcement learning by bootstrapping the covariate shift.
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~33, pp.\ 3647--3655, 2019.
\bibitem[Hazan et~al.(2016)]{hazan2016introduction}
Elad Hazan et~al.
\newblock Introduction to online convex optimization.
\newblock \emph{Foundations and Trends{\textregistered} in Optimization}, 2\penalty0 (3-4):\penalty0 157--325, 2016.
\bibitem[Hospedales et~al.(2020)Hospedales, Antoniou, Micaelli, and Storkey]{hospedales2020meta}
Timothy Hospedales, Antreas Antoniou, Paul Micaelli, and Amos Storkey.
\newblock Meta-learning in neural networks: A survey.
\newblock \emph{arXiv preprint arXiv:2004.05439}, 2020.
\bibitem[Ioffe(2009)]{ioffe2009invitation}
Alexander~D Ioffe.
\newblock An invitation to tame optimization.
\newblock \emph{SIAM Journal on Optimization}, 19\penalty0 (4):\penalty0 1894--1917, 2009.
\bibitem[Jadbabaie et~al.(2015)Jadbabaie, Rakhlin, Shahrampour, and Sridharan]{jadbabaie2015online}
Ali Jadbabaie, Alexander Rakhlin, Shahin Shahrampour, and Karthik Sridharan.
\newblock Online optimization: Competing with dynamic comparators.
\newblock In \emph{Artificial Intelligence and Statistics}, pp.\ 398--406. PMLR, 2015.
\bibitem[Jaderberg et~al.(2019)Jaderberg, Czarnecki, Dunning, Marris, Lever, Castaneda, Beattie, Rabinowitz, Morcos, Ruderman, et~al.]{jaderberg2019human}
Max Jaderberg, Wojciech~M Czarnecki, Iain Dunning, Luke Marris, Guy Lever, Antonio~Garcia Castaneda, Charles Beattie, Neil~C Rabinowitz, Ari~S Morcos, Avraham Ruderman, et~al.
\newblock Human-level performance in 3d multiplayer games with population-based reinforcement learning.
\newblock \emph{Science}, 364\penalty0 (6443):\penalty0 859--865, 2019.
\bibitem[Hiriart-Urruty(2010)]{jean2010convex}
Jean-Baptiste Hiriart-Urruty.
\newblock Convex analysis and minimization algorithms: advanced theory and bundle methods, 2010.
\bibitem[Ji et~al.(2022)Ji, Yang, and Liang]{ji2022theoretical}
Kaiyi Ji, Junjie Yang, and Yingbin Liang.
\newblock Theoretical convergence of multi-step model-agnostic meta-learning.
\newblock \emph{Journal of Machine Learning Research}, 23\penalty0 (29):\penalty0 1--41, 2022.
\bibitem[Johnstone \& Moulin(2020)Johnstone and Moulin]{johnstone2020faster}
Patrick~R Johnstone and Pierre Moulin.
\newblock Faster subgradient methods for functions with {H}{\"o}lderian growth.
\newblock \emph{Mathematical Programming}, 180\penalty0 (1):\penalty0 417--450, 2020.
\bibitem[Khodak et~al.(2019)Khodak, Balcan, and Talwalkar]{khodak2019adaptive}
Mikhail Khodak, Maria-Florina~F Balcan, and Ameet~S Talwalkar.
\newblock Adaptive gradient-based meta-learning methods.
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019.
\bibitem[Kwon et~al.(2021)Kwon, Efroni, Caramanis, and Mannor]{kwon2021rl}
Jeongyeol Kwon, Yonathan Efroni, Constantine Caramanis, and Shie Mannor.
\newblock {RL} for latent {MDP}s: Regret guarantees and a lower bound.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021.
\bibitem[Le et~al.(2019)Le, Voloshin, and Yue]{le2019batch}
Hoang Le, Cameron Voloshin, and Yisong Yue.
\newblock Batch policy learning under constraints.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 3703--3712. PMLR, 2019.
\bibitem[Lee et~al.(2021)Lee, Jeon, Lee, Pineau, and Kim]{lee2021optidice}
Jongmin Lee, Wonseok Jeon, Byungjun Lee, Joelle Pineau, and Kee-Eung Kim.
\newblock {OptiDICE}: Offline policy optimization via stationary distribution correction estimation.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6120--6130. PMLR, 2021.
\bibitem[Levin \& Peres(2017)Levin and Peres]{levin2017markov}
David~A Levin and Yuval Peres.
\newblock \emph{Markov chains and mixing times}, volume 107.
\newblock American Mathematical Soc., 2017.
\bibitem[Li \& Liang(2018)Li and Liang]{li2018learning}
Yuanzhi Li and Yingyu Liang.
\newblock Learning overparameterized neural networks via stochastic gradient descent on structured data.
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018.
\bibitem[Li et~al.(2017)Li, Zhou, Chen, and Li]{li2017meta}
Zhenguo Li, Fengwei Zhou, Fei Chen, and Hang Li.
\newblock Meta-{SGD}: Learning to learn quickly for few-shot learning.
\newblock \emph{arXiv preprint arXiv:1707.09835}, 2017.
\bibitem[Liu et~al.(2019)Liu, Socher, and Xiong]{liu2019taming}
Hao Liu, Richard Socher, and Caiming Xiong.
\newblock Taming {MAML}: Efficient unbiased meta-reinforcement learning.
\newblock In \emph{International conference on machine learning}, pp.\ 4061--4071. PMLR, 2019.
\bibitem[Liu et~al.(2018)Liu, Li, Tang, and Zhou]{liu2018breaking}
Qiang Liu, Lihong Li, Ziyang Tang, and Dengyong Zhou.
\newblock Breaking the curse of horizon: Infinite-horizon off-policy estimation.
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018.
\bibitem[Liu et~al.(2021{\natexlab{a}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021learning}
Tao Liu, Ruida Zhou, Dileep Kalathil, Panganamala Kumar, and Chao Tian.
\newblock Learning policies with zero or bounded constraint violation for constrained {MDP}s.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021{\natexlab{a}}.
\bibitem[Liu et~al.(2021{\natexlab{b}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021fast}
Tao Liu, Ruida Zhou, Dileep Kalathil, PR~Kumar, and Chao Tian.
\newblock Fast global convergence of policy optimization for constrained {MDP}s.
\newblock \emph{arXiv preprint arXiv:2111.00552}, 2021{\natexlab{b}}.
\bibitem[Maurer et~al.(2016)Maurer, Pontil, and Romera-Paredes]{maurer2016benefit}
Andreas Maurer, Massimiliano Pontil, and Bernardino Romera-Paredes.
\newblock The benefit of multitask representation learning.
\newblock \emph{Journal of Machine Learning Research}, 17\penalty0 (81):\penalty0 1--32, 2016.
\bibitem[Mei et~al.(2020)Mei, Xiao, Szepesvari, and Schuurmans]{mei2020global}
Jincheng Mei, Chenjun Xiao, Csaba Szepesvari, and Dale Schuurmans.
\newblock On the global convergence rates of softmax policy gradient methods.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6820--6829. PMLR, 2020.
\bibitem[Mitchell et~al.(2021)Mitchell, Rafailov, Peng, Levine, and Finn]{mitchell2021offline}
Eric Mitchell, Rafael Rafailov, Xue~Bin Peng, Sergey Levine, and Chelsea Finn.
\newblock Offline meta-reinforcement learning with advantage weighting.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 7780--7791. PMLR, 2021.
\bibitem[Mokhtari et~al.(2016)Mokhtari, Shahrampour, Jadbabaie, and Ribeiro]{mokhtari2016online}
Aryan Mokhtari, Shahin Shahrampour, Ali Jadbabaie, and Alejandro Ribeiro.
\newblock Online optimization in dynamic environments: Improved regret rates for strongly convex problems.
\newblock In \emph{2016 IEEE 55th Conference on Decision and Control}, pp.\ 7195--7201. IEEE, 2016.
\bibitem[Nachum et~al.(2019)Nachum, Chow, Dai, and Li]{nachum2019dualdice}
Ofir Nachum, Yinlam Chow, Bo~Dai, and Lihong Li.
\newblock {DualDICE}: Behavior-agnostic estimation of discounted stationary distribution corrections.
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019.
\bibitem[Neyshabur et~al.(2019)Neyshabur, Li, Bhojanapalli, LeCun, and Srebro]{neyshabur2019towards}
Behnam Neyshabur, Zhiyuan Li, Srinadh Bhojanapalli, Yann LeCun, and Nathan Srebro.
\newblock Towards understanding the role of over-parametrization in generalization of neural networks.
\newblock In \emph{International Conference on Learning Representations (ICLR)}, 2019.
\bibitem[Paternain et~al.(2022)Paternain, Calvo-Fullana, Chamon, and Ribeiro]{paternain2022safe}
Santiago Paternain, Miguel Calvo-Fullana, Luiz~FO Chamon, and Alejandro Ribeiro.
\newblock Safe policies for reinforcement learning via primal-dual methods.
\newblock \emph{IEEE Transactions on Automatic Control}, 2022.
\bibitem[Resler \& Mansour(2019)Resler and Mansour]{resler2019adversarial}
Alon Resler and Yishay Mansour.
\newblock Adversarial online learning with noise.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 5429--5437. PMLR, 2019.
\bibitem[Rothfuss et~al.(2018)Rothfuss, Lee, Clavera, Asfour, and Abbeel]{rothfuss2018promp}
Jonas Rothfuss, Dennis Lee, Ignasi Clavera, Tamim Asfour, and Pieter Abbeel.
\newblock {ProMP}: Proximal meta-policy search.
\newblock \emph{arXiv preprint arXiv:1810.06784}, 2018.
\bibitem[Song et~al.(2019)Song, Gao, Yang, Choromanski, Pacchiano, and Tang]{song2019maml}
Xingyou Song, Wenbo Gao, Yuxiang Yang, Krzysztof Choromanski, Aldo Pacchiano, and Yunhao Tang.
\newblock {ES-MAML}: Simple {H}essian-free meta learning.
\newblock \emph{arXiv preprint arXiv:1910.01215}, 2019.
\bibitem[Suilen et~al.(2022)Suilen, Sim{\~a}o, Jansen, and Parker]{suilen2022robust}
Marnix Suilen, Thiago~D Sim{\~a}o, Nils Jansen, and David Parker.
\newblock Robust anytime learning of Markov decision processes.
\newblock \emph{arXiv preprint arXiv:2205.15827}, 2022.
\bibitem[Tennenholtz et~al.(2020)Tennenholtz, Shalit, and Mannor]{tennenholtz2020off}
Guy Tennenholtz, Uri Shalit, and Shie Mannor.
\newblock Off-policy evaluation in partially observable environments.
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pp.\ 10276--10283, 2020.
\bibitem[Thomas et~al.(2021)Thomas, Pineau, Laroche, et~al.]{thomas2021multi}
Philip~S Thomas, Joelle Pineau, Romain Laroche, et~al.
\newblock Multi-objective {SPIBB}: Seldonian offline policy improvement with safety constraints in finite {MDP}s.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021.
\bibitem[Todorov et~al.(2012)Todorov, Erez, and Tassa]{todorov2012mujoco}
Emanuel Todorov, Tom Erez, and Yuval Tassa.
\newblock {MuJoCo}: A physics engine for model-based control.
\newblock In \emph{2012 IEEE/RSJ international conference on intelligent robots and systems}, pp.\ 5026--5033. IEEE, 2012.
\bibitem[Tripuraneni et~al.(2020)Tripuraneni, Jordan, and Jin]{tripuraneni2020theory}
Nilesh Tripuraneni, Michael Jordan, and Chi Jin.
\newblock On the theory of transfer learning: The importance of task diversity.
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 7852--7862, 2020.
\bibitem[Uchibe \& Doya(2007)Uchibe and Doya]{uchibe2007constrained}
Eiji Uchibe and Kenji Doya.
\newblock Constrained reinforcement learning from intrinsic and extrinsic rewards.
\newblock In \emph{2007 IEEE 6th International Conference on Development and Learning}, pp.\ 163--168. IEEE, 2007.
\bibitem[Van~den Dries \& Miller(1996)Van~den Dries and Miller]{van1996geometric}
Lou Van~den Dries and Chris Miller.
\newblock Geometric categories and o-minimal structures.
\newblock \emph{Duke Mathematical Journal}, 84\penalty0 (2):\penalty0 497--540, 1996.
\bibitem[Wu et~al.(2021)Wu, Zhang, Yang, and Wang]{wu2021offline}
Runzhe Wu, Yufeng Zhang, Zhuoran Yang, and Zhaoran Wang.
\newblock Offline constrained multi-objective reinforcement learning via pessimistic dual value iteration.
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021.
\bibitem[Xu et~al.(2020)Xu, Wang, and Liang]{xu2020improving}
Tengyu Xu, Zhe Wang, and Yingbin Liang.
\newblock Improving sample complexity bounds for (natural) actor-critic algorithms.
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 4358--4369, 2020.
\bibitem[Xu et~al.(2021)Xu, Liang, and Lan]{xu2021crpo}
Tengyu Xu, Yingbin Liang, and Guanghui Lan.
\newblock {CRPO}: A new approach for safe reinforcement learning with convergence guarantee.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 11480--11491. PMLR, 2021.
\bibitem[Yang et~al.(2016)Yang, Zhang, Jin, and Yi]{yang2016tracking}
Tianbao Yang, Lijun Zhang, Rong Jin, and Jinfeng Yi.
\newblock Tracking slowly moving clairvoyant: Optimal dynamic regret of online learning with true and noisy gradient.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 449--457. PMLR, 2016.
\bibitem[Ying et~al.(2022)Ying, Ding, and Lavaei]{ying2021dual}
Donghao Ying, Yuhao Ding, and Javad Lavaei.
\newblock A dual approach to constrained Markov decision processes with entropy regularization.
\newblock \emph{25th International Conference on Artificial Intelligence and Statistics (AISTATS)}, 2022.
\bibitem[Young et~al.(2018)Young, Wang, and Taylor]{young2018metatrace}
Kenny Young, Baoxiang Wang, and Matthew~E Taylor.
\newblock Metatrace: Online step-size tuning by meta-gradient descent for reinforcement learning control.
\newblock \emph{arXiv preprint arXiv:1805.04514}, 2018.
\bibitem[Yu et~al.(2019)Yu, Yang, Kolar, and Wang]{yu2019convergent}
Ming Yu, Zhuoran Yang, Mladen Kolar, and Zhaoran Wang.
\newblock Convergent policy optimization for safe reinforcement learning.
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019.
\bibitem[Zhang et~al.(2021)Zhang, Bengio, Hardt, Recht, and Vinyals]{zhang2021understanding}
Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals.
\newblock Understanding deep learning (still) requires rethinking generalization.
\newblock \emph{Communications of the ACM}, 64\penalty0 (3):\penalty0 107--115, 2021.
\bibitem[Zhang et~al.(2017)Zhang, Yang, Yi, Jin, and Zhou]{zhang2017improved}
Lijun Zhang, Tianbao Yang, Jinfeng Yi, Rong Jin, and Zhi-Hua Zhou.
\newblock Improved dynamic regret for non-degenerate functions.
\newblock \emph{Advances in Neural Information Processing Systems}, 30, 2017.
\bibitem[Zhao et~al.(2021)Zhao, Chen, and Thuraisingham]{zhao2021fairness}
Chen Zhao, Feng Chen, and Bhavani Thuraisingham.
\newblock Fairness-aware online meta-learning.
\newblock In \emph{Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \& Data Mining}, pp.\ 2294--2304, 2021.
\bibitem[Zhao et~al.(2020)Zhao, Zhang, Zhang, and Zhou]{zhao2020dynamic}
Peng Zhao, Yu-Jie Zhang, Lijun Zhang, and Zhi-Hua Zhou.
\newblock Dynamic regret of convex and smooth functions.
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 12510--12520, 2020.
\bibitem[Zinkevich(2003)]{zinkevich2003online}
Martin Zinkevich.
\newblock Online convex programming and generalized infinitesimal gradient ascent.
\newblock In \emph{Proceedings of the 20th international conference on machine learning (icml-03)}, pp.\ 928--936, 2003.
\bibitem[Zintgraf et~al.(2021)Zintgraf, Feng, Lu, Igl, Hartikainen, Hofmann, and Whiteson]{zintgraf2021exploration}
Luisa~M Zintgraf, Leo Feng, Cong Lu, Maximilian Igl, Kristian Hartikainen, Katja Hofmann, and Shimon Whiteson.
\newblock Exploration in approximate hyper-state space for meta reinforcement learning.
\newblock In \emph{International Conference on Machine Learning}, pp.\ 12991--13001. PMLR, 2021.
\bibitem[Zou et~al.(2018)Zou, Cao, Zhou, and Gu]{zou2018stochastic}
Difan Zou, Yuan Cao, Dongruo Zhou, and Quanquan Gu.
\newblock Stochastic gradient descent optimizes over-parameterized deep {ReLU} networks.
\newblock \emph{arXiv preprint arXiv:1811.08888}, 2018.
\end{thebibliography}