|
|
\begin{thebibliography}{90} |
|
|
\providecommand{\natexlab}[1]{#1} |
|
|
\providecommand{\url}[1]{\texttt{#1}} |
|
|
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi
|
|
|
|
|
\bibitem[Achiam et~al.(2017)Achiam, Held, Tamar, and Abbeel]{achiam2017constrained} |
|
|
Joshua Achiam, David Held, Aviv Tamar, and Pieter Abbeel. |
|
|
\newblock Constrained policy optimization. |
|
|
\newblock In \emph{International Conference on Machine Learning (ICML)}, pp.\ 22--31. PMLR, 2017. |
|
|
|
|
|
\bibitem[Altman(1999)]{altman1999constrained} |
|
|
Eitan Altman. |
|
|
\newblock \emph{Constrained Markov decision processes: stochastic modeling}. |
|
|
\newblock Routledge, 1999. |
|
|
|
|
|
\bibitem[Arora et~al.(2019)Arora, Du, Hu, Li, and Wang]{arora2019fine} |
|
|
Sanjeev Arora, Simon Du, Wei Hu, Zhiyuan Li, and Ruosong Wang. |
|
|
\newblock Fine-grained analysis of optimization and generalization for overparameterized two-layer neural networks. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 322--332. PMLR, 2019. |
|
|
|
|
|
\bibitem[Auer et~al.(2002)Auer, Cesa-Bianchi, and Fischer]{auer2002finite} |
|
|
Peter Auer, Nicolo Cesa-Bianchi, and Paul Fischer. |
|
|
\newblock Finite-time analysis of the multiarmed bandit problem. |
|
|
\newblock \emph{Machine learning}, 47\penalty0 (2):\penalty0 235--256, 2002. |
|
|
|
|
|
\bibitem[Bai et~al.(2021)Bai, Bedi, Agarwal, Koppel, and Aggarwal]{bai2021achieving} |
|
|
Qinbo Bai, Amrit~Singh Bedi, Mridul Agarwal, Alec Koppel, and Vaneet Aggarwal. |
|
|
\newblock Achieving zero constraint violation for constrained reinforcement learning via primal-dual approach. |
|
|
\newblock \emph{arXiv preprint arXiv:2109.06332}, 2021. |
|
|
|
|
|
\bibitem[Balcan et~al.(2015)Balcan, Blum, and Vempala]{balcan2015efficient} |
|
|
Maria-Florina Balcan, Avrim Blum, and Santosh Vempala. |
|
|
\newblock Efficient representations for lifelong learning and autoencoding. |
|
|
\newblock In \emph{Conference on Learning Theory}, pp.\ 191--210. PMLR, 2015. |
|
|
|
|
|
\bibitem[Balcan et~al.(2019)Balcan, Khodak, and Talwalkar]{balcan2019provable} |
|
|
Maria-Florina Balcan, Mikhail Khodak, and Ameet Talwalkar. |
|
|
\newblock Provable guarantees for gradient-based meta-learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 424--433. PMLR, 2019. |
|
|
|
|
|
\bibitem[Balcan et~al.(2021)Balcan, Khodak, Sharma, and Talwalkar]{balcan2021learning} |
|
|
Maria-Florina~F Balcan, Mikhail Khodak, Dravyansh Sharma, and Ameet Talwalkar. |
|
|
\newblock Learning-to-learn non-convex piecewise-{L}ipschitz functions.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Bedi et~al.(2018)Bedi, Sarma, and Rajawat]{bedi2018tracking} |
|
|
Amrit~Singh Bedi, Paban Sarma, and Ketan Rajawat. |
|
|
\newblock Tracking moving agents via inexact online gradient descent algorithm. |
|
|
\newblock \emph{IEEE Journal of Selected Topics in Signal Processing}, 12\penalty0 (1):\penalty0 202--217, 2018. |
|
|
|
|
|
\bibitem[Besbes et~al.(2015)Besbes, Gur, and Zeevi]{besbes2015non} |
|
|
Omar Besbes, Yonatan Gur, and Assaf Zeevi. |
|
|
\newblock Non-stationary stochastic optimization. |
|
|
\newblock \emph{Operations research}, 63\penalty0 (5):\penalty0 1227--1244, 2015. |
|
|
|
|
|
\bibitem[Bhandari et~al.(2018)Bhandari, Russo, and Singal]{bhandari2018finite} |
|
|
Jalaj Bhandari, Daniel Russo, and Raghav Singal. |
|
|
\newblock A finite time analysis of temporal difference learning with linear function approximation. |
|
|
\newblock In \emph{Conference on learning theory}, pp.\ 1691--1692. PMLR, 2018. |
|
|
|
|
|
\bibitem[Bhatnagar \& Lakshmanan(2012)Bhatnagar and Lakshmanan]{bhatnagar2012online} |
|
|
Shalabh Bhatnagar and K~Lakshmanan. |
|
|
\newblock An online actor--critic algorithm with function approximation for constrained {M}arkov decision processes.
|
|
\newblock \emph{Journal of Optimization Theory and Applications}, 153\penalty0 (3):\penalty0 688--708, 2012. |
|
|
|
|
|
\bibitem[Bolte et~al.(2007)Bolte, Daniilidis, Lewis, and Shiota]{bolte2007clarke} |
|
|
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Adrian Lewis, and Masahiro Shiota. |
|
|
\newblock Clarke subgradients of stratifiable functions. |
|
|
\newblock \emph{SIAM Journal on Optimization}, 18\penalty0 (2):\penalty0 556--572, 2007. |
|
|
|
|
|
\bibitem[Bolte et~al.(2010)Bolte, Daniilidis, Ley, and Mazet]{bolte2010characterizations} |
|
|
J{\'e}r{\^o}me Bolte, Aris Daniilidis, Olivier Ley, and Laurent Mazet. |
|
|
\newblock Characterizations of {\l}ojasiewicz inequalities: subgradient flows, talweg, convexity. |
|
|
\newblock \emph{Transactions of the American Mathematical Society}, 362\penalty0 (6):\penalty0 3319--3363, 2010. |
|
|
|
|
|
\bibitem[Borkar(2005)]{borkar2005actor} |
|
|
Vivek~S Borkar. |
|
|
\newblock An actor-critic algorithm for constrained {M}arkov decision processes.
|
|
\newblock \emph{Systems \& control letters}, 54\penalty0 (3):\penalty0 207--213, 2005. |
|
|
|
|
|
\bibitem[Brockman et~al.(2016)Brockman, Cheung, Pettersson, Schneider, Schulman, Tang, and Zaremba]{brockman2016openai} |
|
|
Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John Schulman, Jie Tang, and Wojciech Zaremba. |
|
|
\newblock {OpenAI} gym. |
|
|
\newblock \emph{arXiv preprint arXiv:1606.01540}, 2016. |
|
|
|
|
|
\bibitem[Cesa-Bianchi et~al.(2011)Cesa-Bianchi, Shalev-Shwartz, and Shamir]{cesa2011online} |
|
|
Nicolo Cesa-Bianchi, Shai Shalev-Shwartz, and Ohad Shamir. |
|
|
\newblock Online learning of noisy data. |
|
|
\newblock \emph{IEEE Transactions on Information Theory}, 57\penalty0 (12):\penalty0 7907--7931, 2011. |
|
|
|
|
|
\bibitem[Chen et~al.(2021{\natexlab{a}})Chen, Hu, Jin, Li, and Wang]{chen2021understanding} |
|
|
Xiaoyu Chen, Jiachen Hu, Chi Jin, Lihong Li, and Liwei Wang. |
|
|
\newblock Understanding domain randomization for sim-to-real transfer. |
|
|
\newblock \emph{arXiv preprint arXiv:2110.03239}, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Chen et~al.(2021{\natexlab{b}})Chen, Dong, and Wang]{chen2021primal} |
|
|
Yi~Chen, Jing Dong, and Zhaoran Wang. |
|
|
\newblock A primal-dual approach to constrained {M}arkov decision processes.
|
|
\newblock \emph{arXiv preprint arXiv:2101.10895}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Chow et~al.(2017)Chow, Ghavamzadeh, Janson, and Pavone]{chow2017risk} |
|
|
Yinlam Chow, Mohammad Ghavamzadeh, Lucas Janson, and Marco Pavone. |
|
|
\newblock Risk-constrained reinforcement learning with percentile risk criteria. |
|
|
\newblock \emph{The Journal of Machine Learning Research}, 18\penalty0 (1):\penalty0 6070--6120, 2017. |
|
|
|
|
|
\bibitem[Chow et~al.(2018)Chow, Nachum, Du{\'e}{\~n}ez-Guzm{\'a}n, and Ghavamzadeh]{chow2018lyapunov} |
|
|
Yinlam Chow, Ofir Nachum, Edgar~A Du{\'e}{\~n}ez-Guzm{\'a}n, and Mohammad Ghavamzadeh. |
|
|
\newblock A {Lyapunov}-based approach to safe reinforcement learning. |
|
|
\newblock In \emph{Advances in Neural Information Processing Systems}, 2018. |
|
|
|
|
|
\bibitem[Davis et~al.(2020)Davis, Drusvyatskiy, Kakade, and Lee]{davis2020stochastic} |
|
|
Damek Davis, Dmitriy Drusvyatskiy, Sham Kakade, and Jason~D Lee. |
|
|
\newblock Stochastic subgradient method converges on tame functions. |
|
|
\newblock \emph{Foundations of computational mathematics}, 20\penalty0 (1):\penalty0 119--154, 2020. |
|
|
|
|
|
\bibitem[De~Nijs et~al.(2021)De~Nijs, Walraven, De~Weerdt, and Spaan]{de2021constrained} |
|
|
Frits De~Nijs, Erwin Walraven, Mathijs De~Weerdt, and Matthijs Spaan. |
|
|
\newblock Constrained multiagent {M}arkov decision processes: A taxonomy of problems and algorithms.
|
|
\newblock \emph{Journal of Artificial Intelligence Research}, 70:\penalty0 955--1001, 2021. |
|
|
|
|
|
\bibitem[Denevi et~al.(2019)Denevi, Ciliberto, Grazzi, and Pontil]{denevi2019learning} |
|
|
Giulia Denevi, Carlo Ciliberto, Riccardo Grazzi, and Massimiliano Pontil. |
|
|
\newblock Learning-to-learn stochastic gradient descent with biased regularization. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1566--1575. PMLR, 2019. |
|
|
|
|
|
\bibitem[Ding et~al.(2021{\natexlab{a}})Ding, Wei, Yang, Wang, and Jovanovic]{ding2021provably} |
|
|
Dongsheng Ding, Xiaohan Wei, Zhuoran Yang, Zhaoran Wang, and Mihailo Jovanovic. |
|
|
\newblock Provably efficient safe exploration via primal-dual policy optimization. |
|
|
\newblock In \emph{International Conference on Artificial Intelligence and Statistics}, pp.\ 3304--3312. PMLR, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Ding \& Lavaei(2022)Ding and Lavaei]{ding2022provably} |
|
|
Yuhao Ding and Javad Lavaei. |
|
|
\newblock Provably efficient primal-dual reinforcement learning for {CMDP}s with non-stationary objectives and constraints. |
|
|
\newblock \emph{arXiv preprint arXiv:2201.11965}, 2022. |
|
|
|
|
|
\bibitem[Ding et~al.(2021{\natexlab{b}})Ding, Zhang, and Lavaei]{ding2021beyond} |
|
|
Yuhao Ding, Junzi Zhang, and Javad Lavaei. |
|
|
\newblock Beyond exact gradients: Convergence of stochastic soft-max policy gradient methods with entropy regularization. |
|
|
\newblock \emph{arXiv preprint arXiv:2110.10117}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Dixit et~al.(2019)Dixit, Bedi, Tripathi, and Rajawat]{dixit2019online} |
|
|
Rishabh Dixit, Amrit~Singh Bedi, Ruchi Tripathi, and Ketan Rajawat. |
|
|
\newblock Online learning with inexact proximal online gradient descent algorithms. |
|
|
\newblock \emph{IEEE Transactions on Signal Processing}, 67\penalty0 (5):\penalty0 1338--1352, 2019. |
|
|
|
|
|
\bibitem[Drusvyatskiy \& Lewis(2018)Drusvyatskiy and Lewis]{drusvyatskiy2018error} |
|
|
Dmitriy Drusvyatskiy and Adrian~S Lewis. |
|
|
\newblock Error bounds, quadratic growth, and linear convergence of proximal methods. |
|
|
\newblock \emph{Mathematics of Operations Research}, 43\penalty0 (3):\penalty0 919--948, 2018. |
|
|
|
|
|
\bibitem[Du et~al.(2020)Du, Hu, Kakade, Lee, and Lei]{du2020few} |
|
|
Simon~Shaolei Du, Wei Hu, Sham~M Kakade, Jason~D Lee, and Qi~Lei. |
|
|
\newblock Few-shot learning via learning the representation, provably. |
|
|
\newblock In \emph{International Conference on Learning Representations}, 2020. |
|
|
|
|
|
\bibitem[Duan et~al.(2016)Duan, Schulman, Chen, Bartlett, Sutskever, and Abbeel]{duan2016rl} |
|
|
Yan Duan, John Schulman, Xi~Chen, Peter~L Bartlett, Ilya Sutskever, and Pieter Abbeel. |
|
|
\newblock $\text{RL}^2$: Fast reinforcement learning via slow reinforcement learning. |
|
|
\newblock \emph{arXiv preprint arXiv:1611.02779}, 2016. |
|
|
|
|
|
\bibitem[Duan et~al.(2020)Duan, Jia, and Wang]{duan2020minimax} |
|
|
Yaqi Duan, Zeyu Jia, and Mengdi Wang. |
|
|
\newblock Minimax-optimal off-policy evaluation with linear function approximation. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2701--2709. PMLR, 2020. |
|
|
|
|
|
\bibitem[Efroni et~al.(2020)Efroni, Mannor, and Pirotta]{efroni2020exploration} |
|
|
Yonathan Efroni, Shie Mannor, and Matteo Pirotta. |
|
|
\newblock Exploration-exploitation in constrained {MDP}s. |
|
|
\newblock \emph{arXiv preprint arXiv:2003.02189}, 2020. |
|
|
|
|
|
\bibitem[Fallah et~al.(2021)Fallah, Georgiev, Mokhtari, and Ozdaglar]{fallah2021convergence} |
|
|
Alireza Fallah, Kristian Georgiev, Aryan Mokhtari, and Asuman Ozdaglar. |
|
|
\newblock On the convergence theory of debiased model-agnostic meta-reinforcement learning. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Fan et~al.(2021)Fan, Ma, and Zhong]{fan2021selective} |
|
|
Jianqing Fan, Cong Ma, and Yiqiao Zhong. |
|
|
\newblock A selective overview of deep learning. |
|
|
\newblock \emph{Statistical science: a review journal of the Institute of Mathematical Statistics}, 36\penalty0 (2):\penalty0 264, 2021. |
|
|
|
|
|
\bibitem[Finn et~al.(2017)Finn, Abbeel, and Levine]{finn2017model} |
|
|
Chelsea Finn, Pieter Abbeel, and Sergey Levine. |
|
|
\newblock Model-agnostic meta-learning for fast adaptation of deep networks. |
|
|
\newblock In \emph{International conference on machine learning}, pp.\ 1126--1135. PMLR, 2017. |
|
|
|
|
|
\bibitem[Finn et~al.(2019)Finn, Rajeswaran, Kakade, and Levine]{finn2019online} |
|
|
Chelsea Finn, Aravind Rajeswaran, Sham Kakade, and Sergey Levine. |
|
|
\newblock Online meta-learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 1920--1930. PMLR, 2019. |
|
|
|
|
|
\bibitem[Garc{\i}a \& Fern{\'a}ndez(2015)Garc{\i}a and Fern{\'a}ndez]{garcia2015comprehensive} |
|
|
Javier Garc{\i}a and Fernando Fern{\'a}ndez. |
|
|
\newblock A comprehensive survey on safe reinforcement learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 16\penalty0 (1):\penalty0 1437--1480, 2015. |
|
|
|
|
|
\bibitem[Geist et~al.(2019)Geist, Scherrer, and Pietquin]{geist2019theory} |
|
|
Matthieu Geist, Bruno Scherrer, and Olivier Pietquin. |
|
|
\newblock A theory of regularized {M}arkov decision processes.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 2160--2169. PMLR, 2019. |
|
|
|
|
|
\bibitem[Gelada \& Bellemare(2019)Gelada and Bellemare]{gelada2019off} |
|
|
Carles Gelada and Marc~G Bellemare. |
|
|
\newblock Off-policy deep reinforcement learning by bootstrapping the covariate shift. |
|
|
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~33, pp.\ 3647--3655, 2019. |
|
|
|
|
|
\bibitem[Hazan et~al.(2016)]{hazan2016introduction} |
|
|
Elad Hazan et~al. |
|
|
\newblock Introduction to online convex optimization. |
|
|
\newblock \emph{Foundations and Trends{\textregistered} in Optimization}, 2\penalty0 (3-4):\penalty0 157--325, 2016. |
|
|
|
|
|
\bibitem[Hospedales et~al.(2020)Hospedales, Antoniou, Micaelli, and Storkey]{hospedales2020meta} |
|
|
Timothy Hospedales, Antreas Antoniou, Paul Micaelli, and Amos Storkey. |
|
|
\newblock Meta-learning in neural networks: A survey. |
|
|
\newblock \emph{arXiv preprint arXiv:2004.05439}, 2020. |
|
|
|
|
|
\bibitem[Ioffe(2009)]{ioffe2009invitation} |
|
|
Alexander~D Ioffe. |
|
|
\newblock An invitation to tame optimization. |
|
|
\newblock \emph{SIAM Journal on Optimization}, 19\penalty0 (4):\penalty0 1894--1917, 2009. |
|
|
|
|
|
\bibitem[Jadbabaie et~al.(2015)Jadbabaie, Rakhlin, Shahrampour, and Sridharan]{jadbabaie2015online} |
|
|
Ali Jadbabaie, Alexander Rakhlin, Shahin Shahrampour, and Karthik Sridharan. |
|
|
\newblock Online optimization: Competing with dynamic comparators. |
|
|
\newblock In \emph{Artificial Intelligence and Statistics}, pp.\ 398--406. PMLR, 2015. |
|
|
|
|
|
\bibitem[Jaderberg et~al.(2019)Jaderberg, Czarnecki, Dunning, Marris, Lever, Castaneda, Beattie, Rabinowitz, Morcos, Ruderman, et~al.]{jaderberg2019human} |
|
|
Max Jaderberg, Wojciech~M Czarnecki, Iain Dunning, Luke Marris, Guy Lever, Antonio~Garcia Castaneda, Charles Beattie, Neil~C Rabinowitz, Ari~S Morcos, Avraham Ruderman, et~al. |
|
|
\newblock Human-level performance in 3d multiplayer games with population-based reinforcement learning. |
|
|
\newblock \emph{Science}, 364\penalty0 (6443):\penalty0 859--865, 2019. |
|
|
|
|
|
\bibitem[Jean-Baptiste(2010)]{jean2010convex} |
|
|
HU~Jean-Baptiste. |
|
|
\newblock Convex analysis and minimization algorithms: advanced theory and bundle methods, 2010. |
|
|
|
|
|
\bibitem[Ji et~al.(2022)Ji, Yang, and Liang]{ji2022theoretical} |
|
|
Kaiyi Ji, Junjie Yang, and Yingbin Liang. |
|
|
\newblock Theoretical convergence of multi-step model-agnostic meta-learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 23\penalty0 (29):\penalty0 1--41, 2022. |
|
|
|
|
|
\bibitem[Johnstone \& Moulin(2020)Johnstone and Moulin]{johnstone2020faster} |
|
|
Patrick~R Johnstone and Pierre Moulin. |
|
|
\newblock Faster subgradient methods for functions with {H}{\"o}lderian growth.
|
|
\newblock \emph{Mathematical Programming}, 180\penalty0 (1):\penalty0 417--450, 2020. |
|
|
|
|
|
\bibitem[Khodak et~al.(2019)Khodak, Balcan, and Talwalkar]{khodak2019adaptive} |
|
|
Mikhail Khodak, Maria-Florina~F Balcan, and Ameet~S Talwalkar. |
|
|
\newblock Adaptive gradient-based meta-learning methods. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Kwon et~al.(2021)Kwon, Efroni, Caramanis, and Mannor]{kwon2021rl} |
|
|
Jeongyeol Kwon, Yonathan Efroni, Constantine Caramanis, and Shie Mannor. |
|
|
\newblock {RL} for latent {MDP}s: Regret guarantees and a lower bound. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Le et~al.(2019)Le, Voloshin, and Yue]{le2019batch} |
|
|
Hoang Le, Cameron Voloshin, and Yisong Yue. |
|
|
\newblock Batch policy learning under constraints. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 3703--3712. PMLR, 2019. |
|
|
|
|
|
\bibitem[Lee et~al.(2021)Lee, Jeon, Lee, Pineau, and Kim]{lee2021optidice} |
|
|
Jongmin Lee, Wonseok Jeon, Byungjun Lee, Joelle Pineau, and Kee-Eung Kim. |
|
|
\newblock {OptiDICE}: Offline policy optimization via stationary distribution correction estimation.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6120--6130. PMLR, 2021. |
|
|
|
|
|
\bibitem[Levin \& Peres(2017)Levin and Peres]{levin2017markov} |
|
|
David~A Levin and Yuval Peres. |
|
|
\newblock \emph{Markov chains and mixing times}, volume 107. |
|
|
\newblock American Mathematical Soc., 2017. |
|
|
|
|
|
\bibitem[Li \& Liang(2018)Li and Liang]{li2018learning} |
|
|
Yuanzhi Li and Yingyu Liang. |
|
|
\newblock Learning overparameterized neural networks via stochastic gradient descent on structured data. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018. |
|
|
|
|
|
\bibitem[Li et~al.(2017)Li, Zhou, Chen, and Li]{li2017meta} |
|
|
Zhenguo Li, Fengwei Zhou, Fei Chen, and Hang Li. |
|
|
\newblock Meta-{SGD}: Learning to learn quickly for few-shot learning. |
|
|
\newblock \emph{arXiv preprint arXiv:1707.09835}, 2017. |
|
|
|
|
|
\bibitem[Liu et~al.(2019)Liu, Socher, and Xiong]{liu2019taming} |
|
|
Hao Liu, Richard Socher, and Caiming Xiong. |
|
|
\newblock Taming {MAML}: Efficient unbiased meta-reinforcement learning.
|
|
\newblock In \emph{International conference on machine learning}, pp.\ 4061--4071. PMLR, 2019. |
|
|
|
|
|
\bibitem[Liu et~al.(2018)Liu, Li, Tang, and Zhou]{liu2018breaking} |
|
|
Qiang Liu, Lihong Li, Ziyang Tang, and Dengyong Zhou. |
|
|
\newblock Breaking the curse of horizon: Infinite-horizon off-policy estimation. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 31, 2018. |
|
|
|
|
|
\bibitem[Liu et~al.(2021{\natexlab{a}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021learning} |
|
|
Tao Liu, Ruida Zhou, Dileep Kalathil, Panganamala Kumar, and Chao Tian. |
|
|
\newblock Learning policies with zero or bounded constraint violation for constrained {MDP}s. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021{\natexlab{a}}. |
|
|
|
|
|
\bibitem[Liu et~al.(2021{\natexlab{b}})Liu, Zhou, Kalathil, Kumar, and Tian]{liu2021fast} |
|
|
Tao Liu, Ruida Zhou, Dileep Kalathil, PR~Kumar, and Chao Tian. |
|
|
\newblock Fast global convergence of policy optimization for constrained {MDP}s. |
|
|
\newblock \emph{arXiv preprint arXiv:2111.00552}, 2021{\natexlab{b}}. |
|
|
|
|
|
\bibitem[Maurer et~al.(2016)Maurer, Pontil, and Romera-Paredes]{maurer2016benefit} |
|
|
Andreas Maurer, Massimiliano Pontil, and Bernardino Romera-Paredes. |
|
|
\newblock The benefit of multitask representation learning. |
|
|
\newblock \emph{Journal of Machine Learning Research}, 17\penalty0 (81):\penalty0 1--32, 2016. |
|
|
|
|
|
\bibitem[Mei et~al.(2020)Mei, Xiao, Szepesvari, and Schuurmans]{mei2020global} |
|
|
Jincheng Mei, Chenjun Xiao, Csaba Szepesvari, and Dale Schuurmans. |
|
|
\newblock On the global convergence rates of softmax policy gradient methods. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 6820--6829. PMLR, 2020. |
|
|
|
|
|
\bibitem[Mitchell et~al.(2021)Mitchell, Rafailov, Peng, Levine, and Finn]{mitchell2021offline} |
|
|
Eric Mitchell, Rafael Rafailov, Xue~Bin Peng, Sergey Levine, and Chelsea Finn. |
|
|
\newblock Offline meta-reinforcement learning with advantage weighting. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 7780--7791. PMLR, 2021. |
|
|
|
|
|
\bibitem[Mokhtari et~al.(2016)Mokhtari, Shahrampour, Jadbabaie, and Ribeiro]{mokhtari2016online} |
|
|
Aryan Mokhtari, Shahin Shahrampour, Ali Jadbabaie, and Alejandro Ribeiro. |
|
|
\newblock Online optimization in dynamic environments: Improved regret rates for strongly convex problems. |
|
|
\newblock In \emph{2016 IEEE 55th Conference on Decision and Control}, pp.\ 7195--7201. IEEE, 2016. |
|
|
|
|
|
\bibitem[Nachum et~al.(2019)Nachum, Chow, Dai, and Li]{nachum2019dualdice} |
|
|
Ofir Nachum, Yinlam Chow, Bo~Dai, and Lihong Li. |
|
|
\newblock {DualDICE}: Behavior-agnostic estimation of discounted stationary distribution corrections.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Neyshabur et~al.(2019)Neyshabur, Li, Bhojanapalli, LeCun, and Srebro]{neyshabur2019towards} |
|
|
Behnam Neyshabur, Zhiyuan Li, Srinadh Bhojanapalli, Yann LeCun, and Nathan Srebro. |
|
|
\newblock Towards understanding the role of over-parametrization in generalization of neural networks. |
|
|
\newblock In \emph{International Conference on Learning Representations (ICLR)}, 2019. |
|
|
|
|
|
\bibitem[Paternain et~al.(2022)Paternain, Calvo-Fullana, Chamon, and Ribeiro]{paternain2022safe} |
|
|
Santiago Paternain, Miguel Calvo-Fullana, Luiz~FO Chamon, and Alejandro Ribeiro. |
|
|
\newblock Safe policies for reinforcement learning via primal-dual methods. |
|
|
\newblock \emph{IEEE Transactions on Automatic Control}, 2022. |
|
|
|
|
|
\bibitem[Resler \& Mansour(2019)Resler and Mansour]{resler2019adversarial} |
|
|
Alon Resler and Yishay Mansour. |
|
|
\newblock Adversarial online learning with noise. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 5429--5437. PMLR, 2019. |
|
|
|
|
|
\bibitem[Rothfuss et~al.(2018)Rothfuss, Lee, Clavera, Asfour, and Abbeel]{rothfuss2018promp} |
|
|
Jonas Rothfuss, Dennis Lee, Ignasi Clavera, Tamim Asfour, and Pieter Abbeel. |
|
|
\newblock {ProMP}: Proximal meta-policy search.
|
|
\newblock \emph{arXiv preprint arXiv:1810.06784}, 2018. |
|
|
|
|
|
\bibitem[Song et~al.(2019)Song, Gao, Yang, Choromanski, Pacchiano, and Tang]{song2019maml} |
|
|
Xingyou Song, Wenbo Gao, Yuxiang Yang, Krzysztof Choromanski, Aldo Pacchiano, and Yunhao Tang. |
|
|
\newblock {ES-MAML}: Simple {H}essian-free meta learning.
|
|
\newblock \emph{arXiv preprint arXiv:1910.01215}, 2019. |
|
|
|
|
|
\bibitem[Suilen et~al.(2022)Suilen, Sim{\~a}o, Jansen, and Parker]{suilen2022robust} |
|
|
Marnix Suilen, Thiago~D Sim{\~a}o, Nils Jansen, and David Parker. |
|
|
\newblock Robust anytime learning of {M}arkov decision processes.
|
|
\newblock \emph{arXiv preprint arXiv:2205.15827}, 2022. |
|
|
|
|
|
\bibitem[Tennenholtz et~al.(2020)Tennenholtz, Shalit, and Mannor]{tennenholtz2020off} |
|
|
Guy Tennenholtz, Uri Shalit, and Shie Mannor. |
|
|
\newblock Off-policy evaluation in partially observable environments. |
|
|
\newblock In \emph{Proceedings of the AAAI Conference on Artificial Intelligence}, volume~34, pp.\ 10276--10283, 2020. |
|
|
|
|
|
\bibitem[Thomas et~al.(2021)Thomas, Pineau, Laroche, et~al.]{thomas2021multi} |
|
|
Philip~S Thomas, Joelle Pineau, Romain Laroche, et~al. |
|
|
\newblock Multi-objective {SPIBB}: Seldonian offline policy improvement with safety constraints in finite {MDP}s.
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Todorov et~al.(2012)Todorov, Erez, and Tassa]{todorov2012mujoco} |
|
|
Emanuel Todorov, Tom Erez, and Yuval Tassa. |
|
|
\newblock {MuJoCo}: A physics engine for model-based control.
|
|
\newblock In \emph{2012 IEEE/RSJ international conference on intelligent robots and systems}, pp.\ 5026--5033. IEEE, 2012. |
|
|
|
|
|
\bibitem[Tripuraneni et~al.(2020)Tripuraneni, Jordan, and Jin]{tripuraneni2020theory} |
|
|
Nilesh Tripuraneni, Michael Jordan, and Chi Jin. |
|
|
\newblock On the theory of transfer learning: The importance of task diversity. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 7852--7862, 2020. |
|
|
|
|
|
\bibitem[Uchibe \& Doya(2007)Uchibe and Doya]{uchibe2007constrained} |
|
|
Eiji Uchibe and Kenji Doya. |
|
|
\newblock Constrained reinforcement learning from intrinsic and extrinsic rewards. |
|
|
\newblock In \emph{2007 IEEE 6th International Conference on Development and Learning}, pp.\ 163--168. IEEE, 2007. |
|
|
|
|
|
\bibitem[Van~den Dries \& Miller(1996)Van~den Dries and Miller]{van1996geometric} |
|
|
Lou Van~den Dries and Chris Miller. |
|
|
\newblock Geometric categories and o-minimal structures. |
|
|
\newblock \emph{Duke Mathematical Journal}, 84\penalty0 (2):\penalty0 497--540, 1996. |
|
|
|
|
|
\bibitem[Wu et~al.(2021)Wu, Zhang, Yang, and Wang]{wu2021offline} |
|
|
Runzhe Wu, Yufeng Zhang, Zhuoran Yang, and Zhaoran Wang. |
|
|
\newblock Offline constrained multi-objective reinforcement learning via pessimistic dual value iteration. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 34, 2021. |
|
|
|
|
|
\bibitem[Xu et~al.(2020)Xu, Wang, and Liang]{xu2020improving} |
|
|
Tengyu Xu, Zhe Wang, and Yingbin Liang. |
|
|
\newblock Improving sample complexity bounds for (natural) actor-critic algorithms. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 4358--4369, 2020. |
|
|
|
|
|
\bibitem[Xu et~al.(2021)Xu, Liang, and Lan]{xu2021crpo} |
|
|
Tengyu Xu, Yingbin Liang, and Guanghui Lan. |
|
|
\newblock {CRPO}: A new approach for safe reinforcement learning with convergence guarantee.
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 11480--11491. PMLR, 2021. |
|
|
|
|
|
\bibitem[Yang et~al.(2016)Yang, Zhang, Jin, and Yi]{yang2016tracking} |
|
|
Tianbao Yang, Lijun Zhang, Rong Jin, and Jinfeng Yi. |
|
|
\newblock Tracking slowly moving clairvoyant: Optimal dynamic regret of online learning with true and noisy gradient. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 449--457. PMLR, 2016. |
|
|
|
|
|
\bibitem[Ying et~al.(2022)Ying, Ding, and Lavaei]{ying2021dual} |
|
|
Donghao Ying, Yuhao Ding, and Javad Lavaei. |
|
|
\newblock A dual approach to constrained {M}arkov decision processes with entropy regularization.
|
|
\newblock \emph{25th International Conference on Artificial Intelligence and Statistics (AISTATS)}, 2022. |
|
|
|
|
|
\bibitem[Young et~al.(2018)Young, Wang, and Taylor]{young2018metatrace} |
|
|
Kenny Young, Baoxiang Wang, and Matthew~E Taylor. |
|
|
\newblock Metatrace: Online step-size tuning by meta-gradient descent for reinforcement learning control. |
|
|
\newblock \emph{arXiv preprint arXiv:1805.04514}, 2018. |
|
|
|
|
|
\bibitem[Yu et~al.(2019)Yu, Yang, Kolar, and Wang]{yu2019convergent} |
|
|
Ming Yu, Zhuoran Yang, Mladen Kolar, and Zhaoran Wang. |
|
|
\newblock Convergent policy optimization for safe reinforcement learning. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 32, 2019. |
|
|
|
|
|
\bibitem[Zhang et~al.(2021)Zhang, Bengio, Hardt, Recht, and Vinyals]{zhang2021understanding} |
|
|
Chiyuan Zhang, Samy Bengio, Moritz Hardt, Benjamin Recht, and Oriol Vinyals. |
|
|
\newblock Understanding deep learning (still) requires rethinking generalization. |
|
|
\newblock \emph{Communications of the ACM}, 64\penalty0 (3):\penalty0 107--115, 2021. |
|
|
|
|
|
\bibitem[Zhang et~al.(2017)Zhang, Yang, Yi, Jin, and Zhou]{zhang2017improved} |
|
|
Lijun Zhang, Tianbao Yang, Jinfeng Yi, Rong Jin, and Zhi-Hua Zhou. |
|
|
\newblock Improved dynamic regret for non-degenerate functions. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 30, 2017. |
|
|
|
|
|
\bibitem[Zhao et~al.(2021)Zhao, Chen, and Thuraisingham]{zhao2021fairness} |
|
|
Chen Zhao, Feng Chen, and Bhavani Thuraisingham. |
|
|
\newblock Fairness-aware online meta-learning. |
|
|
\newblock In \emph{Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \& Data Mining}, pp.\ 2294--2304, 2021. |
|
|
|
|
|
\bibitem[Zhao et~al.(2020)Zhao, Zhang, Zhang, and Zhou]{zhao2020dynamic} |
|
|
Peng Zhao, Yu-Jie Zhang, Lijun Zhang, and Zhi-Hua Zhou. |
|
|
\newblock Dynamic regret of convex and smooth functions. |
|
|
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 12510--12520, 2020. |
|
|
|
|
|
\bibitem[Zinkevich(2003)]{zinkevich2003online} |
|
|
Martin Zinkevich. |
|
|
\newblock Online convex programming and generalized infinitesimal gradient ascent. |
|
|
\newblock In \emph{Proceedings of the 20th international conference on machine learning (icml-03)}, pp.\ 928--936, 2003. |
|
|
|
|
|
\bibitem[Zintgraf et~al.(2021)Zintgraf, Feng, Lu, Igl, Hartikainen, Hofmann, and Whiteson]{zintgraf2021exploration} |
|
|
Luisa~M Zintgraf, Leo Feng, Cong Lu, Maximilian Igl, Kristian Hartikainen, Katja Hofmann, and Shimon Whiteson. |
|
|
\newblock Exploration in approximate hyper-state space for meta reinforcement learning. |
|
|
\newblock In \emph{International Conference on Machine Learning}, pp.\ 12991--13001. PMLR, 2021. |
|
|
|
|
|
\bibitem[Zou et~al.(2018)Zou, Cao, Zhou, and Gu]{zou2018stochastic} |
|
|
Difan Zou, Yuan Cao, Dongruo Zhou, and Quanquan Gu. |
|
|
\newblock Stochastic gradient descent optimizes over-parameterized deep relu networks. |
|
|
\newblock \emph{arXiv preprint arXiv:1811.08888}, 2018. |
|
|
|
|
|
\end{thebibliography} |
|
|
|