\documentclass{beamer}

% Theme and Color
\usetheme{Madrid}
\usecolortheme{default}

% Packages
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb, amsfonts}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{bm} % For bold math symbols

% Custom commands from the source text for consistency
\newcommand{\KL}{D_{\mathrm{KL}}}
\def\figref#1{Figure~\ref{#1}}

\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning}
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}}
\institute[VT \& UCB]{
  \inst{1} Virginia Tech \\
  \inst{2} UC Berkeley
}
\date{\today}

\begin{document}

% Title Page
\begin{frame}
  \titlepage
\end{frame}

% Table of Contents
\begin{frame}{Outline}
  \tableofcontents
\end{frame}

% Section 1: Motivation
\section{Motivation}
% Corresponds to the source text Section 1
\begin{frame}{Motivation: Why Meta-Safe RL?}
  \begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)}
    \begin{itemize}
      \item Meta-RL enables agents to learn new tasks quickly with limited experience.
      \item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
    \end{itemize}
  \end{block}
  \begin{block}{The Problem: Safety is Critical}
    \begin{itemize}
      \item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving).
      \item Existing Meta-RL methods do not adequately address these constraints.
      \item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms do not generalize efficiently to new tasks.
    \end{itemize}
  \end{block}
  \begin{block}{Our Goal}
    \begin{itemize}
      \item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL.
      \item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks.
    \end{itemize}
  \end{block}
\end{frame}

% Section 2: Related Work
\section{Related Work}
% Corresponds to the source text Section 1 and Appendix A
\begin{frame}{Related Work}
  \begin{itemize}
    \item \textbf{Meta-Reinforcement Learning:}
    \begin{itemize}
      \item Focuses on learning policy initializations, hyperparameters, etc., for fast adaptation.
      \item Most work targets \alert{unconstrained} environments.
    \end{itemize}
    \item \textbf{Online Meta-Learning:}
    \begin{itemize}
      \item Provides theoretical frameworks, often for convex and decomposable loss functions.
      \item Our work extends this line to the \alert{nonconvex and complex} setting of CMDPs.
    \end{itemize}
    \item \textbf{Safe RL and CMDPs:}
    \begin{itemize}
      \item A rich field with many algorithms (e.g., primal-dual methods and primal approaches such as \alert{CRPO}).
      \item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks.
    \end{itemize}
  \end{itemize}
\end{frame}

% Section 3: Method
\section{Method}
% Corresponds to the source text Sections 2 & 3
\begin{frame}{Method: CMDP-within-Online Framework}
  \begin{block}{Core Idea}
    \begin{itemize}
      \item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks.
      \item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO).
      \item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and the \textbf{Task-Averaged Constraint Violation (TACV)}; see the illustrative sketch on the next slide.
    \end{itemize}
  \end{block}
  \begin{figure}
    \centering
    \includegraphics[width=0.6\textwidth]{illustrate.pdf}
    \caption{Conceptual illustration of the meta-learning process.}
    \label{fig:method_concept}
  \end{figure}
\end{frame}
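% Illustrative sketch (not the paper's implementation)
\begin{frame}[fragile]{Illustrative Sketch: The CMDP-within-Online Loop}
  A minimal Python sketch of the loop on the previous slide. This is
  \emph{not} the paper's implementation: \texttt{solve\_task} is a
  placeholder for a CRPO-style within-task solver, and the meta-update
  below is a simplified stand-in for OGD on the inexact KL meta-loss.
  \scriptsize
\begin{verbatim}
import numpy as np

class MetaLearner:
    def __init__(self, dim, alpha=0.1):
        self.phi = np.ones(dim) / dim   # policy initialization
        self.alpha = alpha              # within-task step size

    def propose(self):
        return self.phi.copy(), self.alpha

    def update(self, pi_hat, meta_lr=0.05):
        # Simplified stand-in for OGD on the inexact KL meta-loss.
        self.phi += meta_lr * (pi_hat - self.phi)

def meta_safe_rl(tasks, solve_task, dim):
    # tasks: sequence of CMDPs; solve_task: a within-task safe RL
    # solver (e.g., CRPO-style), supplied by the caller.
    meta = MetaLearner(dim)
    for cmdp in tasks:
        pi0, alpha = meta.propose()        # meta provides init + step
        pi_hat, data = solve_task(cmdp, pi0, alpha)
        # (data would feed the DualDICE estimate on later slides)
        meta.update(pi_hat)                # meta-update after the task
    return meta
\end{verbatim}
\end{frame}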
% Method Part 1: Primal Approach
\begin{frame}{Method: The Within-Task Algorithm (CRPO)}
  % Corresponds to the source text Section 2.1
  \begin{block}{Constrained Markov Decision Process (CMDP)}
    For each task $t$, the agent aims to solve:
    \begin{equation*}
      \max_{\pi} \; J_{t,0}(\pi) \quad \text{s.t.} \quad \alert{J_{t,i}(\pi) \leq d_{t,i}}, \quad \forall i = 1,\dots,p,
    \end{equation*}
    where $J_{t,0}$ is the expected reward and $J_{t,i}$ are the expected costs with thresholds $d_{t,i}$.
  \end{block}
  \begin{block}{CRPO Algorithm \& Regret}
    \begin{itemize}
      \item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm; see the illustrative sketch on the next slide.
      \item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) are bounded by
      \begin{equation*}
        R_0, R_i \leq \mathcal{O}\left( \frac{\mathbb{E}_{s \sim \nu_t^*}\left[\alert{\KL\big(\pi_t^* \,\|\, \pi_{t,0}\big)}\right]}{\alpha_t M} + \alpha_t \right),
      \end{equation*}
      where $M$ is the number of within-task iterations.
      \item \textbf{Key Insight:} Performance depends heavily on the KL divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$.
      \item Our meta-learner optimizes this upper bound by choosing good $\alert{\pi_{t,0}}$ and $\alert{\alpha_t}$.
    \end{itemize}
  \end{block}
\end{frame}
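% Illustrative sketch (not the paper's implementation)
\begin{frame}[fragile]{Illustrative Sketch: A Constraint-Rectified Step}
  A simplified step in the spirit of CRPO: if an estimated constraint
  exceeds its threshold (plus a tolerance), descend on that constraint's
  cost; otherwise ascend on the reward. The gradient estimators, the
  tolerance \texttt{eta}, and the plain-gradient update are
  simplifications, not the exact algorithm.
  \footnotesize
\begin{verbatim}
def crpo_step(theta, alpha, est_values, est_grads, d, eta=0.1):
    # theta: policy parameters; est_values[i], est_grads[i]:
    # estimated J_i and its gradient (i = 0 reward, i >= 1 costs);
    # d[i-1]: threshold for cost i; eta: tolerance.
    violated = [i for i in range(1, len(d) + 1)
                if est_values[i] > d[i - 1] + eta]
    if violated:
        i = violated[0]            # rectify one violated constraint
        return theta - alpha * est_grads[i]
    # No violation: take a step on the reward objective.
    return theta + alpha * est_grads[0]
\end{verbatim}
\end{frame}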
% Method Part 2: Inexact Framework
\begin{frame}{Method: The Inexact Framework}
  % Corresponds to the source text Section 3.1
  \begin{block}{Challenge: Unknown Optimal Policies}
    \begin{itemize}
      \item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown.
      \item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$.
    \end{itemize}
  \end{block}
  \begin{block}{Solution: Estimate and Bound the Error}
    \begin{itemize}
      \item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from the data $\mathcal{D}_t$ using \alert{DualDICE}.
      \item \textbf{Inexact loss:} The meta-learner optimizes the inexact loss
      \[
        \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big].
      \]
      \item \textbf{Bound the error:} We prove a bound on the estimation error,
      \[
        \big|\mathbb{E}_{\nu_t^*}\big[\KL(\pi_t^* \,\|\, \phi)\big] - \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big]\big| \leq \alert{\epsilon_t}.
      \]
      This bound (Thm.~3.1) is derived using novel techniques from \alert{tame geometry}.
    \end{itemize}
  \end{block}
\end{frame}

% Method Part 3: Adaptive Learning
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates}
  % Corresponds to the source text Section 3.3
  \begin{block}{Challenge: Adapting to Dynamic Environments}
    \begin{itemize}
      \item A fixed meta-initialization may not be optimal if the environment changes over time.
      \item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks.
    \end{itemize}
  \end{block}
  \begin{block}{Solution: Separate Online Learners}
    \begin{itemize}
      \item We decompose the regret upper bound into two components.
      \item We run two parallel Online Gradient Descent (OGD) learners (see the sketch on the next slide):
      \begin{enumerate}
        \item \textbf{INIT}: learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{\mathrm{init}}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t \,\|\, \phi)]$.
        \item \textbf{SIM}: learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{\mathrm{sim}}(\kappa)$.
      \end{enumerate}
      \item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance.
    \end{itemize}
  \end{block}
\end{frame}
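% Illustrative sketch (not the paper's implementation)
\begin{frame}[fragile]{Illustrative Sketch: One Inexact Meta-Update (INIT)}
  A sketch of one OGD step for the INIT learner on the inexact KL loss.
  The weights \texttt{w} stand in for a DualDICE-style distribution
  estimate, policies are represented directly as per-state action
  probabilities, and the projection is a crude stand-in; the SIM learner
  would update the step size analogously on its own loss term.
  \footnotesize
\begin{verbatim}
import numpy as np

def init_ogd_step(phi, pi_hat, states, w, meta_lr=0.05):
    # phi, pi_hat: arrays of shape (num_states, num_actions);
    # states: states sampled from the trajectory data;
    # w[s]: estimated distribution-correction weight for state s.
    grad = np.zeros_like(phi)
    for s in states:
        # d/dphi of KL(pi_hat(.|s) || phi(.|s)) is -pi_hat/phi.
        grad[s] += w[s] * (-pi_hat[s] / phi[s])
    grad /= len(states)
    phi = phi - meta_lr * grad        # OGD step on the inexact loss
    phi = np.clip(phi, 1e-6, None)    # crude projection back to
    return phi / phi.sum(axis=1, keepdims=True)  # the simplex
\end{verbatim}
\end{frame}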
% Section 4: Innovation
\section{Innovation}
% Corresponds to the source text Section 1 (contributions)
\begin{frame}{Our Innovations}
  \begin{block}{Novel Framework and Guarantees}
    \begin{itemize}
      \item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on the task-averaged optimality gap (TAOG) and constraint violation (TACV).
      \item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$).
    \end{itemize}
  \end{block}
  \begin{block}{Practical and Adaptive Algorithm}
    \begin{itemize}
      \item \textbf{Inexact framework}: works with suboptimal policies and estimates their state distributions using \alert{DualDICE}, making the approach practical.
      \item \textbf{Adaptive learning}: the meta-learner adapts both the policy initialization and the learning rate for each task, handling dynamic environments.
    \end{itemize}
  \end{block}
  \begin{block}{Technical Contributions}
    \begin{itemize}
      \item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies.
      \item Extended analysis of \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates.
    \end{itemize}
  \end{block}
\end{frame}

% Transition to Experiments
\begin{frame}
  \centering
  \Huge Experimental Evaluation
\end{frame}

% Section 5: Experimental Method
\section{Experimental Method}
% Corresponds to the source text Section 4
\begin{frame}{Experimental Method}
  \begin{block}{Objective}
    \begin{itemize}
      \item Empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines.
    \end{itemize}
  \end{block}
  \begin{block}{Baselines for Comparison}
    \begin{itemize}
      \item \alert{Random Initialization}: standard CRPO with a new random policy for each task.
      \item \alert{Pre-trained}: initialize with the final policy from the previous task.
      \item \alert{Simple Averaging}: offline average of all previously learned policies.
      \item \alert{Follow the Average Leader (FAL)}: online average of all previously learned policies.
    \end{itemize}
  \end{block}
  \begin{block}{Task Generation}
    \begin{itemize}
      \item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions); see the illustrative sketch on the next slide.
      \item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}.
    \end{itemize}
  \end{block}
\end{frame}
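% Illustrative sketch (not the paper's settings)
\begin{frame}[fragile]{Illustrative Sketch: Generating Related Tasks}
  One plausible way to realize the task generation described on the
  previous slide: perturb nominal environment parameters, with the
  perturbation scale controlling task similarity. The parameter names
  and values below are hypothetical, not the settings used in the paper.
  \footnotesize
\begin{verbatim}
import numpy as np

def sample_tasks(num_tasks, sigma, seed=0):
    # sigma controls task similarity: small sigma gives a
    # high-similarity sequence, large sigma a low-similarity one.
    rng = np.random.default_rng(seed)
    nominal = {"goal_reward": 1.0, "slip_prob": 0.1,
               "cost_limit": 0.2}        # hypothetical parameters
    return [{k: v + sigma * rng.normal() for k, v in nominal.items()}
            for _ in range(num_tasks)]

high_similarity_tasks = sample_tasks(num_tasks=10, sigma=0.01)
low_similarity_tasks  = sample_tasks(num_tasks=10, sigma=0.30)
\end{verbatim}
\end{frame}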
% Section 6: Experimental Setting
\section{Experimental Setting}
% Corresponds to the source text Section 4 and Appendix G
\begin{frame}{Experimental Setting}
  \begin{block}{Environments}
    We use a range of classic control environments with added safety constraints:
    \begin{itemize}
      \item \textbf{OpenAI Gym:}
      \begin{itemize}
        \item \alert{FrozenLake}: discrete state space, $T=10$ tasks.
        \item \alert{Acrobot}: continuous state space, $T=50$ tasks.
      \end{itemize}
      \item \textbf{MuJoCo:}
      \begin{itemize}
        \item \alert{Half-Cheetah}: high-dimensional continuous control, $T=100$ tasks; constraint on head height.
        \item \alert{Humanoid}: very high-dimensional, $T=250$ tasks; constraint on joint angles for smooth motion.
      \end{itemize}
    \end{itemize}
  \end{block}
\end{frame}

% Section 7: Experimental Results
\section{Experimental Results}
% Corresponds to the source text Section 4
\begin{frame}{Experimental Results: Low Task-Similarity}
  \begin{columns}[T]
    \begin{column}{0.5\textwidth}
      \centering
      \textbf{FrozenLake}
      \includegraphics[width=\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf}
    \end{column}
    \begin{column}{0.5\textwidth}
      \centering
      \textbf{Acrobot}
      \includegraphics[width=\textwidth]{Acrobot/Acrobot_low_similarity2.pdf}
    \end{column}
  \end{columns}
  \begin{block}{Observations}
    \begin{itemize}
      \item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely.
      \item It achieves higher rewards while rapidly satisfying the safety constraints (driving the constraint violation to zero).
      \item Simpler baselines such as \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies.
    \end{itemize}
  \end{block}
\end{frame}

\begin{frame}{Experimental Results: MuJoCo Environments}
  \centering
  \textbf{Half-Cheetah (Low Task-Similarity)}
  \begin{figure}
    \includegraphics[width=0.8\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf}
    \includegraphics[width=0.8\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf}
    \caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).}
    \label{fig:halfcheetah}
  \end{figure}
\end{frame}

% Section 8: Ablation Experiment
\section{Ablation Experiment}
\begin{frame}{Ablation Analysis}
  Although no separate ablation study was conducted, the baseline comparisons isolate the contribution of Meta-SRL's key components.
  \begin{block}{Meta-SRL vs. FAL / Simple Averaging}
    \begin{itemize}
      \item \textbf{Ablated component:} the intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound).
      \item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging.
    \end{itemize}
  \end{block}
  \begin{block}{Meta-SRL vs. Pre-trained}
    \begin{itemize}
      \item \textbf{Ablated component:} learning from a history of multiple tasks; the pre-trained baseline only uses the most recent task.
      \item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences.
    \end{itemize}
  \end{block}
  \begin{block}{Conclusion}
    The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety.
  \end{block}
\end{frame}

% Section 9: Deficiencies
\section{Deficiencies}
% Corresponds to the source text Section 5
\begin{frame}{Limitations of the Current Method}
  \begin{itemize}
    \item \textbf{Algorithm-Specific Guarantees:}
    \begin{itemize}
      \item Our theoretical framework is built upon the \alert{CRPO} algorithm.
      \item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds.
    \end{itemize}
    \bigskip
    \item \textbf{No Hard Safety Guarantees During Learning:}
    \begin{itemize}
      \item The framework minimizes the task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}.
      \item It does not guarantee \alert{zero constraint violation} at every step of the learning process, which may be required for safety-critical systems.
    \end{itemize}
  \end{itemize}
\end{frame}

% Section 10: Future Research
\section{Future Research}
% Corresponds to the source text Section 5
\begin{frame}{Future Research Directions}
  \begin{itemize}
    \item \textbf{Meta-SRL with Zero-Violation Guarantees:}
    \begin{itemize}
      \item Design frameworks that enforce hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches.
    \end{itemize}
    \bigskip
    \item \textbf{Extension to More Complex Scenarios:}
    \begin{itemize}
      \item \alert{Non-stationary environments}, where the task distribution itself may shift over time.
      \item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies.
    \end{itemize}
    \bigskip
    \item \textbf{Fairness and Socially Responsible AI:}
    \begin{itemize}
      \item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments.
    \end{itemize}
  \end{itemize}
\end{frame}

% Section 11: End Slide
\section*{End}
\begin{frame}
  \centering
  \Huge Thank You!
  \vfill
  \Large Questions?
\end{frame}

\end{document}