\documentclass{article}
\usepackage{cse599s12sp, algorithmic, algorithm, url}
\newcommand{\grad}{\nabla}
\newcommand{\w}{\mathbf{w}}
\newcommand{\wt}{\mathbf{w}_t}
\newcommand{\wstar}{\mathbf{w}^\star}
\newcommand{\x}{\mathbf{x}}
%% the format for the lecture environment is
% \begin{lecture}{lecture number}{lecture title}{scribe}{lecturer}{lecture date}
\begin{lecture}{6}{The Online Gradient Descent with adaptive learning rate}{Yanping Huang}{Brendan McMahan}{04/12/2012}
\section{The Online Gradient Descent Algorithm}
In the previous lecture, Zinkevich's online gradient descent~\cite{Zinkevich03} algorithm was presented:
\begin{algorithm}\label{OGD}
\begin{algorithmic}
\STATE ONLINE GRADIENT DESCENT (OGD).
\STATE Inputs: convex feasible set $\Wscr \subseteq \Re^n$, non-increasing step sizes $\eta_1,\eta_2, \ldots \ge 0$, initial $\w_0 \in \Wscr$
\FOR {$t = 1,2,\ldots, $}
\STATE $\wt = \Pi_\Wscr(\w_{t-1} - \eta_t \mathbf{g}_t$), where $\mathbf{g}_t \in \partial f_t(\mathbf{w}_t)$
\ENDFOR
\STATE Here $\Pi_{\Wscr}$ denotes the projection onto the nearest point in $\Wscr$, $\Pi_\Wscr(\w) = \arg\min_{\w' \in \Wscr}\| \w - \w'\|$.
\end{algorithmic}
\end{algorithm}
If we use a feasible set where $\|\w\| \le R$, we showed a general bound for this algorithm of
\begin{equation}\label{getabound}
\text{Regret} \leq \frac{2R^2}{\eta_T} + \frac{1}{2} \sum_{t=1}^T \eta_t g_t^2.
\end{equation}
%
In the previous lecture, assuming $f_1, f_2, \ldots, f_T$ are
$G$-Lipschitz, we let $\eta_t = \frac{R\sqrt{2}}{G\sqrt{t}}$, and
showed the above bound reduces to
\begin{align}
{\rm Regret} \le 2\sqrt{2} R G \sqrt{T}. \label{OGDBound}
\end{align}
Note that this regret bound is the bound for infinite horizon problems, i.e., the algorithm need not know the total number of iterations $T$ in advance. The bound holds on the regret up through round $T$ for all $T \geq 1$. The regret bound for the corresponding finite horizon problems (which holds only for a fixed $T$, which we need to know in advance) can be shown to be $2RG\sqrt{T}$.
Some comments on OGD algorithm:
\begin{itemize}
\item Gradient descent problems with smaller feasible sets $\Wscr$ are easier.\\
This can be easily shown by the regret bound in Eq.~\eqref{OGDBound}.
\item The potential function may not be monotonically increasing or
decreasing, depending on the choice of $\mathbf{w}^\star$. Recall
$\Phi(\mathbf{w}) = \frac{1}{2} \|\mathbf{w} -
\mathbf{w}^\star\|^2$ and let the projection operator $\Pi_{\Wscr}$
be the identity operator, that is $\Wscr = \Re^n$. We have the
below recursive relationship:
\begin{align*}
\Phi(\w_{t+1}) &= \frac{1}{2}\|\w_t - \wstar\|^2 - \eta_t \mathbf{g}_t^T (\wt - \wstar) + \frac{1}{2}\eta_t^2 \|g_t\|^2 \\
&= \Phi(\wt) + \frac{1}{2}\eta_t^2\|g_t\|^2 - \eta_t \mathbf{g}_t^T (\wt - \wstar).
\end{align*}
If $f_t(\wstar) \le f_t(\wt)$, we have $\mathbf{g}_t^T (\wstar - \wt) \le 0$ from Lemma~\ref{sublevels} (below), then $\Phi(\w_{t+1})$ will be smaller than $\Phi(\wt)$ if we choose a small enough learning rate. On the other hand, if $\wstar$ is not a desired point, i.e., $f_t(\wstar) > f_t(\wt)$, we may have $\mathbf{g}_t^T (\wstar - \wt) > 0$. In this case $\Phi(\w_{t+1}) > \Phi(\wt)$.
\begin{lemma}\label{sublevels} Let $\mathbf{g}$ be a sub-gradient of
$f$ at $\mathbf{x}$. If $f(\mathbf{y}) \le f(\mathbf{x})$ then
$\mathbf{g}^T(\mathbf{y} - \mathbf{x}) \le 0$ (immediate from
$f(\mathbf{y}) \ge f(\mathbf{x}) + \mathbf{g}^T(\mathbf{y} -
\mathbf{x})$). Thus, nonzero subgradients of $f$ at $\mathbf{x}$ define supporting
hyperplanes to the sub-level set $\{\mathbf{y} \mid f(\mathbf{y}) \le
f(\mathbf{x})\}$ (see picture).
\begin{center}
\includegraphics[scale=0.5]{sublevel.png}
\end{center}
\end{lemma}
\item In a later lecture, we will analyze the
Follow-the-Regularized-Leader (FTRL) algorithm with adaptive
regularization, where on each round we use a total amount of
regularization like $R(\w) \approx \sqrt{t}\|\w\|^2$.
% Suppose
% $R(\w) = \frac{1}{2} \frac{\sqrt{t}G}{\sqrt{2}R} \|\w\|^2$, it is
% easy to verify $\w_{t+1} = \wt - \frac{R\sqrt{2}}{G\sqrt{t}} g_t$
% in
% FTRL, the same as the OGD with optimal $\eta_t$.
\end{itemize}
\section{Lower Bounds for OGD}
In this section, we would like to show that the regret bound for OGD
is tight by constructing problems whose regret is (up to constant
factors) as large as the regret bound for OGD. Note: In this section,
we use $x$'s instead of $w$'s.
\begin{example}
Large learning rates are bad. First we construct an online convex
optimization problem whose loss function on every round is $f_t(x) =
G |x - x^\star|$, with $x \in \Re$ and $G \in \Re$ a constant. The
corresponding sub-gradient can be written as ${\rm sign}(x -
x^\star)$, where the sign function ${\rm sign}(x) = 1$ for $x > 0$
and ${\rm sign}(x) = - 1$ for $x < 0$. The OGD update rule $x_{t+1}
= x_t - {\rm sign}(x_t - x^\star)\eta G$ will then make the sequence
$\{x_t\}$ oscillate around the optimal point $x^\star$. Suppose at
time $t$, $x^\star - x_t = \epsilon$ with $0 < \epsilon <
G\eta$. Then, we will have $x_{t+1} = x_t + G\eta$, so $x_t$ and
$x_{t+1}$ are in the opposite sides of $x^\star$. After the next
update, we will have $x_{t+2} = x_{t+1} - G \eta = x_t$, and so the
oscillation continues. The resulting regret will be $${\rm Regret} =
\frac{T}{2} G\epsilon + \frac{T}{2}G(G\eta - \epsilon) =
\frac{T}{2}G^2\eta. $$
\begin{center}
\includegraphics[scale=0.4]{l1norm.png}
\includegraphics[scale=0.25]{largeStep.png}
\end{center}
The $\frac{T}{2}G\epsilon$ term counts the $T/2$ rounds where we are
at the first point (without loss of generality, $x_0, x_2, \dots, x_t,
x_{t+2}, \dots$), when we are a distance $\epsilon$ from $x^\star$.
The term $\frac{T}{2}G(G\eta - \epsilon)$ counts the regret on the
remaining $T/2$ rounds when we are a distance $G\eta - \epsilon$ from
$x^\star$.
\end{example}
\begin{example}
Small learning rates are bad, too. Let $x \in [0, D]$, and $f_t(x) =
G(D - x)$. The OGD update rule $x_{t+1} = x_t + G\eta$ will generate
a sequence of $\{x_t\} = \{x_0, x_0 + G\eta, x_0 + 2G\eta,
\ldots,\}$. Let $x_0 = 0$, after $K = \frac{D}{2G\eta}$ steps, the
total regret will be
$$
{\rm Regret} = \sum_{t = 0}^K G(D-tG\eta) \ge \frac{GD}{2}\frac{D}{2G\eta} = \frac{D^2}{4\eta}.
$$
\begin{center}
\includegraphics[scale=0.25]{example2.png}
\end{center}
\end{example}
Using the previous two constructions, for any learning rate, the adversary
can choose a one dimensional problem where regret is at least
$\max{\{\frac{D^2}{4\eta}, G^2\eta\frac{T}{2}\}}$. In comparison, the
regret bound for OGD with feasible region $[0,D]$ and maximum gradient
$G$ has the form of $\frac{D^2}{2\eta} + G^2\eta\frac{T}{2}$. This
shows the regret bound for OGD is tight.
\section{OGD with adaptive learning rate.}
Our goal now will be to study Eq.~\eqref{getabound}, and to
derive better learning rate schedules which produce lower regret
bounds. We begin with a motivating example that shows when this may
be possible.
Suppose in the online optimization game (in one dimension), the adversary plays a
sequence like
\[
g_t = \{0,0,\ldots, 0, 1, 0, \ldots, 0, -1, 0, 1, 0, \ldots\},
\]
$t = 1, \ldots, T$. For example, let $T = 10^{10}$ but only $10^4$ of
$g_t$ are non-zero. The OGD with $\eta_t \approx 1/\sqrt{t}$ will
have a step size decreasing on each round. This choice of step size
will have a regret bound of order $\sqrt{T} = 10^5$. Alternatively, if
we update the step size only when $g_t \neq 0$, we will have a much
lower regret bound of order $\sqrt{10^4} = 10^2$. The key is that we
can safely ignore rounds when $g_t = 0$, because whatever $w_t$ we
choose will incur the same loss as any $w^\star$, so our regret will
be the same as the OGD algorithm that is only updated when there are
non-zero gradients.
To deal with the case where the adversary replaces $0$s with some
infinitesimal number $\epsilon \simeq 0$, a more general step size
updating rule is needed. Suppose we know the sequence
$\{g_t\}_{t=1,\ldots,T}$ in advance, the optimal fixed learning rate
$\hat{\eta} = \frac{2R}{\sqrt{\sum_{t=1}^Tg_t^2}}$ will have a regret
at most $2R\sqrt{\sum_{t=1}^T g_t^2}$. This can be derived by taking
derivatives to optimize for the best fixed rate, as we have done
several times already.
But, we can do almost as well without knowing any of the $g_t$ in
advance! We use the adaptive global learning rate:
\begin{align}
\eta_t &= \frac{R\sqrt{2}}{\sqrt{\sum_{s=1}^t g_s^2}} \label{eq:rate}
\end{align}
The corresponding regret is
\begin{align}
{\rm Regret} \le 2\sqrt{2} R\sqrt{\sum_{t=1}^Tg_t^2 } \label{eq:bb}
\end{align}
It is not obvious that plugging the learning rate from
Eq.~\eqref{eq:rate} into the bound of Eq.~\eqref{getabound} gives a
bound like Eq.~\eqref{eq:bb}. Proving this requires a technical
lemma, see for example~\cite{streeter10}.
We apply the above algorithm to the following online prediction problem:
\begin{example}
{\bf Online Prediction Problem.}
Let $\{\x_t, y_t\}$ be a sequence of learning examples where $\x_t$
represents the feature vector and $y_t$ represents the label. $\x_t
\in \Re^n$, $ y_t \in \{0, 1\}$. We will make predictions using a
\textbf{generalized linear model}, where $\hat{y}_t = \sigma(\wt \cdot
\x_t)$ based on $\x_t$ and parameters $\wt$. The nonlinear function
$\sigma(\cdot)$ maps from $\Re$ to $[0,1]$. For example
$\sigma(\cdot)$ can be a sigmoid function $\sigma(x) = \frac{e^x}{1 +
e^x}$. Then this online prediction problem can be viewed as logistic
regression problem. At each round $t$, the player receives $\x_t$,
makes a prediction $\hat{y}_t$ by choosing a $\wt$, and suffers a loss
$f_t(\wt) = \ell(\wt \cdot \x_t, y_t)$. For generalized linear
models, $\ell$ is usually chosen so that $g_t = \grad f_t(\wt) = (\sigma(\wt \cdot
\x_t) - y_t) \x_t = (\hat{y}_t - y_t) \x_t$.
In problems like document classification, the feature vector $\x_t$ is
usually very sparse. For example, $x_{t,i}$ may represent whether a
word $i$ appears in a document $t$ or not. While $x_t$ is in a very
high $n$-dimensional space ($n$ is the number of words in a
dictionary), typically most $x_{t,i}$ are zero (since most documents
have a relatively small number of distinct words).
% In this case, although the norm $\|\mathbf{g}_t\| > 0$ all
% the time, the projection of $\mathbf{g}_t$ on to a particular
% coordinate will be zero most of the time.
If we choose a feasible set $\Wscr = \{ \w \in \Re^n \mid w_i \in [-B_i,
B_i]\}$ for constants $B_i \in \Re$, we can apply OGD with the learning rate from
Eq.~\eqref{eq:rate} on a per-coordinate basis:
\begin{align*}
w_{t+1,i} &= \Pi_{[-B_i, B_i]} (w_{t,i} - \eta_{t,i} g_{t,i}) \\
{\rm where\ }\ \eta_{t,i} &= \frac{\sqrt{2}B_i}{\sqrt{\sum_{s=1}^t g_{s,i}^2 }}.
\end{align*}
The regret can be shown to be no more than $\sum_{i=1}^n 2B_i \sqrt{2
\sum_{t=1}^T g_{t,i}^2}$. Writing $\mathbf{B} = (B_1, B_2, \ldots,
B_n)$ and $\mathbf{G} = \bigl(\ldots, \sqrt{\sum_{t=1}^T g_{t,i}^2},
\ldots\bigr)$ (so that $\mathbf{G}$ collects the per-coordinate
cumulative gradient norms), we have
\begin{align*}
{\rm Regret} \le 2\sqrt{2}\, \mathbf{B} \cdot \mathbf{G} \le 2\sqrt{2} \|\mathbf{B}\| \|\mathbf{G}\| = 2\sqrt{2}R\sqrt{\sum_{t=1}^T \|\mathbf{g}_t\|^2},
\end{align*}
where $R = \|\mathbf{B}\|$, and so the adaptive per-coordinate learning rate gives a bound at least as good as that of the adaptive global rate, Eq.~\eqref{eq:rate}.
\end{example}
\begin{thebibliography}{10}
\bibitem{Zinkevich03}
M. Zinkevich, ``Online convex programming and generalized infinitesimal gradient ascent'', in \emph{International Conference on Machine Learning}, 2003.
\bibitem{streeter10}
M. Streeter and H.B. McMahan, ``Less Regret via Online Conditioning'', \url{http://arxiv.org/abs/1002.4862}, 2010.
\end{thebibliography}
\end{lecture}
\theend