The KL divergence of a distribution $p$ from a distribution $q$ is a measure of dissimilarity,
defined as the expectation under $p$ of the log ratio $\log(p(x)/q(x))$.
It's used a lot in Bayesian inference (specifically, the divergence of a posterior $p$
from a prior $q$), so we list some properties and closed-form expressions here.
For discrete distributions over sample space $\mathcal{X}$, we have:
\begin{equation}
D_{KL}(p||q) = \expect\left[\log\left(\frac{p(x)}{q(x)}\right)\right]= \sum_{x \in \mathcal{X}}
p(x)\log\left(\frac{p(x)}{q(x)}\right)
\end{equation}
whereas for continuous distributions we have the integral:
\begin{equation}
D_{KL}(p||q) = \int_{-\infty}^{\infty} p(x)\log\left(\frac{p(x)}{q(x)}\right)dx
\end{equation}
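As a quick numerical sanity check of both definitions, here's a small Python sketch. The distributions and parameter values are arbitrary, and NumPy/SciPy are assumed:
\begin{verbatim}
# Sanity check of both definitions. Values are arbitrary; NumPy/SciPy assumed.
import numpy as np
from scipy.integrate import quad
from scipy.stats import norm

# Discrete case: two distributions over a 3-element sample space.
p = np.array([0.5, 0.3, 0.2])
q = np.array([0.4, 0.4, 0.2])
kl_discrete = np.sum(p * np.log(p / q))  # nats, since we use natural log

# Continuous case: integrate p(x) log(p(x)/q(x)) numerically.
# A finite range avoids 0 * log(0) issues far in the tails.
p_pdf, q_pdf = norm(0.0, 1.0).pdf, norm(1.0, 2.0).pdf
kl_continuous, _ = quad(lambda x: p_pdf(x) * np.log(p_pdf(x) / q_pdf(x)),
                        -30.0, 30.0)
print(kl_discrete, kl_continuous)
\end{verbatim}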
Remark: KL divergence has units of "bits" when using log base 2; "nats" when using natural log.
Remark: The KL divergence is asymmetric, $D_{KL}(p||q) \neq D_{KL}(q||p)$ in general, and it does not satisfy the triangle inequality. These two properties disqualify it as a distance metric, which is why it's called a divergence rather than a distance. Because of the asymmetry, we usually say that $D_{KL}(p||q)$ is the "divergence of $p$ from $q$" rather than "between $p$ and $q$".
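To make the asymmetry concrete, here's a minimal numerical example (the values are arbitrary):
\begin{verbatim}
# KL is asymmetric: D(p||q) and D(q||p) generally differ. Arbitrary values.
import numpy as np

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.1, 0.2, 0.7])
print(np.sum(p * np.log(p / q)),   # divergence of p from q
      np.sum(q * np.log(q / p)))   # divergence of q from p (differs)
\end{verbatim}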
We often encounter the KL divergence of a Gaussian from another Gaussian. As you'd expect, the algebra works out nicely to closed-form expressions. We list them here for the univariate (Eq. 3) and multivariate (Eq. 4) cases, as well as for the special case of a diagonal-covariance Gaussian posterior and a standard normal prior (Eq. 5).
$\underline{
p(x)=N(\mu_1, \sigma_1^2), q(x)=N(\mu_2, \sigma_2^2)
} \\$
The KL divergence of a univariate Gaussian posterior $p$ from a Gaussian prior $q$ is given by Eq. (3), derived below.
\begin{align*}
D_{KL}(p||q) &= \int_{-\infty}^{\infty} p(x)\log\left(\frac{p(x)}{q(x)}\right)dx \\
&= \int_{-\infty}^{\infty} p(x) \log \left( \frac{\sigma_2}{\sigma_1} \exp \left[ -\frac{(x-\mu_1)^2}{2\sigma_1^2} + \frac{(x-\mu_2)^2}{2\sigma_2^2} \right] \right) dx \\
\tag{Use ln}
&=\ln \left( \frac{\sigma_2}{\sigma_1} \right)
- \frac{1}{2\sigma_1^2} \underbrace{\int_{-\infty}^{\infty}p(x)(x-\mu_1)^2dx}_{\sigma_1^2}
+ \frac{1}{2\sigma_2^2} \int_{-\infty}^{\infty}p(x)(x-\mu_2)^2dx \\
&=\ln \left( \frac{\sigma_2}{\sigma_1} \right) - \frac{1}{2}
+ \frac{1}{2\sigma_2^2} \int_{-\infty}^{\infty}p(x)(x^2-2\mu_2x + \mu_2^2)dx \\
&=\ln \left( \frac{\sigma_2}{\sigma_1} \right) - \frac{1}{2}
+ \frac{1}{2\sigma_2^2} \left( \expect[x^2] - 2\mu_1\mu_2 + \mu_2^2 \right) \\
\tag{$\expect[x^2] = \sigma_1^2 + \mu_1^2$}
&=\ln \left( \frac{\sigma_2}{\sigma_1} \right) - \frac{1}{2}
+ \frac{1}{2\sigma_2^2} \left( \sigma_1^2 + \mu_1^2 - 2\mu_1\mu_2 + \mu_2^2 \right) \\
\tag{3}
D_{KL}(p||q) &=\ln \left( \frac{\sigma_2}{\sigma_1} \right) - \frac{1}{2}
+ \frac{1}{2\sigma_2^2} \left( \sigma_1^2 + (\mu_1 - \mu_2)^2 \right)
\end{align*}
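Eq. (3) is easy to check against direct numerical integration of the definition. A sketch with arbitrary parameters, NumPy/SciPy assumed:
\begin{verbatim}
# Check Eq. (3) against direct numerical integration. Arbitrary parameters.
import numpy as np
from scipy.integrate import quad
from scipy.stats import norm

mu1, s1 = 0.5, 1.2   # posterior p
mu2, s2 = -1.0, 2.0  # prior q

closed_form = np.log(s2 / s1) - 0.5 + (s1**2 + (mu1 - mu2)**2) / (2 * s2**2)

p_pdf, q_pdf = norm(mu1, s1).pdf, norm(mu2, s2).pdf
numeric, _ = quad(lambda x: p_pdf(x) * np.log(p_pdf(x) / q_pdf(x)), -30, 30)
print(closed_form, numeric)  # agree to several decimal places
\end{verbatim}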
$\underline{
p(x)=N(\mu, \sigma^2), q(x)=N(0, 1)
}$
Substituting $\mu_2 = 0$ and $\sigma_2 = 1$ into Eq. (3), we see that for a standard normal
prior, $D_{KL}$ depends only on the mean and variance of the posterior, $\mu$ and $\sigma^2$.
\begin{align*}
D_{KL}(p||q) = -\ln(\sigma) - \frac{1}{2} + \frac{1}{2}(\sigma^2 + \mu^2)
\end{align*}
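A quick Monte Carlo check of this special case, estimating the expected log ratio by sampling from the posterior (arbitrary values; NumPy/SciPy assumed):
\begin{verbatim}
# Monte Carlo check of the special case. Arbitrary values; NumPy/SciPy assumed.
import numpy as np
from scipy.stats import norm

mu, s = 0.7, 0.5
closed_form = -np.log(s) - 0.5 + 0.5 * (s**2 + mu**2)

x = norm(mu, s).rvs(size=200_000, random_state=0)
mc_estimate = np.mean(norm(mu, s).logpdf(x) - norm(0, 1).logpdf(x))
print(closed_form, mc_estimate)  # close, up to sampling noise
\end{verbatim}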
$\underline{
p(\bold{x})=N(\pmb{\mu_1}, \bold{\Sigma_1}), q(\bold{x})=N(\pmb{\mu_2}, \bold{\Sigma_2})
}$
For multivariate Gaussian distributions we have:
\begin{align*}
D_{KL}(p||q) &= \int_{-\infty}^{\infty}
p(\bold{x})\log\left(\frac{p(\bold{x})}{q(\bold{x})}\right)d\bold{x} \\
&= \int_{-\infty}^{\infty} p(\bold{x})
\left(
\frac{1}{2} \ln
\left(
\frac{\begin{vmatrix}\bold{\Sigma_2}\end{vmatrix}}{\begin{vmatrix}\bold{\Sigma_1}\end{vmatrix}}
\right)
- \frac{1}{2} \quadxmu{1}{1}{1}
+ \frac{1}{2} \quadxmu{2}{2}{2}
\right) d\bold{x} \\
&=\frac{1}{2}\ln
\left(
\frac{\begin{vmatrix}\bold{\Sigma_2}\end{vmatrix}}{\begin{vmatrix}\bold{\Sigma_1}\end{vmatrix}}
\right)
-\frac{1}{2}
\underbrace{
\expectb\left[
\quadxmu{1}{1}{1}
\right]
}_{\text{(a)}}
+\frac{1}{2}
\underbrace{
\expectb\left[
\quadxmu{2}{2}{2}
\right]
}_{\text{(b)}}
\end{align*}
To evaluate (a) and (b), we use the trace trick for the expectation of quadratic forms.
Recall that the trace of a square matrix is the sum of its diagonal entries. The quadratic
forms inside (a) and (b) evaluate to scalars (equivalently, $1 \times 1$ matrices), and the
trace of a scalar is the scalar itself. This allows us to write (a) as:
\begin{align*}
\expectb
\left[ tr
\left(
\quadxmu{1}{1}{1}
\right)
\right]
\end{align*}
Now, traces are invariant under cyclic permutation, i.e. $tr(ABC)=tr(BCA)=tr(CAB)$, and
the expectation of the trace of a matrix equals the trace of the expectation,
$\expectb[tr(A)]=tr(\expectb[A])$. Using both properties we get:
\begin{align*}
tr\left(
\expectb
\left[
(\bold{x} - \pmb{\mu_1})(\bold{x} - \pmb{\mu_1})^T \bold{\Sigma_1^{-1}}
\right]
\right)
\end{align*}
Since the inverse covariance matrix is constant w.r.t. $\bold{x}$, we can move it out of the
expectation and find that (a) equals $n$, the dimensionality of $\bold{x}$.
\begin{align*}
tr\left(
\bold{\Sigma_1^{-1}}
\expectb
\left[
(\bold{x}-\pmb{\mu_1})(\bold{x}-\pmb{\mu_1})^T
\right]
\right)
= tr\left(
\bold{\Sigma_1^{-1}}\bold{\Sigma_1}
\right)
= tr\left(\bold{I}\right)
= n
\end{align*}
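This result is easy to verify by sampling: the quadratic form averages to the dimension. A minimal sketch with an arbitrary 3-dimensional Gaussian, NumPy assumed:
\begin{verbatim}
# Monte Carlo check that (a) equals n. Arbitrary 3-d example; NumPy assumed.
import numpy as np

rng = np.random.default_rng(0)
n = 3
mu = np.array([1.0, -2.0, 0.5])
A = rng.standard_normal((n, n))
Sigma = A @ A.T + n * np.eye(n)   # symmetric positive definite covariance

x = rng.multivariate_normal(mu, Sigma, size=200_000)
d = x - mu                        # deviations from the mean
quad_forms = np.einsum('ij,jk,ik->i', d, np.linalg.inv(Sigma), d)
print(quad_forms.mean())          # approximately n = 3
\end{verbatim}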
Using the same trick we evaluate (b). Start by introducing $\pmb{\mu_1}$, i.e. writing
$\bold{x} - \pmb{\mu_2} = (\bold{x} - \pmb{\mu_1}) + (\pmb{\mu_1} - \pmb{\mu_2})$:
\begin{align*}
\tag{Introduce $\pmb{\mu_1}$}
&\expectb
[
((\bold{x} - \pmb{\mu_1}) + (\pmb{\mu_1} - \pmb{\mu_2}))^T
\bold{\Sigma_2^{-1}}
((\bold{x} - \pmb{\mu_1}) + (\pmb{\mu_1} - \pmb{\mu_2}))
] \\
\tag{Distribute $\bold{\Sigma_2^{-1}}$}
&= \expectb
[
\left(
(\bold{x}-\pmb{\mu_1})^T \bold{\Sigma_2^{-1}}
+ (\pmb{\mu_1}-\pmb{\mu_2})^T \bold{\Sigma_2^{-1}}
\right)
((\bold{x} - \pmb{\mu_1}) + (\pmb{\mu_1} - \pmb{\mu_2}))
] \\
&= \expectb
[
\quadxmu{1}{2}{1}
+ (\bold{x} - \pmb{\mu_1})^T \bold{\Sigma_2^{-1}} (\pmb{\mu_1} - \pmb{\mu_2}) \\
& \quad + (\pmb{\mu_1} - \pmb{\mu_2})^T \bold{\Sigma_2^{-1}} (\bold{x} - \pmb{\mu_1})
+ \quadmumu{1}{2}{2}
] \\
\tag{Move constants}
&= \expectb [\quadxmu{1}{2}{1}]
+ \expectb [(\bold{x} - \pmb{\mu_1})^T] \bold{\Sigma_2^{-1}} (\pmb{\mu_1} - \pmb{\mu_2}) \\
& \quad + (\pmb{\mu_1} - \pmb{\mu_2})^T \bold{\Sigma_2^{-1}} \expectb[(\bold{x} - \pmb{\mu_1})]
+ \quadmumu{1}{2}{2}
\end{align*}
Since $\expectb [(\bold{x} - \pmb{\mu_1})] = \expectb[\bold{x}] - \pmb{\mu_1} = \bold{0}$,
we are left with:
\begin{align*}
\expectb [\quadxmu{1}{2}{1}] + \quadmumu{1}{2}{2}
\end{align*}
\begin{align*}
\tag{Trace trick}
\begin{split}
&= \expectb[tr((\bold{x} - \pmb{\mu_1})(\bold{x} - \pmb{\mu_1})^T \bold{\Sigma_2^{-1}})]
+ \quadmumu{1}{2}{2} \\
&= tr(\expectb[(\bold{x} - \pmb{\mu_1})(\bold{x} - \pmb{\mu_1})^T \bold{\Sigma_2^{-1}}])
+ \quadmumu{1}{2}{2}
\end{split}
\end{align*}
\begin{align*}
\tag{Move $\bold{\Sigma_2^{-1}}$}
&= tr(\expectb[(\bold{x} - \pmb{\mu_1})(\bold{x} - \pmb{\mu_1})^T] \bold{\Sigma_2^{-1}})
+ \quadmumu{1}{2}{2} \\
\tag{Definition of $\bold{\Sigma_1}$}
&= tr(\bold{\Sigma_1} \bold{\Sigma_2^{-1}}) + \quadmumu{1}{2}{2}
\end{align*}
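As with (a), this can be checked by sampling (arbitrary 2-dimensional example; NumPy assumed):
\begin{verbatim}
# Monte Carlo check of (b). Arbitrary 2-d example; NumPy assumed.
import numpy as np

rng = np.random.default_rng(1)
mu1, mu2 = np.array([0.0, 1.0]), np.array([2.0, -1.0])
S1 = np.array([[2.0, 0.3], [0.3, 1.0]])                # posterior covariance
S2inv = np.linalg.inv(np.array([[1.5, -0.2], [-0.2, 0.8]]))

x = rng.multivariate_normal(mu1, S1, size=200_000)
d = x - mu2
mc_estimate = np.einsum('ij,jk,ik->i', d, S2inv, d).mean()

diff = mu1 - mu2
closed_form = np.trace(S1 @ S2inv) + diff @ S2inv @ diff
print(mc_estimate, closed_form)  # close, up to sampling noise
\end{verbatim}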
Plugging the simplified expressions for (a) and (b) back in yields Eq. (4):
\begin{equation}
\tag{4}
D_{KL}(p||q) = \frac{1}{2}
\left( \ln
\left(
\frac{\begin{vmatrix}\bold{\Sigma_2}\end{vmatrix}}{\begin{vmatrix}\bold{\Sigma_1}\end{vmatrix}}
\right)
- n + tr(\bold{\Sigma_1} \bold{\Sigma_2^{-1}}) + \quadmumu{1}{2}{2}
\right)
\end{equation}
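Eq. (4) translates directly into code. Below is a sketch (the function name kl\_mvn is our own, and the parameter values are arbitrary) together with a Monte Carlo sanity check; NumPy/SciPy assumed:
\begin{verbatim}
# Eq. (4) in code, with a Monte Carlo sanity check. kl_mvn is our own name;
# parameter values are arbitrary. NumPy/SciPy assumed.
import numpy as np
from scipy.stats import multivariate_normal as mvn

def kl_mvn(mu1, S1, mu2, S2):
    """KL divergence of N(mu1, S1) from N(mu2, S2), per Eq. (4), in nats."""
    n = mu1.shape[0]
    S2inv = np.linalg.inv(S2)
    diff = mu1 - mu2
    return 0.5 * (np.log(np.linalg.det(S2) / np.linalg.det(S1))
                  - n + np.trace(S1 @ S2inv) + diff @ S2inv @ diff)

mu1, mu2 = np.array([0.0, 1.0]), np.array([2.0, -1.0])
S1 = np.array([[2.0, 0.3], [0.3, 1.0]])
S2 = np.array([[1.5, -0.2], [-0.2, 0.8]])

# Estimate E_p[log p(x) - log q(x)] by sampling and compare.
x = np.random.default_rng(2).multivariate_normal(mu1, S1, size=200_000)
mc = np.mean(mvn(mu1, S1).logpdf(x) - mvn(mu2, S2).logpdf(x))
print(kl_mvn(mu1, S1, mu2, S2), mc)  # close, up to sampling noise
\end{verbatim}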
$\underline{
p(\bold{x})=N(\pmb{\mu}, diag(\sigma_1^2, ..., \sigma_n^2)), q(\bold{x})=N(\bold{0}, I)
}$
For a Gaussian posterior with diagonal covariance and a standard normal prior, we have
$\pmb{\mu_1} = \pmb{\mu}, \bold{\Sigma_1}=diag(\sigma_1^2, ..., \sigma_n^2), \pmb{\mu_2}=\bold{0}$
and $\bold{\Sigma_2}=\bold{I}$. Substituting into Eq. (4):
\begin{align*}
D_{KL}(p||q) &= \frac{1}{2}
\left( \ln
\left(
\frac{\begin{vmatrix}\bold{\Sigma_2}\end{vmatrix}}{\begin{vmatrix}\bold{\Sigma_1}\end{vmatrix}}
\right)
- n + tr(\bold{\Sigma_1} \bold{\Sigma_2^{-1}}) + \quadmumu{1}{2}{2}
\right) \\
&= \frac{1}{2}
\left( \ln
\left(
\frac{\begin{vmatrix}\bold{I}\end{vmatrix}}{\begin{vmatrix}\bold{\Sigma_1}\end{vmatrix}}
\right)
- n + tr(\bold{\Sigma_1} \bold{I})
+ (\pmb{\mu} - \bold{0})^T\bold{I}(\pmb{\mu} - \bold{0})
\right) \\
&= \frac{1}{2}
\left(\ln
\left(
\frac{1}{\begin{vmatrix}diag(\sigma_1^2, ..., \sigma_n^2)\end{vmatrix}}
\right)
- n + tr(diag(\sigma_1^2, ..., \sigma_n^2))
+ \pmb{\mu}^T\pmb{\mu}
\right) \\
&= \frac{1}{2}
\left( -\ln
\left(
\prod_{i=1}^n \sigma_i^2
\right)
- \sum_{i=1}^n 1 + \sum_{i=1}^n \sigma_i^2
+ \sum_{i=1}^n \mu_i^2
\right) \\
&= \frac{1}{2}
\left(
- \sum_{i=1}^n \ln\sigma_i^2
+ \sum_{i=1}^n \left( -1 + \sigma_i^2 + \mu_i^2 \right)
\right) \\
\tag{5}
&= \frac{1}{2} \sum_{i=1}^n
\left(
- \ln\sigma_i^2
- 1 + \sigma_i^2 +\mu_i^2
\right)
\end{align*}
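Eq. (5) is the form that typically gets implemented as a per-dimension sum. A sketch, checked against the general Eq. (4); the function names are our own and the values arbitrary, NumPy assumed:
\begin{verbatim}
# Eq. (5) as a per-dimension sum, checked against the general Eq. (4).
# Function names are our own; values are arbitrary. NumPy assumed.
import numpy as np

def kl_diag_vs_std_normal(mu, sigma2):
    """Eq. (5): N(mu, diag(sigma2)) posterior vs N(0, I) prior, in nats."""
    return 0.5 * np.sum(-np.log(sigma2) - 1.0 + sigma2 + mu**2)

def kl_mvn(mu1, S1, mu2, S2):  # Eq. (4), for comparison
    n = mu1.shape[0]
    S2inv = np.linalg.inv(S2)
    diff = mu1 - mu2
    return 0.5 * (np.log(np.linalg.det(S2) / np.linalg.det(S1))
                  - n + np.trace(S1 @ S2inv) + diff @ S2inv @ diff)

mu = np.array([0.3, -0.7, 1.1])
sigma2 = np.array([0.5, 1.5, 0.9])
print(kl_diag_vs_std_normal(mu, sigma2),
      kl_mvn(mu, np.diag(sigma2), np.zeros(3), np.eye(3)))  # identical
\end{verbatim}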