1 Data

Below we have \(n=10\) observations that we will model as Student’s \(t\) with known degree of freedom \(\nu\).

x = c(-2.33,-0.52,-0.51,-0.38,0.25,0.77,0.79,0.80,1.35,1.40)
n = length(x)

2 Student’s \(t\) model

For a random variable \(x\) following a non-standardized Student’s \(t\)-distribution with \(\nu\) degrees of freedom, location parameter \(\theta\), and scale \(\sigma = 1\), the probability density function (PDF) is given by: \[ f(x | \nu, \theta, 1) = \frac{\Gamma\left(\frac{\nu+1}{2}\right)} {\Gamma\left(\frac{\nu}{2}\right) (\nu\pi)^{1/2}} \left[ 1 + \frac{1}{\nu} (x - \theta)^2 \right]^{-\frac{\nu+1}{2}}. \] Therefore, the likelihood for the \(n\) observations is given by \[ L(\theta;\mbox{data}) \propto \prod_{i=1}^n \left[ 1 + \frac{1}{\nu} (x_i - \theta)^2 \right]^{-\frac{\nu+1}{2}}. \] For simplicity, we will assume a standard normal prior for \(\theta\), ie. \(p(\theta) \propto \exp\left\{-0.5 \theta^2\right\}\).

Obviously, the posterior distribtuion \[ p(\theta|\mbox{data}) \propto \exp\left\{-0.5 \theta^2\right\}\prod_{i=1}^n \left[ 1 + \frac{1}{\nu} (x_i - \theta)^2 \right]^{-\frac{\nu+1}{2}}. \] is of unknown form and posterior inference will be feasible via Monte Carlo integration and simulation. For comparison, below we evaluate the posterior density on a fine grid. Most posterior mass lies between \(-1\) and \(1\), with a location measure around \(0.25\).

nu = 4
N  = 100
thetas = seq(-2,2,length=N)
like = rep(0,N)
for (i in 1:N)
  like[i] = prod(dt(x-thetas[i],df=nu))
prior = exp(-thetas^2/2)
post = prior*like
h = thetas[2]-thetas[1]
post = post/(h*sum(post))
plot(thetas,post,type="l",xlab=expression(theta),ylab="Posterior density")

3 SIR

Here we implement SIR where the proposal density is the prior, \(p(\theta)\)

Recall that the SIR algorithm works as follows:

  1. Sample (large) \(M\) draws from the prior \(p(\theta)\) \[ \{\widetilde{\theta}^{(1)},\ldots,\widetilde{\theta}^{(M)}\}; \]

  2. Evalute the weights \(\omega^{(i)} \propto L(\widetilde{\theta}^{(i)};\mbox{data})\);

  3. Resample \(M_1 \leq M\) from the discrete set \(\{\widetilde{\theta}^{(1)},\ldots,\widetilde{\theta}^{(M)}\}\) with probabilities \(\{\omega^{(1)},\ldots,\omega^{(M)}\}\).

It can be shown that the final sampled set, say \[ \{\theta^{(1)},\ldots,\theta^{(M_1)}\}, \] approximates a sample from \(p(\theta|\mbox{data})\). More details about Monte Carlo integration, MC simulation, SIR and other Monte Carlo schemes, can be found in my course notes here.

M  = 100000
M1 = 20000
time.sir = system.time({
theta.draw = rnorm(M)
w = rep(0,M)
for (i in 1:M)
  w[i] = prod(dt(x-theta.draw[i],df=nu))
theta.sir = sample(theta.draw,size=M1,replace=TRUE,prob=w)
})

hist(theta.sir,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)

4 Random-walk Metropolis

The random-walk Metropolis-Hastings variance is pretty small, \(tau=0.14\). With starting value at \(\theta^{(0)}=0\), the random-walk mMetropolis-Hastings algorithm works as follows, assuming the algorithm is at iteration \(i\):

  1. Sample \(\theta^* \sim N(\theta^{(i-1)},\tau^2)\)

  2. Compute acceptance probability \[ \alpha = \min\left\{1,\frac{p(\theta^*)L(\theta^*;\mbox{data})}{p(\theta^{(i-1)})L(\theta^{(i-1)};\mbox{data})}\right\} \]

  3. With probability \(\alpha\), make \(\theta^{(i)}=\theta^*\); otherwise, make \(\theta^{(i)}=\theta^{(i-1)}\).

  4. Make \(i=i+1\) and repeat steps 1 to 4 until \(i=M\).

theta = 0
tau = 0.1
M0 = M-M1
theta.mh = rep(0,M)
time.mh = system.time({
for (iter in 1:M){
  theta1 = rnorm(1,theta,tau)
  accept = min(1,prod(dt(x-theta1,df=nu))/prod(dt(x-theta,df=nu))*
                 dnorm(theta1)/dnorm(theta))
  if (runif(1)<accept){
  theta = theta1
  }
  theta.mh[iter] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.mh,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.mh = theta.mh[(M0+1):M]
acf(theta.mh,main="")
hist(theta.mh,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=4,lwd=2)

5 Independent Metropolis-Hastings

The proposal distribution is Gaussian with mean \(\mu=0\) and variance \(v^2=16\). With starting value at \(\theta^{(0)}=0\), the independent Metropolis-Hastings algorithm works as follows, assuming the algorithm is at iteration \(i\):

  1. Sample \(\theta^* \sim N(\mu,v^2)\)

  2. Compute acceptance probability \[ \alpha = \min\left\{1,\frac{p(\theta^*)L(\theta^*;\mbox{data})}{p(\theta^{(i-1)})L(\theta^{(i-1)};\mbox{data})} \frac{p_N(\theta^{(i-1)}|\mu,v^2)}{p_N(\theta^*|\mu,v^2)} \right\} \]

  3. With probability \(\alpha\), make \(\theta^{(i)}=\theta^*\); otherwise, make \(\theta^{(i)}=\theta^{(i-1)}\).

  4. Make \(i=i+1\) and repeat steps 1 to 4 until \(i=M\).

theta0 = 0
mu = 0
v = 2
theta.imh = rep(0,M)
time.imh = system.time({
for (iter in 1:M){
  theta1 = rnorm(1,mu,v)
  accept = min(1,prod(dt(x-theta1,df=nu))/prod(dt(x-theta,df=nu))*
                 dnorm(theta1)/dnorm(theta)*
                 dnorm(theta,mu,v)/dnorm(theta1,mu,v))
  if (runif(1)<accept){
    theta = theta1
  }
  theta.imh[iter] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.imh,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.imh = theta.imh[(M0+1):M]
acf(theta.imh,main="")
hist(theta.imh,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)

par(mfrow=c(2,3))

ts.plot(theta.sir,main="SIR",ylab="")
ts.plot(theta.mh,main="Random walk MH",ylab="")
ts.plot(theta.imh,main="Independent MH",ylab="")
acf(theta.sir,lag.max=80,main="")
acf(theta.mh,lag.max=80,main="")
acf(theta.imh,lag.max=80,main="")

par(mfrow=c(1,1))
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=3,lwd=2)
lines(density(theta.imh),col=4,lwd=2)
legend("topleft",legend=c("Exact posterior","SIR","RW-MH","I-MH"),col=1:4,bty="n",lwd=2)

6 Gibbs sampler based on data augmentation

The Student’s \(t\) distribution \[ x|\theta \sim t_\nu(\theta,1), \] can be rewritten as follows \[\begin{eqnarray*} x|\theta,\lambda \sim N(\theta,1/\lambda)\\ \lambda \sim IG(\nu/2,\nu/2). \end{eqnarray*}\] More precisely, the Student’s \(t\) distribution is a continuous scale-mixture of Gaussians with mixing distribution the Inverse Gamma: \[ p_t(x|\theta,\nu) = \int_0^\infty p_N(x|\theta,\lambda)p_{IG}(\lambda|\nu/2,\nu/2)d\lambda. \]

Therefore, augmented vector of unknown becomes \((\theta,\lambda)\), where \(\lambda=(\lambda_1,\ldots,\lambda_n)\) and the Gibbs sampler cycles through the full conditionals \(p(\theta|\mbox{data},\lambda_1,\ldots,\lambda_n)\) and \(p(\lambda_i|\mbox{data},\theta)\), for \(i=1,\ldots,n\).

  • Sampling \(\theta\): It is easy to see that \((\theta|\mbox{data},\lambda) \sim N(b,B)\), where \[ B^{-1} = 1+\sum_{i=1}^n \lambda_i^{-1} \qquad \mbox{and} \qquad b = B\sum_{i=1}^n x_i/\lambda_i. \]

  • Sampling \(\lambda_i\): It is also easy to see that \[ (\lambda_i|\mbox{data},\theta) \sim IG\left(\frac{\nu+1}{2},\frac{\nu+(x_i-\theta)^2}{2}\right), \] for \(i=1,\ldots,n\).

theta.gs = rep(0,M)
lambdas = rep(1,n)
time.gs = system.time({
for (i in 1:M){
  var     = 1/(1+sum(1/lambdas))
  mean    = var*sum(x/lambdas)
  theta   = rnorm(1,mean,sqrt(var))
  lambdas = 1/rgamma(n,(nu+1)/2,(nu+(x-theta)^2)/2)
  theta.gs[i] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.gs,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.gs = theta.gs[(M0+1):M]
acf(theta.gs,main="")
hist(theta.gs,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)

7 Comparison

ind = seq(1,M-M0,by=10)
par(mfrow=c(2,4))
plot(ind,theta.sir[ind],main="SIR",ylab="",type="l")
plot(ind,theta.mh[ind],main="Random walk MH",ylab="",type="l")
plot(ind,theta.imh[ind],main="Independent MH",ylab="",type="l")
plot(ind,theta.gs[ind],main="Data augmentation",ylab="",type="l")
acf(theta.sir,lag.max=80,main="")
acf(theta.mh,lag.max=80,main="")
acf(theta.imh,lag.max=80,main="")
acf(theta.gs,lag.max=80,main="")

par(mfrow=c(1,1))
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=3,lwd=2)
lines(density(theta.imh),col=4,lwd=2)
lines(density(theta.gs),col=5,lwd=2)
#legend("topleft",legend=c("Exact posterior","SIR","RW-MH","I-MH","GS"),col=1:5,bty="n",lwd=2)
legend("topleft",legend=c("Exact posterior",
paste("SIR (",round(time.sir[3],2)," sec)",sep=""),
paste("RW-MH (",round(time.mh[3],2)," sec)",sep=""),
paste("I-MH (",round(time.imh[3],2)," sec)",sep=""),
paste("GS (",round(time.gs[3],2)," sec)",sep="")),col=1:5,bty="n",lwd=2)

---
title: "Student's t data"
subtitle: "SIR, RW-MH, IND-MH and GS"
author: "Hedibert Freitas Lopes"
date: "`r Sys.Date()`"
output:
  html_document:
    theme: paper
    highlight: pygments
    toc: true
    toc_depth: 3
    toc_collapsed: true
    toc_float: true
    code_download: true
    number_sections: true
---


# Data
Below we have $n=10$ observations that we will model as Student's $t$ with known degree of freedom $\nu$.

```{r}
x = c(-2.33,-0.52,-0.51,-0.38,0.25,0.77,0.79,0.80,1.35,1.40)
n = length(x)
```

# Student's $t$ model
For a random variable $x$ following a non-standardized Student's 
$t$-distribution with $\nu$ degrees of freedom, location parameter 
$\theta$, and scale $\sigma = 1$, the probability density function 
(PDF) is given by:
$$
f(x | \nu, \theta, 1) = \frac{\Gamma\left(\frac{\nu+1}{2}\right)}
{\Gamma\left(\frac{\nu}{2}\right) (\nu\pi)^{1/2}} \left[ 1 + 
\frac{1}{\nu} (x - \theta)^2 \right]^{-\frac{\nu+1}{2}}.
$$
Therefore, the likelihood for the $n$ observations is given by
$$
L(\theta;\mbox{data}) \propto \prod_{i=1}^n \left[ 1 + 
\frac{1}{\nu} (x_i - \theta)^2 \right]^{-\frac{\nu+1}{2}}.
$$
For simplicity, we will assume a standard normal prior for $\theta$, ie. 
$p(\theta) \propto \exp\left\{-0.5 \theta^2\right\}$.

Obviously, the posterior distribtuion
$$
p(\theta|\mbox{data}) \propto \exp\left\{-0.5 \theta^2\right\}\prod_{i=1}^n \left[ 1 + 
\frac{1}{\nu} (x_i - \theta)^2 \right]^{-\frac{\nu+1}{2}}.
$$
is of unknown form and posterior inference will be feasible via Monte Carlo integration and simulation.
For comparison, below we evaluate the posterior density on a fine grid.  Most posterior mass lies between 
$-1$ and $1$, with a location measure around $0.25$.

```{r fig.align='center', fig.width=8, fig.height=5}
nu = 4
N  = 100
thetas = seq(-2,2,length=N)
like = rep(0,N)
for (i in 1:N)
  like[i] = prod(dt(x-thetas[i],df=nu))
prior = exp(-thetas^2/2)
post = prior*like
h = thetas[2]-thetas[1]
post = post/(h*sum(post))
plot(thetas,post,type="l",xlab=expression(theta),ylab="Posterior density")
```

# SIR

Here we implement SIR where the proposal density is the prior, $p(\theta)$

Recall that the SIR algorithm works as follows:

1) Sample (large) $M$ draws from the prior $p(\theta)$
$$
\{\widetilde{\theta}^{(1)},\ldots,\widetilde{\theta}^{(M)}\};
$$

2) Evalute the weights $\omega^{(i)} \propto L(\widetilde{\theta}^{(i)};\mbox{data})$;

3) Resample $M_1 \leq M$ from the discrete set $\{\widetilde{\theta}^{(1)},\ldots,\widetilde{\theta}^{(M)}\}$  with probabilities $\{\omega^{(1)},\ldots,\omega^{(M)}\}$.

It can be shown that the final sampled set, say 
$$
\{\theta^{(1)},\ldots,\theta^{(M_1)}\},
$$
approximates a sample from $p(\theta|\mbox{data})$.  More details about Monte Carlo integration, MC simulation, SIR and other Monte Carlo schemes, can be found in my course notes [here](https://hedibert.org/wp-content/uploads/2020/01/aprendizagembayesiana-bayesiancomputation.pdf).



```{r fig.align='center', fig.width=8, fig.height=5}
M  = 100000
M1 = 20000
time.sir = system.time({
theta.draw = rnorm(M)
w = rep(0,M)
for (i in 1:M)
  w[i] = prod(dt(x-theta.draw[i],df=nu))
theta.sir = sample(theta.draw,size=M1,replace=TRUE,prob=w)
})

hist(theta.sir,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)
```

# Random-walk Metropolis

The random-walk Metropolis-Hastings variance is pretty small, $tau=0.14$.  With starting value at $\theta^{(0)}=0$, the random-walk mMetropolis-Hastings algorithm works as follows, assuming the algorithm is at iteration $i$:

1. Sample $\theta^* \sim N(\theta^{(i-1)},\tau^2)$
2. Compute acceptance probability
$$
\alpha = \min\left\{1,\frac{p(\theta^*)L(\theta^*;\mbox{data})}{p(\theta^{(i-1)})L(\theta^{(i-1)};\mbox{data})}\right\}
$$
3. With probability $\alpha$, make $\theta^{(i)}=\theta^*$; otherwise, make $\theta^{(i)}=\theta^{(i-1)}$.

4. Make $i=i+1$ and repeat steps 1 to 4 until $i=M$. 

```{r fig.align='center', fig.width=10, fig.height=7}
theta = 0
tau = 0.1
M0 = M-M1
theta.mh = rep(0,M)
time.mh = system.time({
for (iter in 1:M){
  theta1 = rnorm(1,theta,tau)
  accept = min(1,prod(dt(x-theta1,df=nu))/prod(dt(x-theta,df=nu))*
                 dnorm(theta1)/dnorm(theta))
  if (runif(1)<accept){
  theta = theta1
  }
  theta.mh[iter] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.mh,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.mh = theta.mh[(M0+1):M]
acf(theta.mh,main="")
hist(theta.mh,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=4,lwd=2)
```

# Independent Metropolis-Hastings

The proposal distribution is Gaussian with mean $\mu=0$ and variance $v^2=16$.
With starting value at $\theta^{(0)}=0$, the independent Metropolis-Hastings algorithm works as follows, assuming the algorithm is at iteration $i$:

1. Sample $\theta^* \sim N(\mu,v^2)$
2. Compute acceptance probability
$$
\alpha = \min\left\{1,\frac{p(\theta^*)L(\theta^*;\mbox{data})}{p(\theta^{(i-1)})L(\theta^{(i-1)};\mbox{data})}
\frac{p_N(\theta^{(i-1)}|\mu,v^2)}{p_N(\theta^*|\mu,v^2)}
\right\}
$$
3. With probability $\alpha$, make $\theta^{(i)}=\theta^*$; otherwise, make $\theta^{(i)}=\theta^{(i-1)}$.

4. Make $i=i+1$ and repeat steps 1 to 4 until $i=M$. 



```{r fig.align='center', fig.width=10, fig.height=7}
theta0 = 0
mu = 0
v = 2
theta.imh = rep(0,M)
time.imh = system.time({
for (iter in 1:M){
  theta1 = rnorm(1,mu,v)
  accept = min(1,prod(dt(x-theta1,df=nu))/prod(dt(x-theta,df=nu))*
                 dnorm(theta1)/dnorm(theta)*
                 dnorm(theta,mu,v)/dnorm(theta1,mu,v))
  if (runif(1)<accept){
    theta = theta1
  }
  theta.imh[iter] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.imh,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.imh = theta.imh[(M0+1):M]
acf(theta.imh,main="")
hist(theta.imh,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)

par(mfrow=c(2,3))
ts.plot(theta.sir,main="SIR",ylab="")
ts.plot(theta.mh,main="Random walk MH",ylab="")
ts.plot(theta.imh,main="Independent MH",ylab="")
acf(theta.sir,lag.max=80,main="")
acf(theta.mh,lag.max=80,main="")
acf(theta.imh,lag.max=80,main="")

par(mfrow=c(1,1))
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=3,lwd=2)
lines(density(theta.imh),col=4,lwd=2)
legend("topleft",legend=c("Exact posterior","SIR","RW-MH","I-MH"),col=1:4,bty="n",lwd=2)
```

# Gibbs sampler based on data augmentation

The Student's $t$ distribution 
$$
x|\theta \sim t_\nu(\theta,1),
$$
can be rewritten as follows
\begin{eqnarray*}
x|\theta,\lambda \sim N(\theta,1/\lambda)\\
\lambda \sim IG(\nu/2,\nu/2).
\end{eqnarray*}
More precisely, the Student's $t$ distribution is a continuous scale-mixture of Gaussians with mixing distribution the Inverse Gamma:
$$
p_t(x|\theta,\nu) = \int_0^\infty p_N(x|\theta,\lambda)p_{IG}(\lambda|\nu/2,\nu/2)d\lambda.
$$

Therefore, augmented vector of unknown becomes $(\theta,\lambda)$, where $\lambda=(\lambda_1,\ldots,\lambda_n)$ and the Gibbs sampler cycles through the full conditionals $p(\theta|\mbox{data},\lambda_1,\ldots,\lambda_n)$ and $p(\lambda_i|\mbox{data},\theta)$, for $i=1,\ldots,n$.

* Sampling $\theta$: It is easy to see that $(\theta|\mbox{data},\lambda) \sim N(b,B)$,
where 
$$
B^{-1} = 1+\sum_{i=1}^n \lambda_i^{-1} \qquad \mbox{and} \qquad
b = B\sum_{i=1}^n x_i/\lambda_i.
$$

* Sampling $\lambda_i$: It is also easy to see that 
$$
(\lambda_i|\mbox{data},\theta) \sim IG\left(\frac{\nu+1}{2},\frac{\nu+(x_i-\theta)^2}{2}\right),
$$
for $i=1,\ldots,n$.

```{r fig.align='center', fig.width=10, fig.height=7}
theta.gs = rep(0,M)
lambdas = rep(1,n)
time.gs = system.time({
for (i in 1:M){
  var     = 1/(1+sum(1/lambdas))
  mean    = var*sum(x/lambdas)
  theta   = rnorm(1,mean,sqrt(var))
  lambdas = 1/rgamma(n,(nu+1)/2,(nu+(x-theta)^2)/2)
  theta.gs[i] = theta
}
})

par(mfrow=c(2,2))
ts.plot(theta.gs,ylab="",xlab="Iterations")
abline(v=M0,col=2,lwd=2)
theta.gs = theta.gs[(M0+1):M]
acf(theta.gs,main="")
hist(theta.gs,prob=TRUE,xlab=expression(theta),ylab="Posterior",main="")
lines(thetas,post,col=2,lwd=3)
```


# Comparison

```{r fig.align='center', fig.width=10, fig.height=6}
ind = seq(1,M-M0,by=10)
par(mfrow=c(2,4))
plot(ind,theta.sir[ind],main="SIR",ylab="",type="l")
plot(ind,theta.mh[ind],main="Random walk MH",ylab="",type="l")
plot(ind,theta.imh[ind],main="Independent MH",ylab="",type="l")
plot(ind,theta.gs[ind],main="Data augmentation",ylab="",type="l")
acf(theta.sir,lag.max=80,main="")
acf(theta.mh,lag.max=80,main="")
acf(theta.imh,lag.max=80,main="")
acf(theta.gs,lag.max=80,main="")


par(mfrow=c(1,1))
plot(thetas,post,main="",xlab="",lwd=2,type="l")
lines(density(theta.sir),col=2,lwd=2)
lines(density(theta.mh),col=3,lwd=2)
lines(density(theta.imh),col=4,lwd=2)
lines(density(theta.gs),col=5,lwd=2)
#legend("topleft",legend=c("Exact posterior","SIR","RW-MH","I-MH","GS"),col=1:5,bty="n",lwd=2)
legend("topleft",legend=c("Exact posterior",
paste("SIR (",round(time.sir[3],2)," sec)",sep=""),
paste("RW-MH (",round(time.mh[3],2)," sec)",sep=""),
paste("I-MH (",round(time.imh[3],2)," sec)",sep=""),
paste("GS (",round(time.gs[3],2)," sec)",sep="")),col=1:5,bty="n",lwd=2)
```
