Introduction

Let us assume that \((y_i,x_i)\) corresponds to the pair weight (in kg) and height (in cm) for individual \(i\), \(i=1,\ldots,n\). Two simple linear regression models relating the \(y_i\)s to the \(x_i\)s are the Gaussian model \[ y_i|x_i,\beta,\sigma \sim N(\beta x_i,\sigma^2), \] and the Student’s \(t_\nu\) model, where \(\nu\) is kept fixed here, say \(\nu=4\) (heavy-tailed errors). As we will see, the Student’s \(t_4\) model is less sensitive to outliers than the Gaussian model.
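For instance, the probability of an observation beyond three standard units is roughly an order of magnitude larger under the \(t_4\) than under the standard Gaussian, which is what makes the \(t_4\) model more accommodating of outlying points:

curve(dnorm(x),from=-6,to=6,n=1000,ylab="Density",lwd=2)
curve(dt(x,df=4),from=-6,to=6,n=1000,add=TRUE,col=2,lwd=2)
legend("topright",legend=c("N(0,1)","t(4)"),col=c(1,2),lty=1,lwd=2,bty="n")
c(gaussian=1-pnorm(3),student.t4=1-pt(3,df=4))   # tail probabilities beyond 3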

Data collected in class

We collected data from 15 volunteers (my students and myself); below we summarize the data graphically. Notice that we standardized both heights and weights in order to avoid estimating an intercept in the linear regression models.

n     = 15
# heights (cm) in column 1, weights (kg) in column 2
dados = matrix(c(
178,70,
175,80,
184,82,
200,89,
183,80,
173,65,
185,70,
162,70,
190,85,
174,80,
180,87,
165,74,
176,72,
174,63,
197,120),n,2,byrow=TRUE)
x = dados[1:n,1]              # heights
y = dados[1:n,2]              # weights
x = (x - mean(x))/sd(x)       # standardize heights
y = (y - mean(y))/sd(y)       # standardize weights

par(mfrow=c(1,1))
plot(x,y,xlab="Height (cm)",ylab="Weight (kg)",pch=16,main="Data and OLS estimation")
abline(lm(y~x)$coef,col=2,lwd=2)
abline(lm(y[1:(n-1)]~x[1:(n-1)])$coef,col=4,lwd=2)
legend("topleft",legend=c("Woutlier","Without outlier"),col=c(2,4),lty=1,bty="n",lwd=2)
text(x[n],y[n]-0.2,"Outlier=Hedibert")

Ordinary least squares (OLS) estimation

The OLS estimates are the same regardless of whether the errors are Gaussian or Student’s \(t\): \[\begin{eqnarray*} {\hat \beta}_{ols} &=& \frac{\sum_{i=1}^n x_iy_i}{\sum_{i=1}^n x_i^2}\\ {\hat \sigma}^2_{ols} &=& \frac{\sum_{i=1}^n (y_i-{\hat \beta}_{ols} x_i)^2}{n-1}\\ se({\hat \beta}_{ols}) &=& \frac{{\hat \sigma}_{ols}}{\sqrt{\sum_{i=1}^n x_i^2}}. \end{eqnarray*}\]

beta.ols  = sum(x*y)/sum(x*x)
sigma.ols = sqrt(sum((y-x*beta.ols)^2)/(n-1))
se.ols    = sqrt(sigma.ols^2/sum(x*x))
c(beta.ols,se.ols)
## [1] 0.685423 0.194605
sigma.ols
## [1] 0.7281451

Bayesian inference

Prior for \((\beta,\sigma)\)

We will use fairly uninformative and independent priors for \(\beta\) and \(\sigma\): \[ \beta \sim N(b_0,\tau_0^2) \ \ \ \mbox{and} \ \ \ \sigma \sim Gamma(c_0,d_0). \] We fixed the hyperparameters at \(b_0=0\), \(\tau_0=1\), \(c_0=2\) and \(d_0=2\).

b0   = 0
tau0 = 1
c0   = 2
d0   = 2

par(mfrow=c(1,2))
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,ylab="Prior density",main="Prior of beta",xlab="")
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,ylab="Prior density",main="Prior of sigma",xlab="")

Creating R functions

Below we create R functions for the prior density, the two likelihood functions (Gaussian and Student’s \(t_\nu\)) and the proposal density for the SIR algorithm.

# Joint prior density of (beta, sigma)
prior = function(beta,sigma){
  dnorm(beta,b0,tau0)*dgamma(sigma,c0,d0)
}

# Likelihood under the Gaussian model
likelihood.n = function(beta,sigma){
  prod(dnorm(y,beta*x,sigma))
}

# Likelihood under the Student's t(nu) model (location beta*x, scale sigma)
likelihood.t = function(beta,sigma){
  prod(dt((y-beta*x)/sigma,df=nu)/sigma)
}

# Proposal density: Gaussian centered at the OLS estimate for beta,
# Gamma (shape = rate = sigma.ols) for sigma
proposal = function(beta,sigma){
  dnorm(beta,beta.ols,2*se.ols)*dgamma(sigma,sigma.ols,sigma.ols)
}

Using the SIR algorithm for posterior sampling

The sampling importance resampling (SIR) algorithm draws \(M\) candidates \((\beta^{(i)},\sigma^{(i)})\) from a proposal density \(q\), computes importance weights \(w_i \propto p(\beta^{(i)},\sigma^{(i)})\,p(y|x,\beta^{(i)},\sigma^{(i)})/q(\beta^{(i)},\sigma^{(i)})\), and then resamples \(N\) values with replacement with probabilities proportional to the weights. When the proposal is the prior itself, the weights reduce to the likelihood, which is the first branch of the code below.

proposal.is.the.prior = FALSE

set.seed(125353)
nu = 4
M  = 100000
N  = M

if (proposal.is.the.prior==TRUE){
  beta.draw  = rnorm(M,b0,tau0)
  sigma.draw = rgamma(M,c0,d0)
  w.n        = rep(0,M)
  w.t        = rep(0,M)
  for (i in 1:M){
    w.n[i] = likelihood.n(beta.draw[i],sigma.draw[i])
    w.t[i] = likelihood.t(beta.draw[i],sigma.draw[i])   
  }
}else{
  beta.draw  = rnorm(M,beta.ols,2*se.ols)
  sigma.draw = rgamma(M,sigma.ols,sigma.ols)
  w.n        = rep(0,M)
  w.t        = rep(0,M)
  for (i in 1:M){
    w.n[i] = likelihood.n(beta.draw[i],sigma.draw[i])*prior(beta.draw[i],sigma.draw[i])/
             proposal(beta.draw[i],sigma.draw[i])
    w.t[i] = likelihood.t(beta.draw[i],sigma.draw[i])*prior(beta.draw[i],sigma.draw[i])/
             proposal(beta.draw[i],sigma.draw[i])   
  }
}
k.n          = sample(1:M,size=N,replace=TRUE,prob=w.n)
k.t          = sample(1:M,size=N,replace=TRUE,prob=w.t)
beta.post.n  = beta.draw[k.n]
beta.post.t  = beta.draw[k.t]
sigma.post.n = sigma.draw[k.n]
sigma.post.t = sigma.draw[k.t]
par(mfrow=c(2,3))
hist(beta.draw,prob=TRUE,main="Proposal draws",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
hist(beta.post.n,prob=TRUE,main="Posterior draws\nGaussian model",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,add=TRUE,col=4,lwd=2)
hist(beta.post.t,prob=TRUE,main="Posterior draws\nStudent's t(4) model",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,add=TRUE,col=4,lwd=2)

hist(sigma.draw,prob=TRUE,main="Proposal draws",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
hist(sigma.post.n,prob=TRUE,main="Posterior draws\nGaussian model",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,add=TRUE,col=4,lwd=2)
hist(sigma.post.t,prob=TRUE,main="Posterior draws\nStudent's t(4) model",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,add=TRUE,col=4,lwd=2)
legend("topright",legend=c("OLS estimate","Prior density"),col=c(2,4),bty="n",lty=1,lwd=2)

Here we compare \(p(\beta|x,y)\) under both models (Gaussian and Student’s \(t_4\)). We also compare \(p(\sigma|x,y)\) under the Gaussian model with \(p(\sqrt{2}\sigma|x,y)\) under the Student’s \(t_4\) model. Notice that under the Student’s \(t_\nu\) model \(V(y_i|x_i,\sigma,\nu)=\frac{\nu}{\nu-2}\sigma^2\), so for \(\nu=4\) the comparable standard deviation is \(\sqrt{2}\sigma\).
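A quick Monte Carlo check of this rescaling, using an arbitrary illustrative scale sigma.check: the sample variance of \(\sigma\,t_4\) draws should be roughly \(2\sigma^2\), although convergence is slow because the \(t_4\) has an infinite fourth moment.

sigma.check = 0.5                           # arbitrary scale, for illustration only
c(var(sigma.check*rt(1000000,df=4)),        # Monte Carlo estimate of V(sigma*t_4)
  2*sigma.check^2)                          # theoretical value nu/(nu-2)*sigma^2 for nu=4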

par(mfrow=c(1,2))
plot(density(beta.post.n),xlim=range(beta.post.n,beta.post.t),xlab="beta",main="",col=2,lwd=2,ylim=c(0,2.5))
lines(density(beta.post.t),col=4,lwd=2)
abline(v=beta.ols,lwd=2)
plot(density(sigma.post.n),xlim=range(sigma.post.n,sigma.post.t),xlab="Standard deviation",main="",col=2,lwd=2)
lines(density(sqrt(2)*sigma.post.t),col=4,lwd=2)
abline(v=sigma.ols,lwd=2)
legend("topright",legend=c("Gaussian model","Student's t(4) model","OLS"),col=c(2,4,1),bty="n",lty=1,lwd=2)

Comparing the sensitivity of both models to outliers

Here we compare the two models: Gaussian errors, which are more sensitive to outliers, and Student’s \(t_4\) errors, which are less sensitive to outliers.
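Before looking at the fitted mean functions, a quick numerical summary of \(p(\beta|x,y)\) under the two models, based on the resampled draws obtained above (posterior mean and 95% credible interval, computed with a small helper summ):

summ = function(draws){ c(mean=mean(draws),quantile(draws,c(0.025,0.975))) }
rbind(gaussian=summ(beta.post.n),student.t4=summ(beta.post.t))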

nn       = 100
xx       = seq(min(x),max(x),length=nn)
meanf    = matrix(xx,N,nn,byrow=TRUE)*matrix(beta.post.n,N,nn)
meanfq.n = apply(meanf,2,quantile,c(0.025,0.5,0.975))
meanf    = matrix(xx,N,nn,byrow=TRUE)*matrix(beta.post.t,N,nn)
meanfq.t = apply(meanf,2,quantile,c(0.025,0.5,0.975))

par(mfrow=c(1,1))
plot(x,y,xlab="Height (standardized)",ylab="Weight (standardized)",pch=16,ylim=c(-2,3))
title("Posterior predictive for the mean function\nE(weight|height,beta) = beta*height")
text(x[n],y[n]-0.2,"Outlier")
lines(xx,meanfq.n[1,],lwd=2,col=2)
lines(xx,meanfq.n[3,],lwd=2,col=2)
lines(xx,meanfq.t[1,],lwd=2,col=4)
lines(xx,meanfq.t[3,],lwd=2,col=4)
legend("topleft",legend=c("Gaussian model (more sensitive to outliers)","Student's t(4) model (less sensitive to outliers)"),col=c(2,4),bty="n",lty=1,lwd=2)

Incorporating \(\nu\) into the analysis of the Student’s \(t_\nu\) model

For the more energetic and vitamin-fueled among you, repeat the above analysis now including uncertainty regarding \(\nu\). For instance, for \(\nu \in (3,50)\), three well-known noninformative priors are \[\begin{eqnarray*} p_1(\nu) &\propto& \frac{1}{\nu^2}\\ p_2(\nu) &\propto& \frac{\nu}{(\nu+1)^3}\\ p_3(\nu) &\propto& \left(\frac{\nu}{\nu+3}\right)^{1/2}\left\{ \psi'(\nu/2)-\psi'((\nu+1)/2)-\frac{2(\nu+3)}{\nu(\nu+1)^2}\right\}^{1/2}, \end{eqnarray*}\] where \(\psi'(a)=d\psi(a)/da\) (trigamma function), \(\psi(a)=d\log\Gamma(a)/da\) (digamma function) and \(\Gamma(a)=\int_0^{\infty} x^{a-1}e^{-x}dx\) (gamma function).

nus = seq(3,50,length=100)
h = nus[2]-nus[1]

f1 = 1/nus^2
f1 = f1/sum(h*f1)

f2 = 2*nus/(nus+1)^3
f2 = f2/sum(h*f2)

f3 = sqrt(nus/(nus+3))*sqrt(trigamma(nus/2)-trigamma((nus+1)/2)-2*(nus+3)/(nus*(nus+1)^2))
f3 = f3/sum(h*f3)

L = min(f1,f2,f3)
U = max(f1,f2,f3)


par(mfrow=c(2,3))
plot(nus,f1,ylim=c(0,U),type="h",ylab="Density",main=expression(p[1](nu)))
plot(nus,f2,ylim=c(0,U),type="h",ylab="Density",main=expression(p[2](nu)),col=2)
plot(nus,f3,ylim=c(0,U),type="h",ylab="Density",main=expression(p[3](nu)),col=3)

plot(nus,f1,ylim=c(0,U),type="l",ylab="Prior density",lwd=2)
lines(nus,f2,lwd=2,col=2)
lines(nus,f3,lwd=2,col=3)

plot(nus,log(f1),ylim=log(c(L,U)),type="l",ylab="Log prior density",lwd=2)
lines(nus,log(f2),lwd=2,col=2)
lines(nus,log(f3),lwd=2,col=3)

Let us use \(p_2(\nu)\)

Here we sample \((\beta,\sigma)\) from their respective priors, so their prior and proposal terms cancel in the importance weights. For \(\nu\), we use a shifted exponential, \(3+\mbox{Exp}(1)\), as the proposal.

# Student's t likelihood with nu as an argument
likelihood.t = function(beta,sigma,nu){
  prod(dt((y-beta*x)/sigma,df=nu)/sigma)
}
# Proposal density for nu: shifted exponential, nu = 3 + Exp(1)
proposal = function(nu){
  dgamma(nu-3,1,1)
}
# Prior density for nu (p2, up to a normalizing constant)
prior = function(nu){
  nu/(nu+1)^3
}

set.seed(13567)
beta.draw  = rnorm(M,b0,tau0)
sigma.draw = rgamma(M,c0,d0)
nu.draw    = 3+rgamma(M,1,1)
w          = rep(0,M)
for (i in 1:M)
  w[i] = likelihood.t(beta.draw[i],sigma.draw[i],nu.draw[i])*prior(nu.draw[i])/proposal(nu.draw[i])
k = sample(1:M,size=N,replace=TRUE,prob=w)
beta.post.t2  = beta.draw[k]
sigma.post.t2 = sigma.draw[k]
nu.post.t2    = nu.draw[k]

par(mfrow=c(2,3))
hist(beta.post.n,prob=TRUE,xlab=expression(beta),main="Gaussian model")
abline(v=beta.ols,lwd=2,col=2)
hist(beta.post.t,prob=TRUE,xlab=expression(beta),main="Student's t model\nfixed df")
abline(v=beta.ols,lwd=2,col=2)
hist(beta.post.t2,prob=TRUE,xlab=expression(beta),main="Student's t model\nlearning df")
abline(v=beta.ols,lwd=2,col=2)
hist(sigma.post.n,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)
hist(sigma.post.t,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)
hist(sigma.post.t2,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)

par(mfrow=c(1,1))
hist(nu.post.t2,prob=TRUE,main="",xlab=expression(nu))
lines(nus,f2,col=2,lwd=3)
lines(density(nu.post.t2),col=4,lwd=3)
legend("topright",legend=c("Prior","Posterior"),col=c(2,4),lty=1,lwd=3,bty="n")