Let us assume that \((y_i,x_i)\) is the pair height (in cm) and weight (in kg) for individual \(i\), \(i=1,\ldots,n\). Two simple linear regression models relating the \(y\)s to the \(x\)s are the Gaussian model \[ y_i|x_i,\beta,\sigma \sim N(\beta x_i,\sigma^2), \] and the Student’s \(t_\nu\) model, where \(\nu\) will be kept fixed here, say \(\nu=4\) (heavy-tailed errors). As we will see, the Student’s \(t_4\) model is less sensitive to outliers than the Gaussian model.
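To see why heavy tails matter, the short illustration below (ours, not part of the original analysis) overlays the standard Gaussian and \(t_4\) densities: the \(t_4\) puts about 15 times more mass below \(-3\), so a single extreme observation is far less surprising under it.
# Illustration (added): Gaussian vs t(4) tail mass
round(c(P.gaussian=pnorm(-3),P.t4=pt(-3,df=4)),5)
par(mfrow=c(1,1))
curve(dnorm(x),from=-6,to=6,n=1000,lwd=2,ylab="Density",xlab="",
      main="Standard Gaussian vs Student's t(4)")
curve(dt(x,df=4),from=-6,to=6,n=1000,add=TRUE,col=2,lwd=2)
legend("topright",legend=c("N(0,1)","t(4)"),col=c(1,2),lty=1,lwd=2,bty="n")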
We collected data from 15 volunteers (my students and myself); the data are summarized graphically below. Notice that we standardized both heights and weights in order to avoid estimating an intercept in the linear regression models.
n = 15
# Raw data: height (cm) and weight (kg) for each of the n volunteers
dados = matrix(c(
178,70,
175,80,
184,82,
200,89,
183,80,
173,65,
185,70,
162,70,
190,85,
174,80,
180,87,
165,74,
176,72,
174,63,
197,120),n,2,byrow=TRUE)
x = dados[,1]
y = dados[,2]
# Standardize both variables so that no intercept is needed
x = (x - mean(x))/sd(x)
y = (y - mean(y))/sd(y)
par(mfrow=c(1,1))
plot(x,y,xlab="Height (standardized)",ylab="Weight (standardized)",pch=16,main="Data and OLS estimation")
abline(lm(y~x)$coef,col=2,lwd=2)
abline(lm(y[1:(n-1)]~x[1:(n-1)])$coef,col=4,lwd=2)
legend("topleft",legend=c("Woutlier","Without outlier"),col=c(2,4),lty=1,bty="n",lwd=2)
text(x[n],y[n]-0.2,"Outlier=Hedibert")
The OLS estimates do not depend on the error distribution, so they are the same under the Gaussian and the Student’s \(t\) models: \[\begin{eqnarray*} {\hat \beta}_{ols} &=& \frac{\sum_{i=1}^n x_iy_i}{\sum_{i=1}^n x_i^2}\\ {\hat \sigma}^2_{ols} &=& \frac{\sum_{i=1}^n (y_i-{\hat \beta}_{ols} x_i)^2}{n-1}\\ se({\hat \beta}_{ols}) &=& \frac{{\hat \sigma}_{ols}}{\sqrt{\sum_{i=1}^n x_i^2}}. \end{eqnarray*}\]
# OLS estimates and standard error for the no-intercept regression
beta.ols = sum(x*y)/sum(x*x)
sigma.ols = sqrt(sum((y-x*beta.ols)^2)/(n-1))
se.ols = sqrt(sigma.ols^2/sum(x*x))
c(beta.ols,se.ols)
## [1] 0.685423 0.194605
sigma.ols
## [1] 0.7281451
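As a sanity check (our addition), the no-intercept fit lm(y ~ x - 1) reproduces both \({\hat \beta}_{ols}\) and its standard error, since that fit also has \(n-1\) residual degrees of freedom.
# Sanity check (added): no-intercept least squares via lm
summary(lm(y ~ x - 1))$coef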
We will use fairly uninformative and independent priors for \(\beta\) and \(\sigma\): \[ \beta \sim N(b_0,\tau_0^2) \ \ \ \mbox{and} \ \ \ \sigma \sim Gamma(c_0,d_0). \] We fixed the hyperparameters at \(b_0=0\), \(\tau_0=1\), \(c_0=2\) and \(d_0=2\).
b0 = 0
tau0 = 1
c0 = 2
d0 = 2
par(mfrow=c(1,2))
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,ylab="Prior density",main="Prior of beta",xlab="")
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,ylab="Prior density",main="Prior of sigma",xlab="")
Below we create R functions for the prior density, the two likelihood functions and the proposal density used by the SIR algorithm.
prior = function(beta,sigma){
  dnorm(beta,b0,tau0)*dgamma(sigma,c0,d0)
}
# Gaussian likelihood
likelihood.n = function(beta,sigma){
  prod(dnorm(y,beta*x,sigma))
}
# Student's t likelihood (location beta*x, scale sigma, nu degrees of freedom)
likelihood.t = function(beta,sigma){
  prod(dt((y-beta*x)/sigma,df=nu)/sigma)
}
# SIR proposal: beta ~ N(beta.ols,(2*se.ols)^2) and sigma ~ Gamma(sigma.ols,sigma.ols)
proposal = function(beta,sigma){
  dnorm(beta,beta.ols,2*se.ols)*dgamma(sigma,sigma.ols,sigma.ols)
}
proposal.is.the.prior = FALSE
set.seed(125353)
nu = 4      # degrees of freedom of the Student's t model
M = 100000  # number of proposal draws
N = M       # number of resampled (posterior) draws
if (proposal.is.the.prior==TRUE){
  # Proposal = prior, so the SIR weights reduce to the likelihoods
  beta.draw = rnorm(M,b0,tau0)
  sigma.draw = rgamma(M,c0,d0)
  w.n = rep(0,M)
  w.t = rep(0,M)
  for (i in 1:M){
    w.n[i] = likelihood.n(beta.draw[i],sigma.draw[i])
    w.t[i] = likelihood.t(beta.draw[i],sigma.draw[i])
  }
}else{
  # Proposal != prior, so the weights are likelihood x prior / proposal
  beta.draw = rnorm(M,beta.ols,2*se.ols)
  sigma.draw = rgamma(M,sigma.ols,sigma.ols)
  w.n = rep(0,M)
  w.t = rep(0,M)
  for (i in 1:M){
    w.n[i] = likelihood.n(beta.draw[i],sigma.draw[i])*prior(beta.draw[i],sigma.draw[i])/
             proposal(beta.draw[i],sigma.draw[i])
    w.t[i] = likelihood.t(beta.draw[i],sigma.draw[i])*prior(beta.draw[i],sigma.draw[i])/
             proposal(beta.draw[i],sigma.draw[i])
  }
}
# Resample indices with probabilities proportional to the SIR weights
k.n = sample(1:M,size=N,replace=TRUE,prob=w.n)
k.t = sample(1:M,size=N,replace=TRUE,prob=w.t)
beta.post.n = beta.draw[k.n]
beta.post.t = beta.draw[k.t]
sigma.post.n = sigma.draw[k.n]
sigma.post.t = sigma.draw[k.t]
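A diagnostic worth attaching to any SIR run (our addition, not in the original script) is the effective sample size of the normalized importance weights, \(M_{eff}=1/\sum_{i=1}^M {\tilde w}_i^2\); values far below \(M\) indicate a poorly matched proposal.
# Effective sample size of the normalized SIR weights (added diagnostic)
ess = function(w){w = w/sum(w); 1/sum(w^2)}
c(ess.gaussian=ess(w.n),ess.student=ess(w.t))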
par(mfrow=c(2,3))
hist(beta.draw,prob=TRUE,main="Proposal draws",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
hist(beta.post.n,prob=TRUE,main="Posterior draws\nGaussian model",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,add=TRUE,col=4,lwd=2)
hist(beta.post.t,prob=TRUE,main="Posterior draws\nStudent's t(4) model",xlab="beta")
abline(v=beta.ols,col=2,lwd=2)
curve(dnorm(x,b0,tau0),from=-5,to=5,n=1000,add=TRUE,col=4,lwd=2)
hist(sigma.draw,prob=TRUE,main="Proposal draws",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
hist(sigma.post.n,prob=TRUE,main="Posterior draws\nGaussian model",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,add=TRUE,col=4,lwd=2)
hist(sigma.post.t,prob=TRUE,main="Posterior draws\nStudent's t(4) model",xlab="sigma")
abline(v=sigma.ols,col=2,lwd=2)
curve(dgamma(x,c0,d0),from=0,to=5,n=1000,add=TRUE,col=4,lwd=2)
legend("topright",legend=c("OLS estimate","Prior density"),col=c(2,4),bty="n",lty=1,lwd=2)
Here we compare \(p(\beta|x,y)\) under the two models (Gaussian and Student’s \(t_4\)). We also compare \(p(\sigma|x,y)\) under the Gaussian model with \(p(\sqrt{2}\sigma|x,y)\) under the Student’s \(t_4\) model: since \(V(y_i|x_i,\beta,\sigma,\nu)=\frac{\nu}{\nu-2}\sigma^2\) under the Student’s \(t_\nu\) model, multiplying \(\sigma\) by \(\sqrt{\nu/(\nu-2)}=\sqrt{2}\) (for \(\nu=4\)) puts both parameters on the standard-deviation scale of \(y\).
par(mfrow=c(1,2))
plot(density(beta.post.n),xlim=range(beta.post.n,beta.post.t),xlab=expression(beta),main="",col=2,lwd=2,ylim=c(0,2.5))
lines(density(beta.post.t),col=4,lwd=2)
abline(v=beta.ols,lwd=2)
plot(density(sigma.post.n),xlim=range(sigma.post.n,sigma.post.t),xlab="Standard deviation",main="",col=2,lwd=2)
lines(density(sqrt(2)*sigma.post.t),col=4,lwd=2)
abline(v=sigma.ols,lwd=2)
legend("topright",legend=c("Gaussian model","Student's t(4) model","OLS"),col=c(2,4,1),bty="n",lty=1,lwd=2)
Here we compare the posterior mean functions under the two error models: Gaussian errors, which are more sensitive to outliers, and Student’s \(t_4\) errors, which are less so.
# Pointwise 2.5%, 50% and 97.5% quantiles of beta*height over a grid of heights
nn = 100
xx = seq(min(x),max(x),length=nn)
meanf = matrix(xx,N,nn,byrow=TRUE)*matrix(beta.post.n,N,nn)
meanfq.n = apply(meanf,2,quantile,c(0.025,0.5,0.975))
meanf = matrix(xx,N,nn,byrow=TRUE)*matrix(beta.post.t,N,nn)
meanfq.t = apply(meanf,2,quantile,c(0.025,0.5,0.975))
par(mfrow=c(1,1))
plot(x,y,xlab="Height (standardized)",ylab="Weight (standardized)",pch=16,ylim=c(-2,3))
title("Posterior predictive for the mean function\nE(weight|height,beta) = beta*height")
text(x[n],y[n]-0.2,"Outlier")
lines(xx,meanfq.n[1,],lwd=2,col=2)
lines(xx,meanfq.n[3,],lwd=2,col=2)
lines(xx,meanfq.t[1,],lwd=2,col=4)
lines(xx,meanfq.t[3,],lwd=2,col=4)
legend("topleft",legend=c("Gaussian model (more sensitive to outliers)","Student's t(4) model (less sensitive to outliers)"),col=c(2,4),bty="n",lty=1,lwd=2)
For the more energized and vitamined among you, repeat the above analysis now including uncertainty about \(\nu\). For instance, for \(\nu \in (3,50)\), three well-known noninformative priors are \[\begin{eqnarray*} p_1(\nu) &\propto& \frac{1}{\nu^2}\\ p_2(\nu) &\propto& \frac{\nu}{(\nu+1)^3}\\ p_3(\nu) &\propto& \left(\frac{\nu}{\nu+3}\right)^{1/2}\left\{ \psi'(\nu/2)-\psi'((\nu+1)/2)-\frac{2(\nu+3)}{\nu(\nu+1)^2}\right\}^{1/2}, \end{eqnarray*}\] where \(\psi'(a)=d\{\psi(a)\}/da\) is the trigamma function, \(\psi(a)=d\{\log \Gamma(a)\}/da\) the digamma function and \(\Gamma(a)=\int_0^{\infty} x^{a-1}e^{-x}dx\) the gamma function.
# Evaluate the three priors on a grid over (3,50) and normalize numerically
nus = seq(3,50,length=100)
h = nus[2]-nus[1]
f1 = 1/nus^2
f1 = f1/sum(h*f1)
f2 = 2*nus/(nus+1)^3
f2 = f2/sum(h*f2)
f3 = sqrt(nus/(nus+3))*sqrt(trigamma(nus/2)-trigamma((nus+1)/2)-2*(nus+3)/(nus*(nus+1)^2))
f3 = f3/sum(h*f3)
L = min(f1,f2,f3)
U = max(f1,f2,f3)
par(mfrow=c(2,3))
plot(nus,f1,ylim=c(0,U),type="h",ylab="Density",main=expression(p[1](nu)))
plot(nus,f2,ylim=c(0,U),type="h",ylab="Density",main=expression(p[2](nu)),col=2)
plot(nus,f3,ylim=c(0,U),type="h",ylab="Density",main=expression(p[3](nu)),col=3)
plot(nus,f1,ylim=c(0,U),type="l",ylab="Prior density",lwd=2)
lines(nus,f2,lwd=2,col=2)
lines(nus,f3,lwd=2,col=3)
plot(nus,log(f1),ylim=log(c(L,U)),type="l",ylab="Log prior density",lwd=2)
lines(nus,log(f2),lwd=2,col=2)
lines(nus,log(f3),lwd=2,col=3)
Here we sample \((\beta,\sigma)\) from their respective priors. For \(\nu\) we adopt the prior \(p_2(\nu)\) and use a shifted exponential, \(\nu = 3+Exp(1)\), as proposal.
likelihood.t = function(beta,sigma,nu){
  prod(dt((y-beta*x)/sigma,df=nu)/sigma)
}
# Proposal density for nu: shifted exponential, nu = 3 + Exp(1)
proposal = function(nu){
  dgamma(nu-3,1,1)
}
# Prior p2(nu), up to a normalizing constant
prior = function(nu){
  nu/(nu+1)^3
}
set.seed(13567)
# Draw (beta,sigma) from their priors and nu from the shifted exponential
beta.draw = rnorm(M,b0,tau0)
sigma.draw = rgamma(M,c0,d0)
nu.draw = 3+rgamma(M,1,1)
w = rep(0,M)
for (i in 1:M)
  w[i] = likelihood.t(beta.draw[i],sigma.draw[i],nu.draw[i])*prior(nu.draw[i])/proposal(nu.draw[i])
# SIR resampling step
k = sample(1:M,size=N,replace=TRUE,prob=w)
beta.post.t2 = beta.draw[k]
sigma.post.t2 = sigma.draw[k]
nu.post.t2 = nu.draw[k]
par(mfrow=c(2,3))
hist(beta.post.n,prob=TRUE,xlab=expression(beta),main="Gaussian model")
abline(v=beta.ols,lwd=2,col=2)
hist(beta.post.t,prob=TRUE,xlab=expression(beta),main="Student's t model\nfixed df")
abline(v=beta.ols,lwd=2,col=2)
hist(beta.post.t2,prob=TRUE,xlab=expression(beta),main="Student's t model\nlearning df")
abline(v=beta.ols,lwd=2,col=2)
hist(sigma.post.n,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)
hist(sigma.post.t,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)
hist(sigma.post.t2,prob=TRUE,xlab=expression(sigma),main="")
abline(v=sigma.ols,lwd=2,col=2)
par(mfrow=c(1,1))
hist(nu.post.t2,prob=TRUE,main="",xlab=expression(nu))
lines(nus,f2,col=2,lwd=3)
lines(density(nu.post.t2),col=4,lwd=3)
legend("topright",legend=c("Prior","Posterior"),col=c(2,4),lty=1,lwd=3,bty="n")