First, we simulate \(n\) pairs of observations \((y_i,x_i)\) as follows: \[ x_i \sim N(0,1) \ \ \ \mbox{and} \ \ \ y_i|x_i \sim N(\alpha+\beta x_i,\sigma^2), \] and randomly select \(d\) of the regressors \(x_i\) to be missing. For convenience, we rearrange the pairs \((y_i,x_i)\) so that the last \(d\) are the missing cases.
set.seed(12345)
n = 10                             # number of observations
d = 5                              # number of missing regressors
x1 = rnorm(n)
alpha = 0
beta = 1
sig = 1
true = c(alpha,beta,sig)           # true values of (alpha, beta, sigma)
y1 = alpha+beta*x1+rnorm(n,0,sig)
missing = sort(sample(1:n,size=d,replace=FALSE))
# Rearrange the pairs so the d missing cases come last
y = c(y1[-missing],y1[missing])
x = c(x1[-missing],x1[missing])
par(mfrow=c(1,1))
plot(x,y,pch=16)
points(x[(n-d+1):n],y[(n-d+1):n],col=4,pch=16,cex=2)
abline(lm(y~x)$coef,col=4,lwd=2)
abline(lm(y[1:(n-d)]~x[1:(n-d)])$coef,col=1,lwd=2)
legend("topleft",legend=c("OLS fit n-ds obs",
"OLS fit n obs"),col=c(1,4),lwd=2)
ols = lm(y~x)
summary(ols)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95785 -0.54544 -0.07149 0.13093 1.76813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2486 0.2757 0.902 0.3936
## x 0.7189 0.3520 2.042 0.0754 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8593 on 8 degrees of freedom
## Multiple R-squared: 0.3427, Adjusted R-squared: 0.2605
## F-statistic: 4.171 on 1 and 8 DF, p-value: 0.07541
ols = lm(y[1:(n-d)]~x[1:(n-d)])
summary(ols)
##
## Call:
## lm(formula = y[1:(n - d)] ~ x[1:(n - d)])
##
## Residuals:
## 1 2 3 4 5
## 0.07834 -0.01394 -0.04091 -0.10670 0.08322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.41399 0.04482 -9.237 0.00268 **
## x[1:(n - d)] 0.31526 0.04486 7.027 0.00592 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09366 on 3 degrees of freedom
## Multiple R-squared: 0.9427, Adjusted R-squared: 0.9236
## F-statistic: 49.39 on 1 and 3 DF, p-value: 0.005919
We will assume the values are missing completely at random and perform Bayesian inference where the unknowns are \[ \theta = (\alpha,\beta,\sigma^2,x_{n-d+1},\ldots,x_n), \] with independent prior \[ p(\theta) = p(\alpha)p(\beta)p(\sigma^2)\prod_{i=1}^{d} p(x_{n-d+i}), \] where \(\alpha \sim N(a_0,A_0)\), \(\beta \sim N(b_0,B_0)\) and \(\sigma^2 \sim IG(c_0,d_0)\). Here, we will try two noninformative priors for the missing regressors \(x_{n-d+i}\): \[\begin{eqnarray*} \mbox{Prior I} &:& x_{n-d+i} \sim N(0,1), \ \ \mbox{for} \ \ i=1,\ldots,d\\ \mbox{Prior II} &:& x_{n-d+i} \sim U(x_{min},x_{max}), \ \ \mbox{for} \ \ i=1,\ldots,d. \end{eqnarray*}\]
# Prior hyperparameters: alpha ~ N(a0,A0), beta ~ N(b0,B0), sig2 ~ IG(c0,d0)
a0 = 0
A0 = 1
b0 = 0
B0 = 1
c0 = 3
d0 = 4
# Bounds of the uniform prior (Prior II), taken from the observed x's
xmin = min(x[1:(n-d)])
xmax = max(x[1:(n-d)])
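As a quick illustrative check of the scale implied by the \(IG(c_0,d_0)\) prior on \(\sigma^2\), note that \(\sigma^2 \sim IG(c_0,d_0)\) is equivalent to \(1/\sigma^2 \sim G(c_0,d_0)\), so we can draw from the prior directly:
# Illustrative only: prior draws of sig2; with c0=3, d0=4 the prior mean is d0/(c0-1) = 2
sig2.prior = 1/rgamma(100000,c0,d0)
summary(sig2.prior)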
The MCMC scheme cycles through the following \(d+3\) full conditionals: \[\begin{eqnarray*} p(\alpha|y_{1:n},x_{1:n},\beta,\sigma^2)\\ p(\beta|y_{1:n},x_{1:n},\alpha,\sigma^2)\\ p(\sigma^2|y_{1:n},x_{1:n},\alpha,\beta)\\ p(x_i|y_{1:n},x_{-i},\alpha,\beta,\sigma^2), \end{eqnarray*}\] for \(i=n-d+1,\ldots,n\). Show that these full conditionals are as follows. \[\begin{eqnarray*} [\alpha] &\sim& N\left\{\left(\frac{1}{A_0}+\frac{n}{\sigma^2}\right)^{-1}\left( \frac{a_0}{A_0}+\frac{\sum_{i=1}^n (y_i-\beta x_i)}{\sigma^2} \right) ;\left(\frac{1}{A_0}+\frac{n}{\sigma^2}\right)^{-1}\right\}\\ [\beta] &\sim& N\left\{\left(\frac{1}{B_0}+\frac{\sum_{i=1}^n x_i^2}{\sigma^2}\right)^{-1}\left( \frac{b_0}{B_0}+\frac{\sum_{i=1}^n (y_i-\alpha)x_i}{\sigma^2} \right) ;\left(\frac{1}{B_0}+\frac{\sum_{i=1}^n x_i^2}{\sigma^2}\right)^{-1}\right\}\\ [\sigma^2] &\sim& IG\left\{c_0+\frac{n}{2};d_0+\frac{\sum_{i=1}^n (y_i-\alpha-\beta x_i)^2}{2}\right\}\\ [x_i] &\sim& \left\{ \begin{array}{ll} N\left\{ \left(1+\frac{\beta^2}{\sigma^2}\right)^{-1}\frac{\beta(y_i-\alpha)}{\sigma^2}; \left(1+\frac{\beta^2}{\sigma^2}\right)^{-1}\right\} & \mbox{Prior I}\\ N_{[x_{min},x_{max}]}\left(\frac{y_i-\alpha}{\beta};\frac{\sigma^2}{\beta^2}\right)& \mbox{Prior II} \end{array} \right. \end{eqnarray*}\] for \(i=n-d+1,\ldots,n\), where \(N_{[x_{min},x_{max}]}\) denotes the normal distribution truncated to \([x_{min},x_{max}]\).
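For instance, for \(\alpha\), combining the \(N(a_0,A_0)\) prior with the likelihood and completing the square in \(\alpha\) gives \[\begin{eqnarray*} p(\alpha|y_{1:n},x_{1:n},\beta,\sigma^2) &\propto& \exp\left\{-\frac{(\alpha-a_0)^2}{2A_0}\right\} \exp\left\{-\frac{\sum_{i=1}^n (y_i-\alpha-\beta x_i)^2}{2\sigma^2}\right\}\\ &\propto& \exp\left\{-\frac{1}{2}\left[\left(\frac{1}{A_0}+\frac{n}{\sigma^2}\right)\alpha^2 -2\alpha\left(\frac{a_0}{A_0}+\frac{\sum_{i=1}^n (y_i-\beta x_i)}{\sigma^2}\right)\right]\right\}, \end{eqnarray*}\] which is the kernel of the normal full conditional for \([\alpha]\) given above. The full conditionals for \(\beta\) and for each missing \(x_i\) follow from the same completing-the-square argument, while conjugacy of the inverse-gamma prior gives \([\sigma^2]\).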
# Initial values
xmiss = rep(mean(x[1:(n-d)]),d)
alpha = 0
beta = 0
# Markov chain parameters
M0 = 10000
M = 10000
niter = M0+M
draws = matrix(0,niter,3+d)
# Gibbs sampler under Prior I: N(0,1) on the missing x's
for (iter in 1:niter){
  xx = c(x[1:(n-d)],xmiss)
  # Sampling sig2
  sig2 = 1/rgamma(1,c0+n/2,d0+(sum((y-alpha-beta*xx)^2))/2)
  # Sampling alpha
  A1 = 1/(1/A0+n/sig2)
  a1 = A1*(a0/A0+sum(y-beta*xx)/sig2)
  alpha = rnorm(1,a1,sqrt(A1))
  # Sampling beta
  B1 = 1/(1/B0+sum(xx^2)/sig2)
  b1 = B1*(b0/B0+sum((y-alpha)*xx)/sig2)
  beta = rnorm(1,b1,sqrt(B1))
  # Sampling missing x's
  vx = 1/(1+beta^2/sig2)
  mx = vx*(beta*(y[(n-d+1):n]-alpha)/sig2)
  xmiss = rnorm(d,mx,sqrt(vx))
  # Storing the Monte Carlo chains
  draws[iter,] = c(alpha,beta,sqrt(sig2),xmiss)
}
drawsI = draws[(M0+1):niter,]   # posterior draws under Prior I (burn-in discarded)
# Drawing from a truncated normal via the inverse-CDF method
rtruncnorm = function(mu,sig,a,b){
  A = pnorm(b,mu,sig)
  B = pnorm(a,mu,sig)
  u = runif(length(mu))
  mu+sig*qnorm(B+u*(A-B))
}
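As a quick illustrative check of this inverse-CDF sampler (the bounds below are arbitrary), all draws should fall inside the truncation interval:
# Illustrative check: 1000 draws from N(0,1) truncated to [-0.5,0.5]
check = replicate(1000,rtruncnorm(0,1,-0.5,0.5))
range(check)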
# Initial values
xmiss = rep(mean(x[1:(n-d)]),d)
alpha = 0
beta = 0
# Markov chain parameters
M0 = 10000
M = 10000
niter = M0+M
draws = matrix(0,niter,3+d)
# Gibbs sampler under Prior II: U(xmin,xmax) on the missing x's
for (iter in 1:niter){
  xx = c(x[1:(n-d)],xmiss)
  # Sampling sig2
  sig2 = 1/rgamma(1,c0+n/2,d0+(sum((y-alpha-beta*xx)^2))/2)
  # Sampling alpha
  A1 = 1/(1/A0+n/sig2)
  a1 = A1*(a0/A0+sum(y-beta*xx)/sig2)
  alpha = rnorm(1,a1,sqrt(A1))
  # Sampling beta
  B1 = 1/(1/B0+sum(xx^2)/sig2)
  b1 = B1*(b0/B0+sum((y-alpha)*xx)/sig2)
  beta = rnorm(1,b1,sqrt(B1))
  # Sampling missing x's
  mx = (y[(n-d+1):n]-alpha)/beta
  vx = sig2/beta^2
  xmiss = rtruncnorm(mx,sqrt(vx),xmin,xmax)
  # Storing the Monte Carlo chains
  draws[iter,] = c(alpha,beta,sqrt(sig2),xmiss)
}
drawsII = draws[(M0+1):niter,]   # posterior draws under Prior II (burn-in discarded)
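Before looking at the plots, a short illustrative summary compares the posterior means of \((\alpha,\beta,\sigma)\) under both priors with the true values used in the simulation:
# Illustrative summary: posterior means vs. true (alpha, beta, sigma)
round(rbind(true = true,
            priorI = colMeans(drawsI[,1:3]),
            priorII = colMeans(drawsII[,1:3])),3)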
par(mfrow=c(3,2))
ts.plot(drawsI[,1],xlab="Iterations",ylab="",main=expression(alpha))
lines(drawsII[,1],col=2)
hist(drawsI[,1],prob=TRUE,main="",xlab=expression(alpha))
lines(density(drawsII[,1]),col=2)
abline(v=true[1],col=3,lwd=3)
ts.plot(drawsI[,2],xlab="Iterations",ylab="",main=expression(beta))
lines(drawsII[,2],col=2)
hist(drawsI[,2],prob=TRUE,main="",xlab=expression(beta))
lines(density(drawsII[,2]),col=2)
abline(v=true[2],col=3,lwd=3)
ts.plot(drawsI[,3],xlab="Iterations",ylab="",main=expression(sigma))
lines(drawsII[,3],col=2)
hist(drawsI[,3],prob=TRUE,main="",xlab=expression(sigma))
lines(density(drawsII[,3]),col=2)
abline(v=true[3],col=3,lwd=3)
par(mfrow=c(2,3))
plot(drawsI[,1],drawsI[,2],xlab=expression(alpha),ylab=expression(beta))
plot(drawsI[,1],drawsI[,3],xlab=expression(alpha),ylab=expression(sigma))
plot(drawsI[,2],drawsI[,3],xlab=expression(beta),ylab=expression(sigma))
plot(drawsII[,1],drawsII[,2],xlab=expression(alpha),ylab=expression(beta))
plot(drawsII[,1],drawsII[,3],xlab=expression(alpha),ylab=expression(sigma))
plot(drawsII[,2],drawsII[,3],xlab=expression(beta),ylab=expression(sigma))
# Range of imputed x's under both priors (used only to set plot limits)
qx = t(apply(rbind(drawsI[,4:(3+d)],drawsII[,4:(3+d)]),2,quantile,c(0.25,0.5,0.75)))
qxI = t(apply(drawsI[,4:(3+d)],2,quantile,c(0.25,0.5,0.75)))
qxII = t(apply(drawsII[,4:(3+d)],2,quantile,c(0.25,0.5,0.75)))
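Since the held-out regressors are known in this simulation, we can also compare their true values with the posterior medians of the imputed \(x_i\) under each prior (again, purely illustrative):
# Illustrative check: true held-out x's vs. posterior medians
round(cbind(true = x[(n-d+1):n],
            priorI = qxI[,2],
            priorII = qxII[,2]),3)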
par(mfrow=c(1,2))
plot(x,y,pch=16,xlim=range(x,qx),main="Prior I")
points(x[(n-d+1):n],y[(n-d+1):n],col=4,pch=16,cex=1.2)
points(qxI[,2],y[(n-d+1):n],col=6,pch=16,cex=1.2)
for (i in 1:d)
segments(qxI[i,1],y[n-d+i],qxI[i,3],y[n-d+i],col=6)
plot(x,y,pch=16,xlim=range(x,qx),main="Prior II")
points(x[(n-d+1):n],y[(n-d+1):n],col=4,pch=16,cex=1.2)
points(qxII[,2],y[(n-d+1):n],col=6,pch=16,cex=1.2)
for (i in 1:d)
segments(qxII[i,1],y[n-d+i],qxII[i,3],y[n-d+i],col=6)
xxx = seq(min(x),max(x),length=100)
fx = array(0,c(2,M,100))
for (i in 1:100){
fx[1,,i] = drawsI[,1]+drawsI[,2]*xxx[i]
fx[2,,i] = drawsII[,1]+drawsII[,2]*xxx[i]
}
qfxI = t(apply(fx[1,,],2,quantile,c(0.25,0.5,0.75)))
qfxII = t(apply(fx[2,,],2,quantile,c(0.25,0.5,0.75)))
par(mfrow=c(1,1))
plot(x,y,pch=16,xlim=range(x,qx))
points(x[(n-d+1):n],y[(n-d+1):n],col=4,pch=16,cex=1.2)
lines(xxx,qfxI[,1],col=2,lty=2,lwd=2)
lines(xxx,qfxI[,2],col=2,lwd=2)
lines(xxx,qfxI[,3],col=2,lty=2,lwd=2)
lines(xxx,qfxII[,1],col=3,lty=2,lwd=2)
lines(xxx,qfxII[,2],col=3,lwd=2)
lines(xxx,qfxII[,3],col=3,lty=2,lwd=2)
abline(lm(y[1:(n-d)]~x[1:(n-d)])$coef,lwd=2)
abline(lm(y~x)$coef,col=4,lwd=2)
legend("topleft",legend=c("OLS (n-d)","OLS (n)","Bayes (Prior I)","Bayes (Prior II)"),col=c(1,4,2,3),lwd=2)