Boston housing data

library("mlbench")
library("mvtnorm")
library("leaps")

# center and scale a vector to mean 0 and variance 1
standardize = function(x){
  (x-mean(x))/sd(x)
}
# log predictive density of y: a multivariate Student-t with df nu,
# location X%*%b and scale matrix sig2*(I + X%*%B%*%t(X))
logpredictive = function(y,X,b,B,nu,sig2){
  In = diag(1,nrow(X))
  return(dmvt(y,X%*%b,sig2*(In+X%*%B%*%t(X)),df=nu,log=TRUE))
}
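The function logpredictive evaluates the log predictive density of \(y\) in the conjugate Gaussian linear model: integrating out \(\beta \sim N(b,\sigma^2 B)\) and \(\sigma^2\) (inverse gamma with \(\nu\) degrees of freedom and scale \(\sigma^2\)) from \(y \mid \beta,\sigma^2 \sim N(X\beta, \sigma^2 I_n)\) yields a multivariate Student-\(t\),

\[
p(y) = t_{\nu}\!\left(y \mid X b,\ \sigma^2\,(I_n + X B X')\right).
\]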

data(BostonHousing2)
attach(BostonHousing2)

Xnames = c("lstat","rm","b","ptratio","lon","lat","indus","rad","nox","chas")

Xnames1 = c("% of lower status of the population","average number of rooms per dwelling",
"1000(B - 0.63)^2, where B = proportion of blacks by town","pupil-teacher ratio by town","longitude of census tract",
"latitude of census tract","proportion of non-retail business acres per town","index of accessibility to radial highways",
"nitric oxides concentration (parts per 10 million)","Charles River dummy variable")
yname = "median value of homes"

Map of the Boston area

par(mfrow=c(1,1))
plot(lon,lat)

Covariates

y = cmedv[cmedv<50]                     # drop tracts with top-coded response (cmedv = 50)
X = cbind(lstat,rm,b,ptratio,lon,lat,indus,rad,nox,chas)
X = X[cmedv<50,]
n = nrow(X)
p = ncol(X)
y = standardize(y)                      # center and scale the response
for (i in 1:p)
  X[,i] = standardize(X[,i])            # center and scale each covariate
colnames(X) = Xnames

par(mfrow=c(2,5))
for (i in 1:p){
  plot(X[,i],y,xlab="",ylab="")
  mtext(side=1,line=2,text=Xnames1[i],cex=0.55)
  mtext(side=2,line=2,text=yname,cex=0.55)
} 

OLS fit

fit = lm(y~X-1)
summary(fit)
## 
## Call:
## lm(formula = y ~ X - 1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36495 -0.28402 -0.06474  0.23647  2.11424 
## 
## Coefficients:
##          Estimate Std. Error t value Pr(>|t|)    
## Xlstat   -0.35350    0.03694  -9.570  < 2e-16 ***
## Xrm       0.32673    0.03006  10.870  < 2e-16 ***
## Xb        0.10777    0.02611   4.128 4.31e-05 ***
## Xptratio -0.18398    0.02994  -6.144 1.69e-09 ***
## Xlon     -0.13123    0.02626  -4.997 8.17e-07 ***
## Xlat      0.04839    0.02383   2.031   0.0428 *  
## Xindus   -0.09371    0.04110  -2.280   0.0230 *  
## Xrad     -0.01962    0.03556  -0.552   0.5814    
## Xnox     -0.02379    0.04227  -0.563   0.5737    
## Xchas     0.01481    0.02349   0.631   0.5286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5014 on 480 degrees of freedom
## Multiple R-squared:  0.7533, Adjusted R-squared:  0.7481 
## F-statistic: 146.5 on 10 and 480 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
# histogram of the standardized response with the density of the OLS residuals overlaid
hist(y,breaks=20,xlim=c(-3,3),prob=TRUE,ylim=c(0,1.1))
lines(density(fit$residuals),col=2,lwd=3)

OLS from scratch

Here we replicate the computations performed by the R function lm.
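In matrix form,

\[
\hat\beta = (X'X)^{-1}X'y, \qquad
\hat\sigma^2 = \frac{(y - X\hat\beta)'(y - X\hat\beta)}{n-p}, \qquad
se(\hat\beta_j) = \sqrt{\hat\sigma^2\,[(X'X)^{-1}]_{jj}},
\]

and the \(t\)-statistics \(\hat\beta_j/se(\hat\beta_j)\) are compared to a Student-\(t\) distribution with \(n-p\) degrees of freedom.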

iXtX    = solve(t(X)%*%X)
betahat = iXtX%*%t(X)%*%y
sigma2hat = sum((y-X%*%betahat)^2)/(n-p)
se.betahat = sqrt(sigma2hat*diag(iXtX))
L = betahat+qt(0.025,df=n-p)*se.betahat   # lower 95% confidence limit
U = betahat+qt(0.975,df=n-p)*se.betahat   # upper 95% confidence limit
tvalue = abs(betahat)/se.betahat
pvalue = 2*(1-pt(tvalue,df=n-p))

tab = cbind(round(betahat,5),round(se.betahat,5),
round(tvalue,3),round(pvalue,5))
colnames(tab) = c("Estimate","Std.Error","t-value","Pr(>|t|)")

Let us compare the results. They should be identical!

tab
##         Estimate Std.Error t-value Pr(>|t|)
## lstat   -0.35350   0.03694   9.570  0.00000
## rm       0.32673   0.03006  10.870  0.00000
## b        0.10777   0.02611   4.128  0.00004
## ptratio -0.18398   0.02994   6.144  0.00000
## lon     -0.13123   0.02626   4.997  0.00000
## lat      0.04839   0.02383   2.031  0.04282
## indus   -0.09371   0.04110   2.280  0.02305
## rad     -0.01962   0.03556   0.552  0.58144
## nox     -0.02379   0.04227   0.563  0.57371
## chas     0.01481   0.02349   0.631  0.52861
summary(lm(y~X-1))
## 
## Call:
## lm(formula = y ~ X - 1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36495 -0.28402 -0.06474  0.23647  2.11424 
## 
## Coefficients:
##          Estimate Std. Error t value Pr(>|t|)    
## Xlstat   -0.35350    0.03694  -9.570  < 2e-16 ***
## Xrm       0.32673    0.03006  10.870  < 2e-16 ***
## Xb        0.10777    0.02611   4.128 4.31e-05 ***
## Xptratio -0.18398    0.02994  -6.144 1.69e-09 ***
## Xlon     -0.13123    0.02626  -4.997 8.17e-07 ***
## Xlat      0.04839    0.02383   2.031   0.0428 *  
## Xindus   -0.09371    0.04110  -2.280   0.0230 *  
## Xrad     -0.01962    0.03556  -0.552   0.5814    
## Xnox     -0.02379    0.04227  -0.563   0.5737    
## Xchas     0.01481    0.02349   0.631   0.5286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5014 on 480 degrees of freedom
## Multiple R-squared:  0.7533, Adjusted R-squared:  0.7481 
## F-statistic: 146.5 on 10 and 480 DF,  p-value: < 2.2e-16
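A quick numerical check (a sketch; coef(summary(fit)) returns lm's unrounded coefficient table):

max(abs(betahat - coef(fit)))                   # should be numerically zero
max(abs(se.betahat - coef(summary(fit))[,2]))   # should be numerically zero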

Model search (regsubsets)

nvmax = 10
data  = data.frame(y=y,X=X)
regs  = regsubsets(y~.,data=data,method="seqrep",nvmax=nvmax,intercept=FALSE)
regs.summary = summary(regs)

par(mfrow=c(1,2))
plot(regs.summary$bic,pch=16,xlab="Number of predictors",ylab="BIC")
plot(regs,scale="bic")

Bayesian estimation of the complete model
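With prior \(\beta \sim N(b_0, \sigma^2 B_0)\) and prior sum of squares \(\nu_0\sigma^2_0\) for \(\sigma^2\), the code below computes the conjugate updates

\[
B_1 = (B_0^{-1} + X'X)^{-1}, \qquad
b_1 = B_1\,(B_0^{-1} b_0 + X'y),
\]
\[
\nu_1 = \nu_0 + (n - p), \qquad
\nu_1 \sigma^2_1 = \nu_0\sigma^2_0 + (y - X b_1)'y + (b_0 - b_1)'B_0^{-1} b_0,
\]

so the marginal posterior of \(\beta\) is Student-\(t\) with \(\nu_1\) degrees of freedom, location \(b_1\) and scale \(\sigma^2_1 B_1\).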

b0       = rep(0,p)
B0       = diag(9,p)
nu0      = 5
sig20    = 1.0
nu0sig20 = nu0*sig20
iB0      = solve(B0)
iB0b0    = iB0%*%b0
B1       = solve(iB0+t(X)%*%X)
tcB1     = t(chol(B1))
b1       = B1%*%(iB0b0+t(X)%*%y)
nu1      = nu0 + (n-p)
sig21    = ((nu0sig20+sum((y-X%*%b1)*y)+t(b0-b1)%*%iB0b0)/nu1)[1]
se1      = sqrt(sig21*diag(B1))
logpredy = logpredictive(y,X,b0,B0,nu0,sig20)
L = b1+qt(0.025,df=nu1)*se1   # lower 95% credible limit
U = b1+qt(0.975,df=nu1)*se1   # upper 95% credible limit
tvalue = abs(b1)/se1
pvalue = 2*(1-pt(tvalue,df=nu1))

tab.bayes = cbind(round(b1,5),round(se1,5),
round(tvalue,3),round(pvalue,5))
colnames(tab.bayes) = c("Estimate","Std.Error","t-value","Pr(>|t|)")

tab.bayes
##         Estimate Std.Error t-value Pr(>|t|)
## lstat   -0.35339   0.03749   9.426  0.00000
## rm       0.32670   0.03051  10.708  0.00000
## b        0.10775   0.02650   4.066  0.00006
## ptratio -0.18396   0.03039   6.053  0.00000
## lon     -0.13122   0.02666   4.922  0.00000
## lat      0.04836   0.02419   1.999  0.04616
## indus   -0.09372   0.04171   2.247  0.02509
## rad     -0.01966   0.03610   0.545  0.58619
## nox     -0.02384   0.04289   0.556  0.57853
## chas     0.01482   0.02385   0.621  0.53465

Bayesian inference for each one of the \(2^{p}-1\) models

Here we consider all non-empty subsets of the \(p = 10\) predictors. Since the response and the covariates have been standardized, every model is fit without an intercept, and the empty model is excluded.

k       = 2^p
model   = matrix(0,k-1,p)
ind     = 1:p
logpred = rep(0,k-1)
nu0     = 5
sig20   = 1.0
l       = 0
# ten nested loops enumerate every 0/1 inclusion vector (i1,...,i10)
for (i1 in 1:0)
for (i2 in 1:0)
for (i3 in 1:0)
for (i4 in 1:0)
for (i5 in 1:0)
for (i6 in 1:0)
for (i7 in 1:0)
for (i8 in 1:0)
for (i9 in 1:0)
for (i10 in 1:0){
  l = l + 1 
  if (l<k){
   model[l,] = c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10)
   p1 = sum(model[l,])
   X1 = matrix(X[,ind[model[l,]==1]],n,p1)
   b0 = rep(0,p1)
   B0 = diag(9,p1)
   logpred[l] = logpredictive(y,X1,b0,B0,nu0,sig20)
  }
}
nreg = apply(model,1,sum)
tab.bayes = cbind(model,nreg,logpred)
tab.bayes[1:20,]
##                           nreg   logpred
##  [1,] 1 1 1 1 1 1 1 1 1 1   10 -398.4742
##  [2,] 1 1 1 1 1 1 1 1 1 0    9 -394.5118
##  [3,] 1 1 1 1 1 1 1 1 0 1    9 -395.0592
##  [4,] 1 1 1 1 1 1 1 1 0 0    8 -391.0438
##  [5,] 1 1 1 1 1 1 1 0 1 1    9 -394.8806
##  [6,] 1 1 1 1 1 1 1 0 1 0    8 -390.9475
##  [7,] 1 1 1 1 1 1 1 0 0 1    8 -391.5660
##  [8,] 1 1 1 1 1 1 1 0 0 0    7 -387.5690
##  [9,] 1 1 1 1 1 1 0 1 1 1    9 -397.4370
## [10,] 1 1 1 1 1 1 0 1 1 0    8 -393.5044
## [11,] 1 1 1 1 1 1 0 1 0 1    8 -396.4628
## [12,] 1 1 1 1 1 1 0 1 0 0    7 -392.3354
## [13,] 1 1 1 1 1 1 0 0 1 1    8 -393.8890
## [14,] 1 1 1 1 1 1 0 0 1 0    7 -389.9935
## [15,] 1 1 1 1 1 1 0 0 0 1    7 -394.4709
## [16,] 1 1 1 1 1 1 0 0 0 0    6 -390.3542
## [17,] 1 1 1 1 1 0 1 1 1 1    9 -396.3601
## [18,] 1 1 1 1 1 0 1 1 1 0    8 -392.3603
## [19,] 1 1 1 1 1 0 1 1 0 1    8 -392.9659
## [20,] 1 1 1 1 1 0 1 1 0 0    7 -388.9145
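As an aside, the ten nested loops above just enumerate every 0/1 inclusion vector. A more compact, equivalent construction (a sketch; expand.grid varies its first factor fastest, so reversing the columns reproduces the loop order):

grid = as.matrix(expand.grid(rep(list(1:0),p)))[,p:1]  # all 2^p inclusion vectors
grid = grid[-nrow(grid),]                              # drop the all-zero (empty) model
all(grid==model)                                       # should be TRUE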
plot(nreg,logpred,xlab="Number of regressors",ylab="Log predictive")
legend("bottomright",legend=paste(Xnames[tab.bayes[logpred==max(logpred),1:p]==1]))

Posterior model probability
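Implicitly assuming equal prior probabilities for the \(2^p - 1\) models, each posterior model probability is the model's marginal likelihood normalized over all models:

\[
PMP(M_j) = \frac{p(y \mid M_j)}{\sum_k p(y \mid M_k)}.
\]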

PMP = exp(logpred-max(logpred))   # subtract the maximum before exponentiating to guard against underflow
PMP = PMP/sum(PMP)                # the normalized probabilities are unchanged by the shift

plot(nreg,PMP)

tab.bayes = cbind(tab.bayes,PMP)

ord = order(PMP,decreasing=TRUE)

tab.bayes = tab.bayes[ord,]

tab.bayes = cbind(tab.bayes,cumsum(tab.bayes[,p+3]))

tab.bayes[tab.bayes[,p+4]<0.99,]
##                           nreg   logpred         PMP          
##  [1,] 1 1 1 1 1 0 1 0 0 0    6 -385.9380 0.644334765 0.6443348
##  [2,] 1 1 1 1 1 1 1 0 0 0    7 -387.5690 0.126112726 0.7704475
##  [3,] 1 1 1 1 1 0 0 0 1 0    6 -388.0284 0.079661435 0.8501089
##  [4,] 1 1 1 1 1 0 1 1 0 0    7 -388.9145 0.032840131 0.8829491
##  [5,] 1 1 1 1 1 0 0 0 0 0    5 -389.1183 0.026785070 0.9097341
##  [6,] 1 1 1 1 1 0 1 0 1 0    7 -389.1333 0.026387458 0.9361216
##  [7,] 1 1 1 1 1 0 1 0 0 1    7 -389.9666 0.011468109 0.9475897
##  [8,] 1 1 1 1 1 1 0 0 1 0    7 -389.9935 0.011163289 0.9587530
##  [9,] 1 1 1 1 1 0 0 1 0 0    6 -390.0158 0.010917114 0.9696701
## [10,] 1 1 1 1 1 1 0 0 0 0    6 -390.3542 0.007783398 0.9774535
## [11,] 1 1 1 1 1 1 1 0 1 0    8 -390.9475 0.004300264 0.9817538
## [12,] 1 1 1 1 1 1 1 1 0 0    8 -391.0438 0.003905589 0.9856593
## [13,] 1 1 1 1 1 0 0 1 1 0    7 -391.1861 0.003387563 0.9890469
colnames(tab.bayes) = c(Xnames,"nreg","logpred","PMP","cum PMP")
round(tab.bayes[1:10,],3)
##       lstat rm b ptratio lon lat indus rad nox chas nreg  logpred   PMP cum PMP
##  [1,]     1  1 1       1   1   0     1   0   0    0    6 -385.938 0.644   0.644
##  [2,]     1  1 1       1   1   1     1   0   0    0    7 -387.569 0.126   0.770
##  [3,]     1  1 1       1   1   0     0   0   1    0    6 -388.028 0.080   0.850
##  [4,]     1  1 1       1   1   0     1   1   0    0    7 -388.915 0.033   0.883
##  [5,]     1  1 1       1   1   0     0   0   0    0    5 -389.118 0.027   0.910
##  [6,]     1  1 1       1   1   0     1   0   1    0    7 -389.133 0.026   0.936
##  [7,]     1  1 1       1   1   0     1   0   0    1    7 -389.967 0.011   0.948
##  [8,]     1  1 1       1   1   1     0   0   1    0    7 -389.994 0.011   0.959
##  [9,]     1  1 1       1   1   0     0   1   0    0    6 -390.016 0.011   0.970
## [10,]     1  1 1       1   1   1     0   0   0    0    6 -390.354 0.008   0.977

Splitting training and testing samples

We repeat the 75%/25% training/testing split 100 times.
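For each split, logpred1 stores the training-sample log predictive \(\log p(y_{train} \mid M)\) under the prior, while logpostpred1 scores the test sample under the posterior predictive given the training data,

\[
\log p(y_{test} \mid y_{train}, M) =
\log t_{\nu_1}\!\left(y_{test} \mid X_{test}\, b_1,\ \sigma^2_1 (I_{n_2} + X_{test} B_1 X_{test}')\right),
\]

where \((b_1, B_1, \nu_1, \sigma^2_1)\) are the conjugate updates computed from the training sample.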

set.seed(12353)
Rep     = 100
n1      = trunc(0.75*n)
n2      = n-n1
k       = 2^p
model   = matrix(0,k-1,p)
ind     = 1:p
nu0     = 5
sig20   = 1.0
logpred1 = matrix(0,Rep,k-1)
logpostpred1 = matrix(0,Rep,k-1)
for (rep in 1:Rep){
  frac    = sample(1:n,size=n1,replace=FALSE)
  Xtrain  = X[frac,]
  ytrain  = y[frac]
  Xtest   = X[-frac,]
  ytest   = y[-frac]
  l       = 0
  for (i1 in 1:0)
  for (i2 in 1:0)
  for (i3 in 1:0)
  for (i4 in 1:0)
  for (i5 in 1:0)
  for (i6 in 1:0)
  for (i7 in 1:0)
  for (i8 in 1:0)
  for (i9 in 1:0)
  for (i10 in 1:0){
    l = l + 1 
    if (l<k){
      model[l,] = c(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10)
      p1 = sum(model[l,])
      X1 = matrix(Xtrain[,ind[model[l,]==1]],n1,p1)
      b0 = rep(0,p1)
      B0 = diag(9,p1)
      nu0sig20 = nu0*sig20
      iB0      = solve(B0)
      iB0b0    = iB0%*%b0
      B1       = solve(iB0+t(X1)%*%X1)
      tcB1     = t(chol(B1))
      b1       = B1%*%(iB0b0+t(X1)%*%ytrain)
      nu1      = nu0 + (n1-p1)
      sig21    = ((nu0sig20+sum((ytrain-X1%*%b1)*ytrain)+t(b0-b1)%*%iB0b0)/nu1)[1]
      Xtest1   = matrix(Xtest[,ind[model[l,]==1]],n2,p1)
      logpred1[rep,l] = logpredictive(ytrain,X1,b0,B0,nu0,sig20)
      logpostpred1[rep,l] = logpredictive(ytest,Xtest1,b1,B1,nu1,sig21)
    }
  }
}

predmodels     = rep(0,Rep)
postpredmodels = rep(0,Rep)
for (i in 1:Rep){
  predmodels[i]     = which.max(logpred1[i,])     # best model by training prior predictive
  postpredmodels[i] = which.max(logpostpred1[i,]) # best model by test posterior predictive
}
table(predmodels)/Rep
## predmodels
##    8   16   24   28   30   32   62  152 
## 0.01 0.01 0.71 0.01 0.05 0.17 0.02 0.02
table(postpredmodels)/Rep
## postpredmodels
##    3    4    5    6    7    8   12   13   14   16   20   23   24   26   28   29 
## 0.01 0.09 0.02 0.01 0.03 0.10 0.04 0.01 0.05 0.02 0.09 0.02 0.06 0.07 0.01 0.01 
##   37   38   39   40   64   66   84  131  132  133  136  142  143  144  149  150 
## 0.01 0.01 0.02 0.01 0.01 0.01 0.01 0.01 0.03 0.01 0.01 0.01 0.01 0.01 0.01 0.01 
##  151  158  160  165  166  260  263  392  530  532  538  641  644  645  658 
## 0.01 0.01 0.01 0.01 0.01 0.01 0.02 0.01 0.01 0.01 0.02 0.01 0.01 0.01 0.01
plot(table(predmodels)/Rep)

plot(table(postpredmodels)/Rep)

plot(predmodels)

plot(postpredmodels)

c(Xnames[model[24,]==1],sum(predmodels==24)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "indus"   "0.71"
c(Xnames[model[32,]==1],sum(predmodels==32)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "0.17"
c(Xnames[model[8,]==1],sum(postpredmodels==8)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "lat"     "indus"  
## [8] "0.1"
c(Xnames[model[20,]==1],sum(postpredmodels==20)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "indus"   "rad"    
## [8] "0.09"
c(Xnames[model[24,]==1],sum(postpredmodels==24)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "indus"   "0.06"
c(Xnames[model[26,]==1],sum(postpredmodels==26)/Rep)
## [1] "lstat"   "rm"      "b"       "ptratio" "lon"     "rad"     "nox"    
## [8] "0.07"