#===================================================================================================
# BAYESIAN MULTIPLE LINEAR REGRESSION ANALYSIS
#===================================================================================================
#
# Hedibert Freitas Lopes, PhD
# Associate Professor of Econometrics and Statistics
# The University of Chicago Booth School of Business
# 5807 South Woodlawn Avenue
# Chicago, IL, 60637
#
# This version: October 22nd 2009.
# Course: 41913-01 Bayesian Econometrics, Fall 2009.
#===================================================================================================
#
# MODEL: The standard Bayesian approach to multiple linear regression is
#
#    (y|theta,X) ~ N(X*beta,sig2*In)
#
# where y=(y1,...,yn), theta=(beta,sig2), X=(x1,...,xn)' is the (n x q) design matrix and q=p+1.
#
# PRIOR: The prior distribution of theta is NIG(b0,B0,n0,S0), i.e.
#
#    beta|sig2 ~ N(b0,sig2*B0)   and   sig2 ~ IG(n0/2,n0*S0/2)
#
# for known hyperparameters b0, B0, n0 and S0.
#
# PREDICTIVE DISTRIBUTION:
#
#    (y|X) ~ t(n0,X*b0,S0*(In+X*B0*X'))
#
# CONDITIONAL AND MARGINAL POSTERIOR DISTRIBUTIONS:
#
#    (beta|sig2,y,X) ~ N(b1,sig2*B1)
#    (sig2|beta,y,X) ~ IG(n1/2,n1*S11/2)
#
#    (beta|y,X) ~ t(n1,b1,S1*B1)
#    (sig2|y,X) ~ IG(n1/2,n1*S1/2)
#
# where
#
#    iB1*b1 = iB0*b0 + X'y
#    iB1    = iB0 + X'X
#    n1     = n0 + n
#    n1*S1  = n0*S0 + (y-X*b1)'y + (b0-b1)'*iB0*b0
#    n1*S11 = n0*S0 + (y-X*beta)'(y-X*beta)
#
# ORDINARY LEAST SQUARES ANALYSIS:
#
#    betahat = iXtX*X'y
#    sig2hat = Se/(n-q)
#
# where Se=e'e, e=y-X*betahat and iXtX is the inverse of X'X.
#
# The conditional and unconditional sampling distributions of betahat and sig2hat are
#
#    betahat|sig2 ~ N(beta,sig2*iXtX)
#    sig2hat|sig2 ~ G((n-q)/2,(n-q)/(2*sig2)), i.e. (n-q)*sig2hat/sig2 ~ chi2(n-q)
#    betahat      ~ t(n-q,beta,sig2hat*iXtX)
#
#===================================================================================================
rm(list=ls())

# Log density of sig2 when 1/sig2 ~ G(a,b), i.e. sig2 ~ IG(a,b), via change of variables
ldigamma = function(sig2,a,b){
  dgamma(1/sig2,a,b,log=TRUE)-2*log(sig2)
}

# Log density of the q-variate normal N(m,V), given log(det(V)) and solve(V)
ldmnorm = function(x,q,m,ldv,iv){
  -0.5*(q*log(2*pi)+ldv+t(x-m)%*%iv%*%(x-m))
}

# Log density of the multivariate normal N(m,C), with the scalar case handled separately
ldmnorm1 = function(x,m,C){
  if(length(x)==1){
    -0.5*(log(2*pi)+log(C)+(x-m)^2/C)
  }else{
    -0.5*(length(x)*log(2*pi)+log(det(C))+t(x-m)%*%solve(C)%*%(x-m))
  }
}

# Log density of the multivariate Student's t with nu df, location mu and scale matrix V,
# given log(det(V)) and solve(V)
ldmt = function(x,nu,mu,ldv,iv){
  d  = length(x)
  ll = log(nu+t(x-mu)%*%iv%*%(x-mu))
  lgamma((nu+d)/2)-lgamma(nu/2)+0.5*nu*log(nu)-0.5*d*log(pi)-0.5*ldv-0.5*(nu+d)*ll
}
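# --------------------------------------------------------------------------------
# Quick sanity check of the helpers above (added; not part of the original
# analysis). For d=1, zero location and unit scale, ldmt should reduce to R's
# dt(,log=TRUE), and ldmnorm1 should reduce to dnorm(,log=TRUE). The test
# points (0.3, 5 df, etc.) are arbitrary. Enable by switching if(0) to if(1).
# --------------------------------------------------------------------------------
if(0){
  print(c(ldmt(0.3,5,0,0,1),dt(0.3,5,log=TRUE)))                 # should agree
  print(c(ldmnorm1(0.3,0.1,2),dnorm(0.3,0.1,sqrt(2),log=TRUE)))  # should agree
}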
#===================================================================================================
# DATASET 1: 1993-4 U.S. COLLEGE TUITION
# Source: http://www.stat.duke.edu/data-sets/mw/teaching-data/univ-data
#===================================================================================================
# tuition  : College tuition ("out-of-state" rate for those with in-state discount).
# pcttop25 : Percent of new students from the top 25% of high school class.
# sfratio  : Student to faculty ratio.
# faccomp  : Average faculty compensation.
# accrate  : Fraction of applicants accepted for admission.
# graduat  : Percent of students who graduate.
# pctphd   : Percent of faculty with Ph.D.'s.
# fulltime : Percent of undergraduates who are full time students.
# alumni   : Percent of alumni who donate.
# numenrl  : Number of new students enrolled.
# public   : Is the college a public or private institution? public=0, private=1.
#===================================================================================================
if(0){
  names = c("constant","pcttop25","sfratio ","faccomp ","accrate ","graduate","perctphd",
            "fulltime","alumni ","numenrl ","public ")
  data  = read.table("college.txt",header=TRUE)
  n     = nrow(data)
  # Keep only the rows with no missing values
  missing = rep(0,n)
  data1   = NULL
  for (i in 1:n){
    missing[i] = sum(is.na(data[i,]))
    if (missing[i]==0){
      data1 = rbind(data1,data[i,])
    }
  }
  n = nrow(data1)
  y = matrix(data1[,1],n,1)
  q = ncol(data1)
  p = q-1
  X = matrix(1,n,q)
  for (i in 2:q) X[,i] = data1[,i]
  # Random subsample of 100 colleges, keeping the constant, pcttop25, graduate and public
  set.seed(1245)
  ind   = sample(1:n,replace=F,size=100,prob=rep(1/n,n))
  vars  = c(1,2,6,11)
  names = names[vars]
  X = X[ind,vars]
  y = y[ind]
  n = nrow(X)
  q = ncol(X)
  p = q-1
}

#===================================================================================================
# DATASET 2: Data on maximum compressive strength parallel to the grain y[i], the density x[i]
#            and the resin-adjusted density z[i] for n=42 specimens of radiata pine.
# Source: Han and Carlin (2001) MCMC methods for computing Bayes factors: a comparative review.
#         Journal of the American Statistical Association, 96, 1122-1132.
#===================================================================================================
data = matrix(c(1,3040,29.2,25.4,22,3840,30.7,30.7,2,2470,24.7,22.2,23,3800,32.7,32.6,
                3,3610,32.3,32.2,24,4600,32.6,32.5,4,3480,31.3,31.0,25,1900,22.1,20.8,
                5,3810,31.5,30.9,26,2530,25.3,23.1,6,2330,24.5,23.9,27,2920,30.8,29.8,
                7,1800,19.9,19.2,28,4990,38.9,38.1,8,3110,27.3,27.2,29,1670,22.1,21.3,
                9,3160,27.1,26.3,30,3310,29.2,28.5,10,2310,24.0,23.9,31,3450,30.1,29.2,
                11,4360,33.8,33.2,32,3600,31.4,31.4,12,1880,21.5,21.0,33,2850,26.7,25.9,
                13,3670,32.2,29.0,34,1590,22.1,21.4,14,1740,22.5,22.0,35,3770,30.3,29.8,
                15,2250,27.5,23.8,36,3850,32.0,30.6,16,2650,25.6,25.3,37,2480,23.2,22.6,
                17,4970,34.5,34.2,38,3570,30.3,30.3,18,2620,26.2,25.7,39,2620,29.9,23.8,
                19,2900,26.7,26.4,40,1890,20.8,18.4,20,1670,21.1,20.0,41,3030,33.2,29.4,
                21,2540,24.1,23.9,42,3030,28.2,28.2),21,8,byrow=T)
data  = rbind(data[,1:4],data[,5:8])
names = c("constant","density","resin-adjusted density")
y = data[,2]
X = cbind(1,data[,3]-mean(data[,3]),data[,4]-mean(data[,4]))
n = nrow(X)
q = ncol(X)
p = q-1

# --------------------------------------------------------------------------------
# OLS quantities
# --------------------------------------------------------------------------------
Xty     = t(X)%*%y
XtX     = t(X)%*%X
iXtX    = solve(XtX)
bhat    = iXtX%*%Xty
Se      = t(y-X%*%bhat)%*%(y-X%*%bhat)
sig2hat = Se/(n-q)

# --------------------------------------------------------------------------------
# Prior hyperparameters
# --------------------------------------------------------------------------------
b0    = c(3000,185,185)
B0    = diag(c(10^6,10^4,10^4))
iB0   = solve(B0)
iB0b0 = iB0%*%b0
n0    = 6
S0    = 60000
n0S0  = n0*S0

# --------------------------------------------------------------------------------
# All 2^p models include the constant term
# --------------------------------------------------------------------------------
In      = diag(1,n)
l       = 0
nmodels = 2^p
Model   = matrix(0,nmodels,p)
for (i1 in 1:0)
  for (i2 in 1:0){
    l = l+1
    Model[l,] = c(i1,i2)
  }
Model = cbind(1,Model)
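# --------------------------------------------------------------------------------
# Added sketch: the nested loops above are hard-coded for p=2 predictors. For
# arbitrary p, the same set of models (constant always included) can be
# enumerated with expand.grid; note that the row order may differ from the
# loops above. Enable by switching if(0) to if(1).
# --------------------------------------------------------------------------------
if(0){
  Model = cbind(1,as.matrix(expand.grid(rep(list(1:0),p))))
  dimnames(Model) = NULL
  nmodels = nrow(Model)
}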
# --------------------------------------------------------------------------------
# Computing the true value of p(y) under each model, using the predictive
# (y|X1) ~ t(n0,X1*b01,S0*(In+X1*B01*X1'))
# --------------------------------------------------------------------------------
log.pred.true = NULL
for (i in 1:nmodels){
  ind = Model[i,]==1
  q1  = sum(ind)
  X1  = matrix(X[,ind],n,q1)
  B01 = B0[ind,ind]
  b01 = b0[ind]
  v   = S0*(In+X1%*%B01%*%t(X1))
  cv  = chol(v)
  ldv = 2*sum(log(diag(cv)))
  iv  = solve(v)
  log.pred.true = c(log.pred.true,ldmt(y,n0,X1%*%b01,ldv,iv))
}
nvar = apply(Model,1,sum)
ord  = order(log.pred.true,decreasing=TRUE)
top  = Model[ord,]
for (i in 1:nmodels){
  names1 = NULL
  for (j in 1:q)
    names1 = paste(names1,ifelse(top[i,j]==1,names[j]," "),sep=" ")
  print(paste(names1,round(log.pred.true[ord[i]],4),sep=" "))
}
# ---------------------------------------------
# DATASET 1: True marginal likelihoods
# ---------------------------------------------
# constant pcttop25 graduate public  -937.2816
# constant pcttop25          public  -939.8830
# constant          graduate public  -941.1353
# constant pcttop25 graduate         -951.3476
# constant          graduate         -952.4882
# constant                   public  -955.9858
# constant pcttop25                  -961.2662
# constant                           -977.8107
# ---------------------------------------------

# ------------------------------------------------
# DATASET 2: True marginal likelihoods
# ------------------------------------------------
# constant         resin-adjusted density  -312.6480
# constant density resin-adjusted density  -318.7411
# constant density                         -321.5050
# constant                                 -359.1343
# ------------------------------------------------
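# --------------------------------------------------------------------------------
# Added sketch: under a uniform prior over the 2^p models, the exact log
# predictives above translate into posterior model probabilities via
# Pr(Mi|y) = p(y|Mi)/sum_j p(y|Mj). Subtracting the maximum before
# exponentiating (log-sum-exp) avoids underflow, since the log predictives
# are in the -300 to -1000 range.
# --------------------------------------------------------------------------------
post.model = exp(log.pred.true-max(log.pred.true))
post.model = post.model/sum(post.model)
print(cbind(Model,round(post.model,4)))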
# --------------------------------------------------------------------------------
# Computing p(y) based on prior draws, posterior draws and the candidate's formula
# --------------------------------------------------------------------------------
set.seed(1255)
delta = 0.1
nrit  = 10
M     = 1000
nsim  = 10
lp.hm = matrix(0,nsim,nmodels)
lp.mc = matrix(0,nsim,nmodels)
lp.ch = matrix(0,nsim,nmodels)
lp.lm = matrix(0,nsim,nmodels)
lp.nr = matrix(0,nsim,nmodels)
log.pred.true1 = NULL
for (ii in 1:nmodels){
  # Likelihood summaries
  ind   = top[ii,]==1
  q1    = sum(ind)
  X1    = matrix(X[,ind],n,q1)
  X1tX1 = t(X1)%*%X1
  X1ty  = t(X1)%*%y
  # Prior hyperparameters
  B01    = B0[ind,ind]
  b01    = b0[ind]
  iB0b01 = iB0b0[ind]
  iB01   = iB0[ind,ind]
  # Posterior hyperparameters
  B1    = solve(iB01+X1tX1)
  b1    = B1%*%(iB0b01+X1ty)
  n1    = n0 + n
  n1S1  = n0S0 + t(y-X1%*%b1)%*%y + t(b01-b1)%*%iB0b01
  sig21 = (n1S1/2)/(n1/2-1)   # posterior mean of sig2
  # True p(y) based on the candidate's formula p(y)=p(y|th)p(th)/p(th|y), th=(beta,sig2),
  # evaluated at th=(b1,sig21)
  loglike  = sum(dnorm(y,X1%*%b1,sqrt(sig21),log=TRUE))
  logprior = ldmnorm1(b1,b01,sig21[1,1]*B01) + ldigamma(sig21,n0/2,n0S0/2)
  logpost1 = ldmnorm1(b1,b1,sig21[1,1]*B1)
  logpost2 = ldigamma(sig21,n1/2,n1S1/2)
  log.pred.true1 = c(log.pred.true1,loglike+logprior-logpost1-logpost2)
  # Replications
  for (sim in 1:nsim){
    # Prior draws
    sig2p = 1/rgamma(M,n0/2,n0S0/2)
    zp = NULL; for (i in 1:q1){ zp = cbind(zp,rnorm(M,0,sqrt(sig2p))) }
    betap = matrix(b01,M,q1,byrow=T)+zp%*%chol(B01)
    # Posterior draws
    sig2s = 1/rgamma(M,n1/2,n1S1/2)
    zs = NULL; for (i in 1:q1){ zs = cbind(zs,rnorm(M,0,sqrt(sig2s))) }
    betas = matrix(b1,M,q1,byrow=T)+zs%*%chol(B1)
    # Estimating p(y)
    loglikep  = -0.5*n*log(2*pi*sig2p)
    loglikes  = -0.5*n*log(2*pi*sig2s)
    logpriors = rep(0,M)
    posts     = 0.0
    for (i in 1:M){
      SSE = sum((y-X1%*%betas[i,])^2)
      logpriors[i] = ldmnorm1(betas[i,],b01,sig2s[i]*B01) + ldigamma(sig2s[i],n0/2,n0S0/2)
      loglikep[i]  = loglikep[i] - 0.5*sum((y-X1%*%betap[i,])^2)/sig2p[i]
      loglikes[i]  = loglikes[i] - 0.5*SSE/sig2s[i]
      # Rao-Blackwellization: p(sig2|beta,y) = IG((n0+n)/2,(n0*S0+SSE)/2)
      posts = posts + exp(ldigamma(sig21,(n0+n)/2,(n0S0+SSE)/2))
    }
    logposts = loglikes+logpriors
    likes    = exp(loglikes)
    lp.mc[sim,ii] = log(mean(exp(loglikep-max(loglikep))))+max(loglikep)
    lp.hm[sim,ii] = log(1/mean(1/exp(loglikes-max(loglikes))))+max(loglikes)
    lp.ch[sim,ii] = loglike + logprior - logpost1 - log(posts/M)
    # Laplace-Metropolis: th=(beta,sig2) has dimension d=q1+1, hence (d/2)*log(2*pi)
    lp.lm[sim,ii] = max(logposts)+0.5*(q1+1)*log(2*pi)+0.5*log(det(var(cbind(betas,sig2s))))
    # Newton-Raftery iterative estimator, on the raw scale, started at the
    # Laplace-Metropolis estimate
    pynr = exp(lp.lm[sim,ii])
    for (i in 1:nrit){
      summ = delta*pynr+(1-delta)*likes
      pynr = sum(likes/summ)/sum(1/summ)
    }
    lp.nr[sim,ii] = log(pynr)
  }
}

# --------------------------------------------------------------------------------
# Comparing the estimators to the exact log predictives
# --------------------------------------------------------------------------------
par(mfrow=c(1,3))
ts.plot(t(lp.mc),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.mc,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Monte Carlo estimator \n sampling from the prior")
ts.plot(t(lp.hm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.hm,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Harmonic mean estimator \n sampling from the posterior")
ts.plot(t(lp.ch),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.ch,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Chib's estimator \n Rao-Blackwellization of p(sig2|y)")

par(mfrow=c(2,2))
L = min(lp.hm,lp.ch,lp.lm,lp.nr,log.pred.true1)
U = max(lp.hm,lp.ch,lp.lm,lp.nr,log.pred.true1)
ts.plot(t(lp.hm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Harmonic mean estimator \n sampling from the posterior")
ts.plot(t(lp.nr),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title(paste("Newton-Raftery estimator \n delta=",delta," and ",nrit," iterations",sep=""))
ts.plot(t(lp.lm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Laplace-Metropolis estimator")
ts.plot(t(lp.ch),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Chib's estimator \n Rao-Blackwellization of p(sig2|y)")
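# --------------------------------------------------------------------------------
# Added sketch: a numerical companion to the plots above. Root mean squared
# error of each estimator across the nsim replications, per model, against the
# exact log predictive. Smaller is better; the harmonic mean estimator is
# usually the most unstable of the five.
# --------------------------------------------------------------------------------
rmse = function(est,true){
  sqrt(apply((est-matrix(true,nsim,nmodels,byrow=TRUE))^2,2,mean))
}
print(round(rbind(mc=rmse(lp.mc,log.pred.true1),
                  hm=rmse(lp.hm,log.pred.true1),
                  ch=rmse(lp.ch,log.pred.true1),
                  lm=rmse(lp.lm,log.pred.true1),
                  nr=rmse(lp.nr,log.pred.true1)),4))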