Introduction

In this set of notes we will illustrate the power (and limitations) of basic Monte Carlo (MC) methods.

Properties of the OLS estimator

Suppose we observe \(n\) pairs of points \((x_i,y_i)\) and would like to study the behavior of the ordinary least squares (OLS) estimator of \(\beta\) in a simple (through-the-origin) linear Gaussian regression: \[ y_i = \beta x_i + \epsilon_i, \qquad \epsilon_i \sim N(0,\sigma^2), \] where \(\mathrm{cov}(\epsilon_i,\epsilon_j)=0\) for all \(i \neq j\) and \(\sigma^2\) is known. The OLS estimator of \(\beta\) is \[ {\hat \beta}_{ols} = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}. \] You have learned, in your elementary statistics and econometrics courses, that the sampling distribution of \({\hat \beta}_{ols}\) is Gaussian with mean \(\beta\).
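More precisely, conditional on the regressors and with Gaussian errors and known \(\sigma^2\), \[ {\hat \beta}_{ols} \mid x_1,\ldots,x_n \sim N\!\left(\beta,\ \frac{\sigma^2}{\sum_{i=1}^n x_i^2}\right). \] Here is one simulated example: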

set.seed(121345)
n     = 100
beta  = 1
sig   = 0.25
x     = runif(n)
error = rnorm(n,0,sig) 
y     = beta*x+error
plot(x,y)

fit = lm(y~x-1)
summary(fit)
## 
## Call:
## lm(formula = y ~ x - 1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52471 -0.14866  0.02009  0.19172  0.58672 
## 
## Coefficients:
##   Estimate Std. Error t value Pr(>|t|)    
## x  0.97986    0.04414    22.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2335 on 99 degrees of freedom
## Multiple R-squared:  0.8327, Adjusted R-squared:  0.831 
## F-statistic: 492.7 on 1 and 99 DF,  p-value: < 2.2e-16

Let us replicate the exercise

Now, let us replicate the above simulation exercise \(R=1000\) times and look at the actual sampling behavior of \({\hat \beta}_{ols}\) across replications. This is what we commonly call a Monte Carlo exercise.

set.seed(31415)
R   = 1000
ols = rep(0,R)
for (r in 1:R){
  x      = runif(n)
  error  = rnorm(n,0,sig) 
  y      = beta*x+error
  ols[r] = lm(y~x-1)$coef
}
hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="")
title(paste("(n,beta,sig)=(",n,",",beta,",",sig,")",sep=""))
abline(v=beta,col=2,lwd=4)

Let us now get a bit more serious!

Now, let us transform the above exercise into a function of the sample size \(n\), the number of Monte Carlo replications \(R\), the true value of \(\beta\), the value of \(\sigma\), the histogram limits \(L\) and \(U\), and the type of error distribution. The simulated errors \(\{\epsilon_1,\ldots,\epsilon_n\}\) are either Gaussian or Student's \(t\) with 5 degrees of freedom, in both cases scaled by \(\sigma\).

sim.ols = function(n,R,beta,sig,L,U,errortype){
  ols = rep(0,R)
  for (r in 1:R){
    if (errortype==1){
      error = sig*rnorm(n) 
    }
    if (errortype==2){
      error = sig*rt(n,df=5) 
    }
    x      = runif(n)
    y      = beta*x+error
    ols[r] = lm(y~x-1)$coef
  }
  hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="",xlim=c(L,U))
  title(paste("(n,sig)=(",n,",",sig,")",sep=""))
  abline(v=beta,col=2,lwd=2)
  if (errortype==1){
    legend("topleft",legend="Gaussian error",bty="n")
  }
  if (errortype==2){
    legend("topleft",legend="Student's t error",bty="n")
  }
}

Look how compact and clean (and easy to debug!) the script that loops through \(n \in \{20,100\}\) and \(\sigma \in \{0.25,0.5\}\) is.

set.seed(31415)
R    = 1000
beta = 1
par(mfrow=c(2,4))
for (n in c(20,100))
for (sig in c(0.25,0.5)){
  sim = sim.ols(n,R,beta,sig,0,2,1)
  sim = sim.ols(n,R,beta,sig,0,2,2)
}

MC also helps reveal weaknesses of OLS

In the following Monte Carlo exercise, we add a second variable, \(Z\), to the regression model, but do not include it in the regression estimation. This is commonly known as an omitted-variable problem.
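In the design used below, \(x_i\) and \(z_i\) are independent \(U(0,1)\) draws and the coefficient on \(z\) is \(0.1\), so the through-the-origin OLS estimator that ignores \(z\) satisfies \[ {\hat \beta}_{ols} = \beta + 0.1\,\frac{\sum_{i=1}^n x_i z_i}{\sum_{i=1}^n x_i^2} + \frac{\sum_{i=1}^n x_i \epsilon_i}{\sum_{i=1}^n x_i^2} \;\approx\; \beta + 0.1\,\frac{E(xz)}{E(x^2)} = \beta + 0.1 \times \frac{1/4}{1/3} = \beta + 0.075 \] for large \(n\), by the law of large numbers. The histograms should therefore pile up near \(1.075\) rather than at the true value \(\beta=1\).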

sim.ols = function(n,R,beta,sig,L,U,errortype){
  ols = rep(0,R)
  for (r in 1:R){
    if (errortype==1){
      error = sig*rnorm(n) 
    }
    if (errortype==2){
      error = sig*rt(n,df=5) 
    }
    x      = runif(n)
    z      = runif(n)
    y      = beta*x + 0.1*z + error
    ols[r] = lm(y~x-1)$coef
  }
  hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="",xlim=c(L,U))
  title(paste("(n,sig)=(",n,",",sig,")",sep=""))
  abline(v=beta,col=2,lwd=2)
  if (errortype==1){
    legend("topleft",legend="Gaussian error",bty="n")
  }
  if (errortype==2){
    legend("topleft",legend="Student's t error",bty="n")
  }
}

Let us run it and see the anomaly: the Monte Carlo distribution of \({\hat \beta}_{ols}\) is no longer centered at the true \(\beta\).

set.seed(31415)
R    = 1000
beta = 1
par(mfrow=c(2,4))
for (n in c(20,100))
for (sig in c(0.25,0.5)){
  sim = sim.ols(n,R,beta,sig,0,2,1)
  sim = sim.ols(n,R,beta,sig,0,2,2)
}

Monte Carlo Integration (MCI)

MCI is a powerful tool to approximate integrals, such as the following ones. First, let \(X \sim t_5(0,1)\) and \(Y \sim N(0,1)\). We want to compute: \[\begin{eqnarray*} Pr(X>3) &=& \int_{3}^{\infty} f_t(x)dx\\ Pr(Y>3) &=& \int_{3}^{\infty} f_n(y)dy\\ E(X^2) &=& \int_{-\infty}^{\infty} x^2 f_t(x)dx\\ E(Y^2) &=& \int_{-\infty}^{\infty} y^2 f_n(y)dy \end{eqnarray*}\]

The exact values are \(0.01504962\), \(0.001349898\), \(5/3\) and \(1\), respectively.
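In each case the Monte Carlo strategy is the same: draw \(x_1,\ldots,x_M\) independently from the relevant density \(f\) and approximate the integral by a sample average, \[ \int g(x) f(x)\,dx \;\approx\; \frac{1}{M}\sum_{i=1}^{M} g(x_i), \] so that, for instance, \(Pr(X>3) \approx \frac{1}{M}\sum_{i=1}^{M} 1\{x_i>3\}\). Below we track these running averages as \(M\) grows up to \(10{,}000\) draws.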

set.seed(123456)
exact = c(1-pt(3,df=5),1-pnorm(3),5/(5-2),1)
M0 = 100
M  = 10000
ind = M0:M
draw.x = rt(M,df=5)
draw.y = rnorm(M)
I1.x = cumsum(draw.x>3)/(1:M)
I1.y = cumsum(draw.y>3)/(1:M)
I2.x = cumsum(draw.x^2)/(1:M)
I2.y = cumsum(draw.y^2)/(1:M)
MC.approx = cbind(I1.x,I1.y,I2.x,I2.y)

par(mfrow=c(2,2))
ts.plot(MC.approx[ind,1],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[1],col=2)
title("X ~ t_5(0,1) - Pr(X>3)")
ts.plot(MC.approx[ind,2],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[2],col=2)
title("Y ~ N(0,1)   - Pr(Y>3)")
ts.plot(MC.approx[ind,3],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[3],col=2)
title("X ~ t_5(0,1) - E(X^2)")
ts.plot(MC.approx[ind,4],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[4],col=2)
title("Y ~ N(0,1)   - E(Y^2)")

Reducing MCI error

Since the \(t_5(0,1)\) distribution is symmetric around zero, we can use the identity \[ I_1 = Pr(X>3) = \frac{Pr(X>3)+Pr(X<-3)}{2} = I_2. \] Therefore, we can use MCI to approximate either \(I_1\) or \(I_2\).
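The two estimators have the same mean but different Monte Carlo variances. Writing \(p = Pr(X>3)\), a single draw contributes \(1\{x_i>3\}\) to the estimator of \(I_1\) and \(\tfrac{1}{2}1\{|x_i|>3\}\) to the estimator of \(I_2\), so \[ Var\big(1\{X>3\}\big) = p(1-p), \qquad Var\big(\tfrac{1}{2}1\{|X|>3\}\big) = \frac{p(1-2p)}{2}. \] Since \(p \approx 0.015\) is small, the second estimator has roughly half the variance of the first, i.e., a standard-deviation ratio of about \(1/\sqrt{2}\approx 0.71\). The code below verifies this empirically: first by comparing single Monte Carlo paths of the two estimators, then by computing their standard deviations across 100 independent replications.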

set.seed(654321)
exact = 1-pt(3,df=5)
M0  = 100
M   = 10000
ind = M0:M
xd  = rt(M,df=5)
MC1 = cumsum(xd>3)/(1:M)
MC2 = cumsum((xd>3)|(xd<(-3)))/(1:M)/2

par(mfrow=c(1,1))
ts.plot(MC1[ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC1[ind],MC2[ind]))
lines(MC2[ind],col=2)
abline(h=exact[1],col=2)
title("X ~ t_5(0,1)")

MC = array(0,c(2,100,M))
for (i in 1:100){
  xd  = rt(M,df=5)
  MC[1,i,] = cumsum(xd>3)/(1:M)
  MC[2,i,] = cumsum((xd>3)|(xd<(-3)))/(1:M)/2
}
  
SD1 = sqrt(apply(MC[1,,ind],2,var))
SD2 = sqrt(apply(MC[2,,ind],2,var))

par(mfrow=c(2,2))
ts.plot(MC[1,1,ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC[,,ind]))
for (i in 2:100)
  lines(MC[1,i,ind])
ts.plot(MC[2,1,ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC[,,ind]))
for (i in 2:100)
  lines(MC[2,i,ind])

ts.plot(SD1,ylim=range(SD1,SD2),ylab="MC standard deviation",xlab="Monte Carlo sample")
lines(SD2,col=2)
ts.plot(SD2/SD1,ylab="Standard deviation ratio",xlab="Monte Carlo sample")

Another example of an important integral

The Kullback–Leibler divergence, or relative entropy, is a measure of how one probability distribution is different from a second, reference probability distribution. \[ KL(p|q) = \int p(x) \log (p(x)/q(x)) dx = E_p(\log(p/q)) = E_p(\log p)-E_p(\log q) \]
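Because \(KL(p|q)\) is an expectation under \(p\), it is easily approximated by MCI using draws \(x_1,\ldots,x_M\) from \(p\): \[ \widehat{KL}(p|q) = \frac{1}{M}\sum_{i=1}^{M} \log\frac{p(x_i)}{q(x_i)}. \]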

Comparing Gaussian and Student’s \(t\) distributions

set.seed(123654)
M     = 100000
x     = rnorm(M)
nus   = 1:30
kl.pq = rep(0,length(nus))
kl.qp = rep(0,length(nus))
for (nu in nus){
  y = rt(M,df=nu)
  kl.pq[nu] = mean(log(dnorm(x)/dt(x,df=nu)))  # expectation under draws x from the normal
  kl.qp[nu] = mean(log(dt(y,df=nu)/dnorm(y)))  # expectation under draws y from the Student's t
}
par(mfrow=c(1,2))
plot(nus,kl.pq,xlab="Degrees of freedom",ylab="Divergence",main="KL(normal|student's t)")
abline(h=0)
plot(nus,kl.qp,xlab="Degrees of freedom",ylab="Divergence",main="KL(student's t|normal)")
abline(h=0)

Back to the two physicists

Let us go back to the example of the two physicists.
Assume that \(p\) is the posterior of physicist A and that \(q\) is the posterior of physicist B. We also extend the example to accommodate \(n \ge 1\) measurements from the true distribution \(N(\theta_0,\sigma^2)\).
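The code below uses the standard Gaussian conjugate update: with prior \(\theta \sim N(m,\tau^2)\) and measurements \(x_1,\ldots,x_n \overset{iid}{\sim} N(\theta,\sigma_X^2)\), the posterior is \[ \theta \mid x_1,\ldots,x_n \sim N\!\big(w_n\, m + (1-w_n)\,\bar{x}_n,\; w_n\,\tau^2\big), \qquad w_n = \frac{\sigma_X^2/n}{\sigma_X^2/n + \tau^2}, \] where \(\bar{x}_n\) is the mean of the first \(n\) measurements; \(w_n\) corresponds to the vectors pi.A and pi.B in the code.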

# Physicist A
m.A  = 900
sd.A = 20

# Physicist B
m.B  = 800
sd.B = 80

# Model & data
set.seed(1234)
theta0 = 850
sd.X   = 40
n      = 30
x      = rnorm(n,theta0,sd.X)
n      = length(x)

# Posterior Physicist A
pi.A   = sd.X^2/(1:n)/(sd.X^2/(1:n)+sd.A^2)
mean.A = pi.A*m.A + (1-pi.A)*cumsum(x)/(1:n)
var.A  = pi.A*sd.A^2

# Posterior Physicist B
pi.B   = sd.X^2/(1:n)/(sd.X^2/(1:n)+sd.B^2)
mean.B = pi.B*m.B + (1-pi.B)*cumsum(x)/(1:n)
var.B  = pi.B*sd.B^2

mean.A = c(m.A,mean.A)
mean.B = c(m.B,mean.B)
var.A  = c(sd.A^2,var.A)
var.B  = c(sd.B^2,var.B)

cbind(mean.A,mean.B)
##         mean.A   mean.B
##  [1,] 900.0000 800.0000
##  [2,] 880.3435 801.3739
##  [3,] 877.1358 827.9176
##  [4,] 879.4560 848.0591
##  [5,] 864.0455 826.4387
##  [6,] 864.3921 834.1961
##  [7,] 864.9772 839.9634
##  [8,] 861.5256 838.1768
##  [9,] 858.7431 836.9596
## [10,] 856.3337 835.9285
## [11,] 853.3384 833.8280
## [12,] 851.8433 833.5688
## [13,] 849.2321 831.6501
## [14,] 847.4508 830.6916
## [15,] 847.7357 832.2275
## [16,] 849.8748 835.9096
## [17,] 849.6605 836.5053
## [18,] 848.7033 836.1026
## [19,] 847.1056 834.8670
## [20,] 845.7754 833.9135
## [21,] 849.9779 839.4799
## [22,] 850.1933 840.2274
## [23,] 849.4310 839.7845
## [24,] 848.7994 839.4659
## [25,] 849.4988 840.6584
## [26,] 848.5592 839.9294
## [27,] 846.6763 838.1063
## [28,] 847.5252 839.3864
## [29,] 846.3229 838.3127
## [30,] 846.4160 838.6916
## [31,] 845.4203 837.8278
cbind(sqrt(var.A),sqrt(var.B))
##            [,1]      [,2]
##  [1,] 20.000000 80.000000
##  [2,] 17.888544 35.777088
##  [3,] 16.329932 26.666667
##  [4,] 15.118579 22.188008
##  [5,] 14.142136 19.402850
##  [6,] 13.333333 17.457431
##  [7,] 12.649111 16.000000
##  [8,] 12.060454 14.855627
##  [9,] 11.547005 13.926212
## [10,] 11.094004 13.151919
## [11,] 10.690450 12.493901
## [12,] 10.327956 11.925696
## [13,] 10.000000 11.428571
## [14,]  9.701425 10.988845
## [15,]  9.428090 10.596259
## [16,]  9.176629 10.242950
## [17,]  8.944272  9.922779
## [18,]  8.728716  9.630868
## [19,]  8.528029  9.363292
## [20,]  8.340577  9.116846
## [21,]  8.164966  8.888889
## [22,]  8.000000  8.677218
## [23,]  7.844645  8.479983
## [24,]  7.698004  8.295614
## [25,]  7.559289  8.122769
## [26,]  7.427814  7.960298
## [27,]  7.302967  7.807201
## [28,]  7.184212  7.662610
## [29,]  7.071068  7.525767
## [30,]  6.963106  7.396003
## [31,]  6.859943  7.272727

Let us now compare the two posterior distributions by approximating \(KL(\text{post.A}\,|\,\text{post.B})\) with MCI, drawing at each sample size from physicist A's posterior.

set.seed(24681357)
M  = 1000000
kl = rep(0,n+1)
for (i in 1:(n+1)){
  z = rnorm(M,mean.A[i],sqrt(var.A[i]))
  kl[i] = mean(dnorm(z,mean.A[i],sqrt(var.A[i]),log=TRUE))-
          mean(dnorm(z,mean.B[i],sqrt(var.B[i]),log=TRUE))
}
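Since both posteriors are Gaussian, the divergence is also available in closed form, \[ KL\big(N(\mu_1,\sigma_1^2)\,\big|\,N(\mu_2,\sigma_2^2)\big) = \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2 + (\mu_1-\mu_2)^2}{2\sigma_2^2} - \frac{1}{2}, \] so we can check the MCI approximation directly; a minimal sketch using the quantities already computed above:

# Closed-form KL between two Gaussians, as a check on the MCI estimates
kl.exact = 0.5*log(var.B/var.A) + (var.A + (mean.A - mean.B)^2)/(2*var.B) - 0.5
round(cbind(kl, kl.exact), 4)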

par(mfrow=c(1,2))
plot(0:n,var.B/var.A,xlab="Sample size",ylab="Variance ratio (Physicist B over A)")
abline(h=1,col=2)
plot(0:n,kl,xlab="Sample size",ylab="KL(post.A|post.B)")
abline(h=0,col=2)