In this set of notes we will illustrate the power (and limitations) of basic Monte Carlo (MC) methods.
Suppose we observe \(n\) pairs of points \((x_i,y_i)\) and would like to study the behavior of the ordinary least squares (OLS) estimator of \(\beta\) in a simple (through-the-origin) linear Gaussian regression: \[ y_i = \beta x_i + \epsilon_i \qquad \epsilon_i \sim N(0,\sigma^2), \] where \(cov(\epsilon_i,\epsilon_j)=0\) for all \(i \neq j\) and \(\sigma^2\) is known. The OLS estimator of \(\beta\) is \[ {\hat \beta}_{ols} = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}. \] You have learned, in your elementary statistics and econometrics courses, that the sampling distribution of \({\hat \beta}_{ols}\) is Gaussian with mean \(\beta\) and variance \(\sigma^2/\sum_{i=1}^n x_i^2\). Here is one example:
set.seed(121345)
n = 100
beta = 1
sig = 0.25
x = runif(n)
error = rnorm(n,0,sig)
y = beta*x+error
plot(x,y)
fit = lm(y~x-1)
summary(fit)
##
## Call:
## lm(formula = y ~ x - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52471 -0.14866 0.02009 0.19172 0.58672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 0.97986 0.04414 22.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2335 on 99 degrees of freedom
## Multiple R-squared: 0.8327, Adjusted R-squared: 0.831
## F-statistic: 492.7 on 1 and 99 DF, p-value: < 2.2e-16
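As a quick sanity check (a small sketch, assuming the objects x, sig and fit created above are still in the workspace), we can compare the theoretical standard deviation \(\sigma/\sqrt{\sum_i x_i^2}\) with the standard error reported by lm, which replaces the known \(\sigma\) by the residual standard error:
# theoretical sd of the OLS estimator, using the known sigma = 0.25
sig/sqrt(sum(x^2))
# standard error reported by lm, based on the estimated residual sd (0.2335 here)
summary(fit)$coefficients[1,2]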
Now, let us replicate the above simulation exercise \(R=1000\) times and look at the actual sampling distribution of \({\hat \beta}_{ols}\). This is what we commonly call a Monte Carlo exercise.
set.seed(31415)
R = 1000
ols = rep(0,R)
for (r in 1:R){
x = runif(n)
error = rnorm(n,0,sig)
y = beta*x+error
ols[r] = lm(y~x-1)$coef
}
hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="")
title(paste("(n,beta,sig)=(",n,",",beta,",",sig,")",sep=""))
abline(v=beta,col=2,lwd=4)
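A quick numerical summary of the Monte Carlo draws (a small sketch, reusing the vector ols from the loop above): the average of the \(R\) estimates should be close to the true \(\beta = 1\), and their standard deviation close to \(\sigma\sqrt{3/n} \approx 0.043\), since \(E(x_i^2)=1/3\) for \(x_i \sim U(0,1)\).
mean(ols)       # should be close to beta = 1 (unbiasedness)
sd(ols)         # Monte Carlo standard deviation of the OLS estimates
sig*sqrt(3/n)   # rough theoretical sd, approximating sum(x^2) by n*E(x^2) = n/3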
Now, let us turn the above exercise into a function of the sample size \(n\), the number of replications \(R\), the true value of \(\beta\), the value of \(\sigma\), the histogram limits \(L\) and \(U\), and the error type. The simulated errors \(\{\epsilon_1,\ldots,\epsilon_n\}\) are either standard Gaussian or Student's \(t\) with 5 degrees of freedom, in both cases scaled by \(\sigma\).
sim.ols = function(n,R,beta,sig,L,U,errortype){
ols = rep(0,R)
for (r in 1:R){
if (errortype==1){
error = sig*rnorm(n)
}
if (errortype==2){
error = sig*rt(n,df=5)
}
x = runif(n)
y = beta*x+error
ols[r] = lm(y~x-1)$coef
}
hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="",xlim=c(L,U))
title(paste("(n,sig)=(",n,",",sig,")",sep=""))
abline(v=beta,col=2,lwd=2)
if (errortype==1){
legend("topleft",legend="Gaussian error",bty="n")
}
if (errortype==2){
legend("topleft",legend="Student's t error",bty="n")
}
}
Look how compact and clean (and easy to debug!) the script below is: it loops through \(n \in \{20,100\}\) and \(\sigma \in \{0.25,0.5\}\), for both Gaussian and Student's \(t\) errors.
set.seed(31415)
R = 1000
beta = 1
par(mfrow=c(2,4))
for (n in c(20,100))
for (sig in c(0.25,0.5)){
sim.ols(n,R,beta,sig,0,2,1)
sim.ols(n,R,beta,sig,0,2,2)
}
In the following Monte Carlo exercise, we add a second variable, \(z\), to the data-generating process (with coefficient \(0.1\)), but do not include it in the estimated regression. This is commonly known as an omitted-variable problem.
sim.ols = function(n,R,beta,sig,L,U,errortype){
ols = rep(0,R)
for (r in 1:R){
if (errortype==1){
error = sig*rnorm(n)
}
if (errortype==2){
error = sig*rt(n,df=5)
}
x = runif(n)
z = runif(n)
y = beta*x + 0.1*z + error
ols[r] = lm(y~x-1)$coef
}
hist(ols,prob=TRUE,xlab="OLS estimate",ylab="Density",main="",xlim=c(L,U))
title(paste("(n,sig)=(",n,",",sig,")",sep=""))
abline(v=beta,col=2,lwd=2)
if (errortype==1){
legend("topleft",legend="Gaussian error",bty="n")
}
if (errortype==2){
legend("topleft",legend="Student's t error",bty="n")
}
}
Let us run it and see the anomaly: the histograms are no longer centered at the true value \(\beta=1\).
set.seed(31415)
R = 1000
beta = 1
par(mfrow=c(2,4))
for (n in c(20,100))
for (sig in c(0.25,0.5)){
sim.ols(n,R,beta,sig,0,2,1)
sim.ols(n,R,beta,sig,0,2,2)
}
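The size of the distortion can be anticipated with a short calculation. Since the \(x_i\) and \(z_i\) are independent \(U(0,1)\) draws, \[ {\hat \beta}_{ols} = \beta + 0.1\,\frac{\sum_{i=1}^n x_i z_i}{\sum_{i=1}^n x_i^2} + \frac{\sum_{i=1}^n x_i \epsilon_i}{\sum_{i=1}^n x_i^2} \ \stackrel{p}{\longrightarrow}\ \beta + 0.1\,\frac{E(xz)}{E(x^2)} = \beta + 0.1 \times \frac{1/4}{1/3} = \beta + 0.075, \] so the histograms should be centered near \(1.075\) rather than \(1\). Note that the bias appears even though \(z\) is independent of \(x\): the regression has no intercept, so the nonzero mean of the omitted term \(0.1 z\) is picked up by \(x\).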
Monte Carlo integration (MCI) is a powerful tool to approximate integrals such as the following ones. Let \(X \sim t_5(0,1)\) and \(Y \sim N(0,1)\), with densities \(f_t\) and \(f_n\). We want to compute: \[\begin{eqnarray*} Pr(X>3) &=& \int_{3}^{\infty} f_t(x)dx\\ E(X^2) &=& \int_{-\infty}^{\infty} x^2 f_t(x)dx\\ Pr(Y>3) &=& \int_{3}^{\infty} f_n(y)dy\\ E(Y^2) &=& \int_{-\infty}^{\infty} y^2 f_n(y)dy \end{eqnarray*}\]
The exact values are \(Pr(X>3) = 0.01504962\), \(E(X^2) = 5/3\), \(Pr(Y>3) = 0.001349898\) and \(E(Y^2) = 1\).
set.seed(123456)
exact = c(1-pt(3,df=5),1-pnorm(3),5/(5-2),1)
M0 = 100
M = 10000
ind = M0:M
draw.x = rt(M,df=5)
draw.y = rnorm(M)
I1.x = cumsum(draw.x>3)/(1:M)
I1.y = cumsum(draw.y>3)/(1:M)
I2.x = cumsum(draw.x^2)/(1:M)
I2.y = cumsum(draw.y^2)/(1:M)
MC.approx = cbind(I1.x,I1.y,I2.x,I2.y)
par(mfrow=c(2,2))
ts.plot(MC.approx[ind,1],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[1],col=2)
title("X ~ t_5(0,1) - Pr(X>3)")
ts.plot(MC.approx[ind,2],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[2],col=2)
title("Y ~ N(0,1) - Pr(Y>3)")
ts.plot(MC.approx[ind,3],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[3],col=2)
title("X ~ t_5(0,1) - E(X^2)")
ts.plot(MC.approx[ind,4],xlab="Monte Carlo sample",ylab="Estimate")
abline(h=exact[4],col=2)
title("Y ~ N(0,1) - E(Y^2)")
Since \(X \sim t_5(0,1)\) is symmetric around zero, we can use the identity \[ I_1 = Pr(X>3) = \frac{Pr(X>3)+Pr(X<-3)}{2} = I_2, \] so MCI can approximate the same probability either via \(I_1\) (counting only the right tail) or via \(I_2\) (counting both tails and halving). The exercise below compares the Monte Carlo variability of the two estimators.
set.seed(654321)
exact = 1-pt(3,df=5)
M0 = 100
M = 10000
ind = M0:M
xd = rt(M,df=5)
MC1 = cumsum(xd>3)/(1:M)
MC2 = cumsum((xd>3)|(xd<(-3)))/(1:M)/2
par(mfrow=c(1,1))
ts.plot(MC1[ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC1[ind],MC2[ind]))
lines(MC2[ind],col=2)
abline(h=exact,lty=2)
legend("topright",legend=c("One tail (I1)","Both tails (I2)"),col=1:2,lty=1,bty="n")
title("X ~ t_5(0,1)")
To compare the two estimators more systematically, we now repeat the whole exercise 100 times and track the Monte Carlo standard deviation of each estimator as the number of draws grows.
MC = array(0,c(2,100,M))
for (i in 1:100){
xd = rt(M,df=5)
MC[1,i,] = cumsum(xd>3)/(1:M)
MC[2,i,] = cumsum((xd>3)|(xd<(-3)))/(1:M)/2
}
SD1 = sqrt(apply(MC[1,,ind],2,var))
SD2 = sqrt(apply(MC[2,,ind],2,var))
par(mfrow=c(2,2))
ts.plot(MC[1,1,ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC[,,ind]))
for (i in 2:100)
lines(MC[1,i,ind])
title("One-tailed estimator (I1)")
ts.plot(MC[2,1,ind],xlab="Monte Carlo sample",ylab="Estimate of Pr(X>3)",ylim=range(MC[,,ind]))
for (i in 2:100)
lines(MC[2,i,ind])
title("Two-tailed estimator (I2)")
ts.plot(SD1,ylim=range(SD1,SD2),ylab="MC standard deviation",xlab="Monte Carlo sample")
lines(SD2,col=2)
ts.plot(SD2/SD1,ylab="Standard deviation ratio",xlab="Monte Carlo sample")
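The ratio in the last plot can be anticipated analytically. With \(p = Pr(X>3)\), the one-tailed estimator has variance \(p(1-p)/M\), while the two-tailed estimator averages indicators of the event \(|X|>3\) (which has probability \(2p\)) and divides by two, giving variance \(p(1-2p)/(2M)\). The standard deviation ratio is therefore \[ \sqrt{\frac{1-2p}{2(1-p)}} \approx \frac{1}{\sqrt{2}} \approx 0.70 \] for small \(p\), which is roughly where the SD2/SD1 curve settles. A one-line check:
p = 1-pt(3,df=5)
sqrt((1-2*p)/(2*(1-p)))   # theoretical SD2/SD1 ratio, about 0.70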
The Kullback–Leibler divergence, or relative entropy, is a measure of how one probability distribution \(p\) differs from a second, reference probability distribution \(q\): \[ KL(p|q) = \int p(x) \log (p(x)/q(x)) dx = E_p(\log(p/q)) = E_p(\log p)-E_p(\log q). \] The last expression suggests a simple Monte Carlo approximation: draw \(x_1,\ldots,x_M\) from \(p\) and average \(\log p(x_i) - \log q(x_i)\). Below we compute both \(KL(normal|t_\nu)\) and \(KL(t_\nu|normal)\) for \(\nu = 1,\ldots,30\).
set.seed(123654)
M = 100000
x = rnorm(M)
nus = 1:30
kl.pq = rep(0,length(nus))
kl.qp = rep(0,length(nus))
for (nu in nus){
y = rt(M,df=nu)
kl.pq[nu] = mean(log(dnorm(x)/dt(x,df=nu)))
# the expectation under q = t_nu uses the t draws y (note: KL(t_nu|normal) is infinite for nu <= 2, so those estimates are unstable)
kl.qp[nu] = mean(log(dt(y,df=nu)/dnorm(y)))
}
par(mfrow=c(1,2))
plot(nus,kl.pq,xlab="Degrees of freedom",ylab="Divergence",main="KL(normal|student's t)")
abline(h=0)
plot(nus,kl.qp,xlab="Degrees of freedom",ylab="Divergence",main="KL(student's t|normal)")
abline(h=0)
Let us go back to the example of the two physicists.
Assume that \(p\) is the posterior of physicist A and \(q\) is the posterior of physicist B. We also extend the example to accommodate \(n \geq 1\) measurements from a true distribution \(N(\theta_0,\sigma^2)\), here with \(\theta_0 = 850\), \(\sigma = 40\) and \(n = 30\).
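For reference, the script below implements the standard normal–normal conjugate update that both posteriors follow. With prior \(\theta \sim N(m,\tau^2)\), known measurement variance \(\sigma^2\) and sample mean \({\bar x}_k\) of the first \(k\) observations, the posterior is \[ \theta \mid x_{1:k} \sim N\big(w_k m + (1-w_k){\bar x}_k,\ w_k \tau^2\big), \qquad w_k = \frac{\sigma^2/k}{\sigma^2/k + \tau^2}, \] where \(w_k\) is the weight pi.A (respectively pi.B) computed in the code, with \((m,\tau)=(900,20)\) for physicist A and \((800,80)\) for physicist B.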
# Physicist A
m.A = 900
sd.A = 20
# Physicist B
m.B = 800
sd.B = 80
# Model & data
set.seed(1234)
theta0 = 850
sd.X = 40
n = 30
x = rnorm(n,theta0,sd.X)
n = length(x)
# Posterior Physicist A
pi.A = sd.X^2/(1:n)/(sd.X^2/(1:n)+sd.A^2)
mean.A = pi.A*m.A + (1-pi.A)*cumsum(x)/(1:n)
var.A = pi.A*sd.A^2
# Posterior Physicist B
pi.B = sd.X^2/(1:n)/(sd.X^2/(1:n)+sd.B^2)
mean.B = pi.B*m.B + (1-pi.B)*cumsum(x)/(1:n)
var.B = pi.B*sd.B^2
mean.A = c(m.A,mean.A)
mean.B = c(m.B,mean.B)
var.A = c(sd.A^2,var.A)
var.B = c(sd.B^2,var.B)
cbind(mean.A,mean.B)
## mean.A mean.B
## [1,] 900.0000 800.0000
## [2,] 880.3435 801.3739
## [3,] 877.1358 827.9176
## [4,] 879.4560 848.0591
## [5,] 864.0455 826.4387
## [6,] 864.3921 834.1961
## [7,] 864.9772 839.9634
## [8,] 861.5256 838.1768
## [9,] 858.7431 836.9596
## [10,] 856.3337 835.9285
## [11,] 853.3384 833.8280
## [12,] 851.8433 833.5688
## [13,] 849.2321 831.6501
## [14,] 847.4508 830.6916
## [15,] 847.7357 832.2275
## [16,] 849.8748 835.9096
## [17,] 849.6605 836.5053
## [18,] 848.7033 836.1026
## [19,] 847.1056 834.8670
## [20,] 845.7754 833.9135
## [21,] 849.9779 839.4799
## [22,] 850.1933 840.2274
## [23,] 849.4310 839.7845
## [24,] 848.7994 839.4659
## [25,] 849.4988 840.6584
## [26,] 848.5592 839.9294
## [27,] 846.6763 838.1063
## [28,] 847.5252 839.3864
## [29,] 846.3229 838.3127
## [30,] 846.4160 838.6916
## [31,] 845.4203 837.8278
cbind(sqrt(var.A),sqrt(var.B))
## [,1] [,2]
## [1,] 20.000000 80.000000
## [2,] 17.888544 35.777088
## [3,] 16.329932 26.666667
## [4,] 15.118579 22.188008
## [5,] 14.142136 19.402850
## [6,] 13.333333 17.457431
## [7,] 12.649111 16.000000
## [8,] 12.060454 14.855627
## [9,] 11.547005 13.926212
## [10,] 11.094004 13.151919
## [11,] 10.690450 12.493901
## [12,] 10.327956 11.925696
## [13,] 10.000000 11.428571
## [14,] 9.701425 10.988845
## [15,] 9.428090 10.596259
## [16,] 9.176629 10.242950
## [17,] 8.944272 9.922779
## [18,] 8.728716 9.630868
## [19,] 8.528029 9.363292
## [20,] 8.340577 9.116846
## [21,] 8.164966 8.888889
## [22,] 8.000000 8.677218
## [23,] 7.844645 8.479983
## [24,] 7.698004 8.295614
## [25,] 7.559289 8.122769
## [26,] 7.427814 7.960298
## [27,] 7.302967 7.807201
## [28,] 7.184212 7.662610
## [29,] 7.071068 7.525767
## [30,] 6.963106 7.396003
## [31,] 6.859943 7.272727
Let us compare the two posterior distributions, for each sample size, via the Kullback–Leibler divergence \(KL(post.A|post.B)\), approximated by Monte Carlo with draws from physicist A's posterior.
set.seed(24681357)
M = 1000000
kl = rep(0,n+1)
for (i in 1:(n+1)){
z = rnorm(M,mean.A[i],sqrt(var.A[i]))
kl[i] = mean(dnorm(z,mean.A[i],sqrt(var.A[i]),log=TRUE))-
mean(dnorm(z,mean.B[i],sqrt(var.B[i]),log=TRUE))
}
par(mfrow=c(1,2))
plot(0:n,var.B/var.A,xlab="Sample size",ylab="Variance ratio (Physicist B over A)")
abline(h=1,col=2)
plot(0:n,kl,xlab="Sample size",ylab="KL(post.A|post.B)")
abline(h=0,col=2)
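Since both posteriors are Gaussian, the KL divergence is also available in closed form, \[ KL\big(N(\mu_1,\sigma_1^2)\,|\,N(\mu_2,\sigma_2^2)\big) = \log\frac{\sigma_2}{\sigma_1} + \frac{\sigma_1^2+(\mu_1-\mu_2)^2}{2\sigma_2^2} - \frac{1}{2}, \] which gives a direct check on the Monte Carlo approximation. A small sketch, reusing the posterior moments computed above:
# exact KL between the two Gaussian posteriors, for comparison with the MC vector kl
kl.exact = log(sqrt(var.B/var.A)) + (var.A+(mean.A-mean.B)^2)/(2*var.B) - 0.5
round(cbind(kl,kl.exact),4)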