1 The Michaelis-Menten nonlinear model

In biochemistry, Michaelis–Menten kinetics, named after Leonor Michaelis and Maud Menten, is the simplest case of enzyme kinetics, applied to enzyme-catalysed reactions involving the transformation of one substrate into one product. It takes the form of a differential equation describing the reaction rate \(y\) (μmol/min:Micromoles per minute), as a function of \(x\), the concentration of the substrate (mM:millimolar) - Source: https://en.wikipedia.org/wiki/Michaelis-Menten_kinetics.

The dataset below is from an enzyme kinetics experiment, which will be modeled as \[ y_i = \frac{\alpha x_i}{\beta+x_i} + \epsilon_i \qquad\qquad \epsilon_i \sim N(0,\sigma^2), \] where \(\alpha\) is the maximum reaction rate and \(\beta\) is the Michaelis constant.

#install.packages("mvtnorm")
library("mvtnorm")

# Enzyme kinetics data: reaction rate (micromol/min) observed at eight
# substrate concentrations (mM).
substrate = c(0.5, 1.0, 2.0, 3.0, 5.0, 7.0, 10.0, 15.0)
rate      = c(0.26, 0.50, 0.85, 1.10, 1.40, 1.55, 1.65, 1.75)
# Generic aliases used throughout the analysis: x = predictor, y = response.
x = substrate
y = rate
n = length(x)    # sample size (8)

1.1 Exploratory data analysis

# Scatterplot of rate versus concentration: the rate rises quickly at low
# concentrations and then saturates -- the Michaelis-Menten signature shape.
par(mfrow=c(1,1))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)")

Here we try to figure out \(\alpha\) and \(\beta\) by running regressions through the origin of the form \[ y_i = \alpha z_i + \epsilon_i \] where \(z_i=x_i/(\beta+x_i)\), for each value of \(\beta\) on a grid in the interval \((0,5)\), and then choosing the \(\beta\) that minimizes the residual sum of squares, \(RSS(\beta)\).

# Grid search for beta: for each candidate, regress y on z = x/(beta+x)
# through the origin and record the mean squared residual (same argmin as RSS).
nn  = 100
bs  = seq(0,5,length=nn)
rss = rep(0,nn)
for (i in seq_len(nn)){
  x1 = x/(bs[i]+x)
  # residuals() avoids the partial-matching footgun of $res
  rss[i] = mean(residuals(lm(y~x1-1))^2)
}

# which.min() returns a single index even under numerical ties, unlike the
# logical subset bs[rss == min(rss)], which can yield a vector.
beta.eda  = bs[which.min(rss)]
x1        = x/(beta.eda+x)
alpha.eda = coef(lm(y~x1-1))
alpha.eda
##       x1 
## 2.153489
beta.eda
## [1] 2.979798
par(mfrow=c(1,2))
plot(bs,rss,xlab=expression(beta),ylab="Residual sum of squares",pch=16,cex=0.5)
abline(v=beta.eda)
title(paste("alpha=",round(alpha.eda,3)," - beta=",round(beta.eda,3),sep=""))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)",pch=16)
xx = seq(min(x),max(x),length=100)
lines(xx,alpha.eda*xx/(beta.eda+xx),col=2)
points(x,alpha.eda*x/(beta.eda+x),col=2,pch=16)

2 Nonlinear least squares estimation

Let us plot the likelihood function on a grid for \((\alpha,\beta)\).

# Evaluate the Gaussian log-likelihood of (alpha, beta) on a 200 x 200 grid,
# holding the error standard deviation fixed at the nls estimate below.
sigma   = 0.04305
ngrid   = 200
alpha.g = seq(1, 3, length = ngrid)
beta.g  = seq(0, 6, length = ngrid)
like    = matrix(0, ngrid, ngrid)
for (i in seq_len(ngrid)) {
  for (j in seq_len(ngrid)) {
    mu        = alpha.g[i]*x/(beta.g[j]+x)
    like[i,j] = sum(dnorm(y, mu, sigma, log = TRUE))
  }
}

# Exponentiate for contouring (unnormalized likelihood surface).
contour(alpha.g,beta.g,exp(like),xlab=expression(alpha),ylab=expression(beta),drawlabels=FALSE)
title("Likelihood function")

2.1 Nonlinear fit via function nls

# Fit the Michaelis-Menten curve by nonlinear least squares, using the
# grid-search (EDA) values as starting points.
fit.nls = nls(y~alpha*x/(beta+x),start=list(alpha=alpha.eda,beta=beta.eda))

summary(fit.nls)
## 
## Formula: y ~ alpha * x/(beta + x)
## 
## Parameters:
##       Estimate Std. Error t value Pr(>|t|)    
## alpha  2.15547    0.06062   35.56 3.30e-08 ***
## beta   2.98860    0.24264   12.32 1.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04305 on 6 degrees of freedom
## 
## Number of iterations to convergence: 3 
## Achieved convergence tolerance: 6.585e-06
# Point estimates and standard errors; these feed the SIR and Metropolis
# proposal densities below.
beta.nls  = summary(fit.nls)$coef[2]
alpha.nls = summary(fit.nls)$coef[1]
sigma.nls = summary(fit.nls)$sigma
# Reconstructs the reported Std. Errors: sigma_hat * sqrt(diag(cov.unscaled)).
se = sigma.nls*sqrt(diag(summary(fit.nls)$cov.unscaled))
sa.nls = se[1]
sb.nls = se[2]
c(alpha.nls,beta.nls)
## [1] 2.15547 2.98860
c(sa.nls,sb.nls)
##      alpha       beta 
## 0.06062228 0.24264405

3 Bayesian estimation

Let us rewrite the likelihood function for clarity \[ L(\alpha,\beta|\mbox{data}) \propto \exp\left\{-\frac{1}{2\sigma^2}\sum_{i=1}^n \left(y_i-\frac{\alpha x_i}{\beta + x_i}\right)^2\right\}. \] We will assume that the prior for \((\alpha,\beta)\) is \[ p(\alpha,\beta)=p_N(\alpha|\mu_\alpha,\sigma_\alpha^2)p(\beta|\mu_\beta,\sigma_\beta^2), \] where \(p_N(\cdot|\theta,\tau^2)\) is the density of Gaussian distribution with mean \(\theta\) and variance \(\tau^2\), Here we assume that \(\mu_\alpha=2\), \(\sigma_\alpha^2=1\), \(\mu_\beta=3\) and \(\sigma_\beta^2=1\).

# Hyperparameters of the independent Gaussian priors p(alpha) and p(beta).
mua   = 2
mub   = 3
siga  = 1
sigb  = 1
# Log prior and log posterior (up to constants) on the likelihood grid.
prior = matrix(0,200,200)
post  = matrix(0,200,200)
for (i in 1:200){
  lpa = dnorm(alpha.g[i],mua,siga,log=TRUE)   # alpha term, constant in j
  for (j in 1:200){
    prior[i,j] = lpa + dnorm(beta.g[j],mub,sigb,log=TRUE)
    post[i,j]  = like[i,j] + prior[i,j]
  }
}
contour(alpha.g,beta.g,exp(prior),xlab=expression(alpha),ylab=expression(beta),drawlabels=FALSE)
contour(alpha.g,beta.g,exp(post),drawlabels=FALSE,col=2,add=TRUE)
legend("topleft",legend=c("Prior","Posterior"),col=1:2,lwd=2,bty="n",lty=1)

3.1 Sampling importance resampling (SIR)

Here, we will use the output of the previous nonlinear least squares estimation to feed the hyperparameters for the proposal densities: \[ q(\alpha,\beta) = q(\alpha)q(\beta) \equiv p_N(\alpha|\alpha_{nls},9\sigma_{\alpha,nls}^2) p_N(\beta|\beta_{nls},9\sigma_{\beta,nls}^2). \]

# SIR proposal hyperparameters: Gaussians centered at the nls estimates with
# standard deviations inflated by 3 (variance 9x) to overdisperse the proposal.
M     = 10000
ma    = alpha.nls
sa    = 3*sa.nls
mb    = beta.nls
sb    = 3*sb.nls


set.seed(12345)
alphas = rnorm(M,ma,sa)
betas  = rnorm(M,mb,sb)
w      = rep(0,M)
# Log importance weights: log-likelihood + log-prior - log-proposal.
for (i in 1:M)
w[i] = sum(dnorm(y,alphas[i]*x/(betas[i]+x),sigma,log=TRUE))+
       dnorm(alphas[i],mua,siga,log=TRUE)+
       dnorm(betas[i],mub,sigb,log=TRUE)-
       dnorm(alphas[i],ma,sa,log=TRUE)-
       dnorm(betas[i],mb,sb,log=TRUE)
# BUG FIX: the beta prior term above was previously evaluated WITHOUT log=TRUE,
# mixing a raw density into a sum of log densities and corrupting the weights.
w = exp(w-max(w))   # subtract the max before exponentiating for stability
ind = sample(1:M,size=M,replace=TRUE,prob=w)   # resampling step
alphas1 = alphas[ind]
betas1 = betas[ind]
draws.sir = cbind(alphas1,betas1)

par(mfrow=c(1,1))
contour(alpha.g,beta.g,exp(prior),xlab=expression(alpha),ylab=expression(beta),
        drawlabels=FALSE,col=2)
points(alphas,betas,pch=16)
points(draws.sir[,1],draws.sir[,2],pch=16,col=3)
contour(alpha.g,beta.g,exp(post),add=TRUE,col=4)
legend("topleft",legend=c("Prior density (grid)","Proposal draws","Posterior (grid)",
       "Posterior draws (SIR)"),col=c(2,1,4,3),pch=16,bty="n")

3.2 MCMC1

In this first MCMC scheme, we use a random-walk Metropolis step for \(\beta\) and a Gibbs sampler step for \(\alpha\). The Gibbs sampler for \(\alpha\) takes advantage of the fact that conditional on \(\beta\) the nonlinear model becomes a linear model as far as \(\alpha\) is concerned so the full conditional \(p(\alpha|\beta,\mbox{data})\) is an easily derived Gaussian distribution.

  • Random walk Metropolis step: \(\beta \sim N(\beta^{(i)},v_\beta^2)\)

  • Gibbs sampler step: \(\alpha|\beta,\mbox{data} \sim N(\gamma,\omega^2)\), where \[ \omega^2 = \left( \frac{1}{\sigma_\alpha^2} + \frac{\sum_{i=1}^n z_i^2}{\sigma^2} \right)^{-1} \ \ \ \mbox{and} \ \ \ \gamma = \omega^2\left( \frac{\mu_\alpha}{\sigma_\alpha^2} + \frac{\sum_{i=1}^n y_iz_i}{\sigma^2}\right), \] where \(z_i = x_i/(\beta+x_i)\), for \(i=1,\ldots,n\).

# MCMC setup
M0    = 10000    # burn-in iterations
M     = 20000    # retained draws
L     = 1        # thinning interval
niter = M0+M*L
v.b   = 0.5      # random-walk proposal sd for beta
beta  = beta.nls # initialize the chain at the nls estimate
draws = matrix(0,niter,2)
for (iter in 1:niter){
  # sampling alpha (Gibbs step): conditional on beta the model is linear in
  # alpha, so p(alpha|beta,data) is Gaussian with the moments below.
  z     = x/(beta+x)
  var   = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean  = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha = rnorm(1,mean,sqrt(var))
  # sampling beta (random-walk Metropolis step): the Gaussian proposal is
  # symmetric, so the log acceptance ratio is just log prior + log likelihood
  # at the proposed beta1 minus the same at the current beta.
  beta1     = rnorm(1,beta,v.b)
  logaccept = min(0,dnorm(beta1,mub,sigb,log=TRUE)+sum(dnorm(y,alpha*x/(beta1+x),sigma,log=TRUE))-
                    dnorm(beta,mub,sigb,log=TRUE)-sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE)))
  if (log(runif(1))<logaccept){
    beta = beta1
  }
  draws[iter,] = c(alpha,beta)
}
# Discard burn-in and thin (L=1 keeps every post-burn-in draw).
draws.mcmc1 = draws[seq(M0+1,niter,by=L),]


# Trace plot, autocorrelation function and histogram for each parameter.
names = c("alpha","beta")
par(mfrow=c(2,3))
for (i in 1:2){
  ts.plot(draws.mcmc1[,i],ylab="",main=names[i])
  acf(draws.mcmc1[,i],main="")
  hist(draws.mcmc1[,i],xlab="",main="",prob=TRUE)
}

3.3 MCMC2

Here, we decided to sample \((\alpha,\beta)\) jointly in the MCMC scheme. In order to do that, we implement an independent Metropolis algorithm for \((\alpha,\beta)\) where the proposal \(q(\alpha,\beta)\) is the same one used in our implementation of the above SIR scheme.

# Independent Metropolis: propose (alpha,beta) jointly from the same Gaussian
# proposal q used in the SIR scheme; accept with the Metropolis-Hastings ratio.
alpha = alpha.nls
beta  = beta.nls
niter = M0+M*L
draws = matrix(0,niter,2)
for (iter in 1:niter){
  alpha1 = rnorm(1,ma,sa)
  beta1  = rnorm(1,mb,sb)
  # log posterior (up to a constant) at the proposal and at the current state
  post.n = dnorm(alpha1,mua,siga,log=TRUE)+dnorm(beta1,mub,sigb,log=TRUE)+
           sum(dnorm(y,alpha1*x/(beta1+x),sigma,log=TRUE))
  post.d = dnorm(alpha,mua,siga,log=TRUE)+dnorm(beta,mub,sigb,log=TRUE)+
           sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE))
  # log proposal densities. BUG FIX: these were previously computed on the raw
  # density scale (log=TRUE missing) yet combined with log quantities, so the
  # acceptance probability was wrong.
  prop.n = dnorm(alpha,ma,sa,log=TRUE)+dnorm(beta,mb,sb,log=TRUE)
  prop.d = dnorm(alpha1,ma,sa,log=TRUE)+dnorm(beta1,mb,sb,log=TRUE)
  logaccept = min(0,post.n-post.d+prop.n-prop.d)
  if (log(runif(1))<logaccept){
    alpha = alpha1
    beta = beta1
  }
  draws[iter,] = c(alpha,beta)
}
draws.mcmc2 = draws[seq(M0+1,niter,by=L),]

par(mfrow=c(2,3))
for (i in 1:2){
  ts.plot(draws.mcmc2[,i],ylab="",main=names[i])
  acf(draws.mcmc2[,i],main="")
  hist(draws.mcmc2[,i],xlab="",main="",prob=TRUE)
}

3.4 MC

Here, we integrate out \(\alpha\) analytically in order to obtain \(p(\beta|\mbox{data})\). Then we sample \(\beta\) from this marginal posterior distribution via SIR. Then, the \(\beta\) draws are used to directly sample \(\alpha\) from \(p(\alpha|\beta,\mbox{data})\). Therefore, an MCMC is not needed!

  1. Sample \(\{\beta^{(1)},\ldots,\beta^{(M)}\}\) from \(p(\beta|\mbox{data})\) via SIR with proposal \(q(\beta)=p_N(\beta|\beta_{nls},\sigma_{nls}^2)\). To implement SIR we need the integrated likelihood for \(\beta\) given \(y=(y_1,\ldots,y_n)'\): \[ p(y|\beta) = \int_{-\infty}^{\infty} p(y|\alpha,\beta)p(\alpha)d\alpha = p_N(y|\mu_\alpha z,\sigma^2 I_n + \sigma_\alpha^2 zz'), \] where \(z=(z_1,\ldots,z_n)'\) and \(z_i=x_i/(\beta+x_i)\), for \(i=1,\ldots,n\). Notice that, by integrating \(\alpha\) out, the marginal \(p(y|\beta)\) is a multivariate normal whose non-diagonal covariance matrix is a function of the \((n \times n)\) matrix \(zz'\).

  2. Sample \(\alpha^{(i)}\) from \(p(\alpha|\beta^{(i)},\mbox{data})\), similar to the Gibbs sampler step of the above MCMC1 algorithm.

# sampling beta from p(beta|data) via SIR, using the integrated likelihood
# p(y|beta) (alpha marginalized out analytically).
M    = 10000
beta = rnorm(M,mb,sb)
w    = rep(0,M)   # FIX: initialize weights here (code previously relied on
                  # the leftover w vector from the earlier SIR section)
for (i in 1:M){
 xx = x/(beta[i]+x)
 mean = mua*xx                             # E[y|beta], alpha integrated out
 var = siga^2*xx%*%t(xx)+diag(sigma^2,n)   # Cov[y|beta] = sigma^2 I + siga^2 zz'
 w[i] = dmvnorm(y,mean=mean,sigma=var,log=TRUE)+
        dnorm(beta[i],mub,sigb,log=TRUE)-
        dnorm(beta[i],mb,sb,log=TRUE)
}
w     = exp(w-max(w))
beta1 = sample(beta,size=M,replace=TRUE,prob=w)
beta  = beta1

# sampling alpha from p(alpha|beta,data): Gaussian full conditional, as in the
# Gibbs step of MCMC1.
alpha = rep(0,M)
for (i in 1:M){
  z        = x/(beta[i]+x)
  var      = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean     = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha[i] = rnorm(1,mean,sqrt(var))
}
draws.mc = cbind(alpha,beta)

par(mfrow=c(2,2))
for (i in 1:2){
  ts.plot(draws.mc[,i],ylab="",main=names[i])
  hist(draws.mc[,i],xlab="",main="",prob=TRUE)
}

3.5 Comparing MC and MCMC algorithms

# Posterior scatterplots from the four samplers on a common axis range, each
# overlaid with the grid-based posterior contours for visual comparison.
par(mfrow=c(2,2))
xrange = range(draws.sir[,1],draws.mcmc1[,1],draws.mcmc2[,1],draws.mc[,1])
yrange = range(draws.sir[,2],draws.mcmc1[,2],draws.mcmc2[,2],draws.mc[,2])
plot(draws.sir[,1],draws.sir[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("SIR")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mcmc1[,1],draws.mcmc1[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MCMC 1")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mcmc2[,1],draws.mcmc2[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MCMC 2")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mc[,1],draws.mc[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MC")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)

# Marginal posterior densities of alpha and beta from the four samplers.
par(mfrow=c(1,2))
plot(density(draws.sir[,1]),xlab="",main=expression(alpha))
lines(density(draws.mcmc1[,1]),col=2)
lines(density(draws.mcmc2[,1]),col=3)
lines(density(draws.mc[,1]),col=4)
plot(density(draws.sir[,2]),xlab="",main=expression(beta))
lines(density(draws.mcmc1[,2]),col=2)
lines(density(draws.mcmc2[,2]),col=3)
lines(density(draws.mc[,2]),col=4)

# 5%/25%/50%/75%/95% posterior quantiles of alpha (tab.a) and beta (tab.b)
# for each of the four sampling schemes. do.call(rbind, <named list>) keeps
# the list names as rownames, replacing the explicit rownames() calls.
probs     = c(0.05,0.25,0.5,0.75,0.95)
draw.list = list(SIR   = draws.sir,
                 MCMC1 = draws.mcmc1,
                 MCMC2 = draws.mcmc2,
                 MC    = draws.mc)
tab.a = do.call(rbind, lapply(draw.list, function(d) quantile(d[,1], probs)))
tab.b = do.call(rbind, lapply(draw.list, function(d) quantile(d[,2], probs)))

round(tab.a,2)
##         5%  25%  50% 75%  95%
## SIR   2.06 2.12 2.15 2.2 2.26
## MCMC1 2.07 2.12 2.16 2.2 2.26
## MCMC2 2.07 2.12 2.16 2.2 2.26
## MC    2.07 2.12 2.16 2.2 2.26
round(tab.b,2)
##         5%  25%  50%  75%  95%
## SIR   2.64 2.85 2.99 3.16 3.40
## MCMC1 2.65 2.87 3.01 3.17 3.41
## MCMC2 2.67 2.87 3.02 3.18 3.40
## MC    2.66 2.87 3.01 3.17 3.41

3.6 Posterior predictive distribution

A final exercise is to obtain a Monte Carlo approximation to the posterior predictive of \(y\) for a new value of \(x\), say \({\widetilde x}\): \[ p({\widetilde y}|{\widetilde x}) = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} p_N\left({\widetilde y}|\frac{\alpha{\widetilde x}}{\beta+{\widetilde x}},\sigma^2\right) p(\alpha,\beta|\mbox{data})d\alpha d\beta, \] which can be approximated by \[ p_{MC}({\widetilde y}|{\widetilde x}) = \frac1M\sum_{i=1}^M p_N\left({\widetilde y}|\frac{\alpha^{(i)}{\widetilde x}}{\beta^{(i)}+{\widetilde x}},\sigma^2\right) \] where \(\{(\alpha,\beta)^{(1)},\ldots,(\alpha,\beta)^{(M)}\}\) is a Monte Carlo sample from \(p(\alpha,\beta|\mbox{data})\), which could be from any of the previous 4 schemes we already implemented, i.e. SIR, MCMC1, MCMC2 or MC.

# Posterior predictive for xnew=5 and xnew=10
# Rao-Blackwellized estimate: average the Gaussian density of ytilde over the
# posterior draws of (alpha,beta) from the MC scheme, on a grid of ytilde.
alpha.d  = draws.mc[,1]
beta.d   = draws.mc[,2]
N        = 200
yy       = seq(1,2.5,length=N)
postpred = matrix(0,N,2)
for (i in 1:N){
  postpred[i,1] = mean(dnorm(yy[i],alpha.d*5/(beta.d+5),sigma))
  postpred[i,2] = mean(dnorm(yy[i],alpha.d*10/(beta.d+10),sigma))
}
par(mfrow=c(1,2))
plot(yy,postpred[,1],ylab="Posterior predictive density",xlab="Rate of a reaction (μmol)",type="l")
title("x = 5")
points(1.4,0,pch=16,cex=2)   # observed rate at x=5, for reference
plot(yy,postpred[,2],ylab="Posterior predictive density",xlab="Rate of a reaction (μmol)",type="l")
title("x = 10")
points(1.65,0,pch=16,cex=2)  # observed rate at x=10, for reference

# Keep a copy for comparison with the unknown-sigma analysis later.
postpred1 = postpred

Similarly, given \({\widetilde x}\), a sample from \(p(y|{\widetilde x})\) can be obtained as follows \[ {\widetilde y}^{(i)} \sim N\left(\frac{\alpha^{(i)}{\widetilde x}}{\beta^{(i)}+{\widetilde x}},\sigma^2\right), \qquad\qquad i=1,\ldots,M. \] Therefore, \(\{{\widetilde y}^{(1)},\ldots,{\widetilde y}^{(M)}\}\) is a sample from \(p(y|{\widetilde x})\).

# Sample ytilde ~ N(alpha x/(beta+x), sigma^2) for each posterior draw over a
# grid of 20 x values, then summarize with pointwise 5%/50%/95% quantiles.
alpha.d = draws.mc[,1]
beta.d  = draws.mc[,2]
N       = 20
xx      = seq(min(x),max(x),length=N)
yy      = matrix(0,N,M)
for (i in 1:N)
  yy[i,] = rnorm(M,alpha.d*xx[i]/(beta.d+xx[i]),sigma)
quant = t(apply(yy,1,quantile,c(0.05,0.5,0.95)))

par(mfrow=c(1,1))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)",
     ylim=range(quant,y),pch=16,cex=0.75)
lines(xx,quant[,1],lty=2,col=2,lwd=2)   # lower 5% band
lines(xx,quant[,2],col=2,lwd=2)         # posterior predictive median
lines(xx,quant[,3],lty=2,col=2,lwd=2)   # upper 95% band

# Keep a copy for comparison with the unknown-sigma analysis later.
quant1 = quant

4 Learning \((\alpha,\beta,\sigma^2)\)

As a final task, let us now consider the error variance, \(\sigma^2\), unknown, and assume its prior is an inverse-gamma distribution with parameters \(a_0\) and \(b_0\), denoted by \(\sigma^2 \sim IG(a_0,b_0)\) with density \[ p(\sigma^2|a_0,b_0) \propto (\sigma^2)^{-(a_0+1)}\exp\{-b_0/\sigma^2\}. \] Therefore, the full conditional posterior distribution of \(\sigma^2\) can be easily derived by noticing that \[ z_i \equiv y_i-\frac{\alpha x_i}{\beta+x_i} \sim N(0,\sigma^2), \] has likelihood \[ L(\sigma^2|\mbox{data}) \propto (\sigma^2)^{-n/2}\exp\left\{-\frac{1}{2\sigma^2}\sum_{i=1}^n z_i^2\right\}. \] Combining \(p(\sigma^2|a_0,b_0)\) and \(L(\sigma^2|\mbox{data})\), leads to \[ p(\sigma^2|\alpha,\beta,\mbox{data}) \propto (\sigma^2)^{-(a_0+n/2+1)}\exp\left\{-\frac{b_0+\sum_{i=1}^n z_i^2/2}{\sigma^2}\right\}, \] which is an inverse-gamma with parameters \(a_0+n/2\) and \(b_0+\sum_{i=1}^n z_i^2/2\). Here we will consider a fairly uninformative prior for \(\sigma^2\) with \(a_0=10\) and \(b_0=0.018\). The prior mean for \(\sigma^2\) is \(0.04^2\). The 95% prior credibility interval (C.I.) for \(\sigma^2\) is \((0.001054,0.003754)\). It is worth noting that the 95% prior C.I. for \(\sigma\) is \((0.0324,0.0613)\). Recall that we fixed \(\sigma=0.04305\) in the previous analyses.

Therefore, we only need to add another layer in the MCMC scheme that was designed to sample from \(p(\alpha,\beta|\mbox{data},\sigma^2)\). The compositional aspect of the Gibbs sampler or, more generally, of all MCMC schemes is one of its main strengths, and one can argue that it is the main reason for the massive success of MCMC for modern Bayesian inference, which spread quickly to virtually all areas of applied sciences.

# Hyperparameters of p(sigma^2), an inverse-gamma IG(a0, b0) prior.
a0 = 10
b0 = 0.018

# MCMC setup: heavier thinning (L=40) than MCMC1, since sigma is now sampled too.
set.seed(12566)
M0    = 2000     # burn-in
M     = 5000     # retained draws
L     = 40       # thinning interval
niter = M0+M*L
v.b   = 0.5      # random-walk proposal sd for beta
beta  = beta.nls # initialize at the nls estimates
sigma = 0.04305 
draws = matrix(0,niter,3)
for (iter in 1:niter){
  # sampling alpha (Gibbs step): Gaussian full conditional given (beta, sigma)
  z     = x/(beta+x)
  var   = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean  = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha = rnorm(1,mean,sqrt(var))
  # sampling beta (random-walk Metropolis step; symmetric proposal)
  beta1     = rnorm(1,beta,v.b)
  logaccept = min(0,dnorm(beta1,mub,sigb,log=TRUE)+sum(dnorm(y,alpha*x/(beta1+x),sigma,log=TRUE))-
                    dnorm(beta,mub,sigb,log=TRUE)-sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE)))
  if (log(runif(1))<logaccept){
    beta = beta1
  }
  # sampling sigma^2 (Gibbs step): the full conditional is
  # IG(a0 + n/2, b0 + sum(e^2)/2); draw the precision from a gamma and invert.
  e    = y-alpha*x/(beta+x)
  par1 = a0 + n/2
  par2 = b0 + sum(e^2)/2
  sigma = sqrt(1/rgamma(1,par1,par2))
  draws[iter,] = c(alpha,beta,sigma)
}
# Discard burn-in and keep every L-th draw.
draws.full = draws[seq(M0+1,niter,by=L),]


# Trace plot, autocorrelation function and histogram for each parameter.
names = c("alpha","beta","sigma")
par(mfrow=c(3,3))
for (i in 1:3){
  ts.plot(draws.full[,i],ylab="",main=names[i])
  acf(draws.full[,i],main="")
  hist(draws.full[,i],xlab="",main="",prob=TRUE)
}

4.1 Bivariate marginal posterior distributions

# Pairwise scatterplots of the posterior draws for (alpha, beta, sigma).
par(mfrow=c(1,3))
plot(draws.full[,1],draws.full[,2],xlab=expression(alpha),ylab=expression(beta))
plot(draws.full[,1],draws.full[,3],xlab=expression(alpha),ylab=expression(sigma))
plot(draws.full[,2],draws.full[,3],xlab=expression(beta),ylab=expression(sigma))

# Posterior 5%/50%/95% quantiles for each parameter.
tab = t(apply(draws.full,2,quantile,c(0.05,0.5,0.95)))
rownames(tab) = c("alpha","beta","sigma")
tab
##               5%        50%        95%
## alpha 2.06764694 2.16068167 2.26425048
## beta  2.64535474 3.01337054 3.41879442
## sigma 0.03492138 0.04320209 0.05496438

4.2 Posterior predictive distribution

# Predictive bands that also propagate uncertainty about sigma: each posterior
# draw uses its own sigma.d value when simulating ytilde.
alpha.d = draws.full[,1]
beta.d  = draws.full[,2]
sigma.d = draws.full[,3]
N       = 20
xx      = seq(min(x),max(x),length=N)
yy      = matrix(0,N,M)
for (i in 1:N)
  yy[i,] = rnorm(M,alpha.d*xx[i]/(beta.d+xx[i]),sigma.d)
quant = t(apply(yy,1,quantile,c(0.05,0.5,0.95)))

par(mfrow=c(1,2))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)",
     ylim=range(quant,y),pch=16,cex=0.75)
lines(xx,quant[,1],lty=2,col=2,lwd=2)
lines(xx,quant[,2],col=2,lwd=2)
lines(xx,quant[,3],lty=2,col=2,lwd=2)
# Overlay the known-sigma bands (quant1) from the earlier analysis.
lines(xx,quant1[,1],lty=2,col=4,lwd=2)
lines(xx,quant1[,2],col=4,lwd=2)
lines(xx,quant1[,3],lty=2,col=4,lwd=2)
legend("topleft",legend=c("known sigma","unknown sigma"),col=c(4,2),lty=1,lwd=2,bty="n")

# Posterior predictive density at x=10, averaging over (alpha,beta,sigma).
N        = 200
yy       = seq(1,2.5,length=N)
postpred = rep(0,N)
for (i in 1:N)
  postpred[i] = mean(dnorm(yy[i],alpha.d*10/(beta.d+10),sigma.d))

plot(yy,postpred,ylab="Posterior predictive density",xlab="Rate of a reaction (μmol)",
    type="l",col=2,ylim=c(0,8.5),lwd=2)
title("x = 10")
points(1.65,0,pch=16,cex=2)
# Overlay the known-sigma predictive (postpred1) for comparison.
lines(yy,postpred1[,2],col=4,lwd=2)

---
title: "Nonlinear Gaussian regression"
subtitle: "SIR, MCMC and MC algorithms"
author: "Hedibert F. Lopes"
date: "12/06/2025"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_collapsed: true
    code_download: yes
    number_sections: true
---

# The Michaelis-Menten nonlinear model

In biochemistry, Michaelis–Menten kinetics, named after Leonor Michaelis and Maud Menten, is the simplest case of enzyme kinetics, applied to enzyme-catalysed reactions involving the transformation of one substrate into one product. It takes the form of a differential equation describing the reaction rate $y$ (μmol/min:Micromoles per minute), as a function of $x$, the concentration of the substrate (mM:millimolar) - Source: https://en.wikipedia.org/wiki/Michaelis-Menten_kinetics.

The dataset below is from an enzyme kinetics experiment, which will be modeled as
$$
y_i = \frac{\alpha x_i}{\beta+x_i} + \epsilon_i \qquad\qquad \epsilon_i \sim N(0,\sigma^2),
$$
where $\alpha$ is the maximum reaction rate and $\beta$ is the Michaelis constant.


```{r}
#install.packages("mvtnorm")
library("mvtnorm")   # provides dmvnorm(), used in the MC (marginalized) sampler

# Enzyme kinetics data: reaction rate (micromol/min) observed at eight
# substrate concentrations (mM).
substrate = c(0.5, 1.0, 2.0, 3.0, 5.0, 7.0, 10.0, 15.0)
rate      = c(0.26, 0.50, 0.85, 1.10, 1.40, 1.55, 1.65, 1.75)
# Generic aliases used throughout: x = predictor, y = response.
x = substrate
y = rate
n = length(x)    # sample size (8)
```

## Exploratory data analysis

```{r fig.width=6,fig.height=5,fig.align='center'}
# Scatterplot of rate versus concentration: the rate rises quickly at low
# concentrations and then saturates -- the Michaelis-Menten signature shape.
par(mfrow=c(1,1))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)")
```

Here we try to figure out $\alpha$ and $\beta$ by running regressions through the origin of the form
$$
y_i = \alpha z_i + \epsilon_i
$$ 
where $z_i=x_i/(\beta+x_i)$, for each value of $\beta$ on a grid in the interval $(0,5)$, and then choosing the $\beta$ that minimizes the residual sum of squares, $RSS(\beta)$.

```{r fig.width=8,fig.height=4,fig.align='center'}
# Grid search for beta: for each candidate, regress y on z = x/(beta+x)
# through the origin and record the mean squared residual (same argmin as RSS).
nn = 100
bs = seq(0,5,length=nn)
rss = rep(0,nn)
for (i in seq_len(nn)){
  x1 = x/(bs[i]+x)
  # residuals() avoids the partial-matching footgun of $res
  rss[i] = mean(residuals(lm(y~x1-1))^2)
}

# which.min() returns a single index even under numerical ties, unlike the
# logical subset bs[rss == min(rss)], which can yield a vector.
beta.eda  = bs[which.min(rss)]
x1        = x/(beta.eda+x)
alpha.eda = coef(lm(y~x1-1))
alpha.eda
beta.eda

par(mfrow=c(1,2))
plot(bs,rss,xlab=expression(beta),ylab="Residual sum of squares",pch=16,cex=0.5)
abline(v=beta.eda)
title(paste("alpha=",round(alpha.eda,3)," - beta=",round(beta.eda,3),sep=""))
plot(x,y,ylab="Rate of a reaction (μmol)",xlab="Concentration of a substrate (mM)",pch=16)
xx = seq(min(x),max(x),length=100)
lines(xx,alpha.eda*xx/(beta.eda+xx),col=2)
points(x,alpha.eda*x/(beta.eda+x),col=2,pch=16)
```


# Nonlinear least squares estimation

Let us plot the likelihood function on a grid for $(\alpha,\beta)$.

```{r fig.width=5,fig.height=5,fig.align='center'}
# Evaluate the Gaussian log-likelihood of (alpha, beta) on a 200 x 200 grid,
# holding the error standard deviation fixed at the nls estimate below.
sigma   = 0.04305
ngrid   = 200
alpha.g = seq(1, 3, length = ngrid)
beta.g  = seq(0, 6, length = ngrid)
like    = matrix(0, ngrid, ngrid)
for (i in seq_len(ngrid)) {
  for (j in seq_len(ngrid)) {
    mu        = alpha.g[i]*x/(beta.g[j]+x)
    like[i,j] = sum(dnorm(y, mu, sigma, log = TRUE))
  }
}

# Exponentiate for contouring (unnormalized likelihood surface).
contour(alpha.g,beta.g,exp(like),xlab=expression(alpha),ylab=expression(beta),drawlabels=FALSE)
title("Likelihood function")
```


## Nonlinear fit via function *nls*

```{r}
# Fit the Michaelis-Menten curve by nonlinear least squares, using the
# grid-search (EDA) values as starting points.
fit.nls = nls(y~alpha*x/(beta+x),start=list(alpha=alpha.eda,beta=beta.eda))

summary(fit.nls)

# Point estimates and standard errors; these feed the SIR and Metropolis
# proposal densities below.
beta.nls  = summary(fit.nls)$coef[2]
alpha.nls = summary(fit.nls)$coef[1]
sigma.nls = summary(fit.nls)$sigma
# Reconstructs the reported Std. Errors: sigma_hat * sqrt(diag(cov.unscaled)).
se = sigma.nls*sqrt(diag(summary(fit.nls)$cov.unscaled))
sa.nls = se[1]
sb.nls = se[2]
c(alpha.nls,beta.nls)
c(sa.nls,sb.nls)
```

# Bayesian estimation
Let us rewrite the likelihood function for clarity
$$
L(\alpha,\beta|\mbox{data}) \propto \exp\left\{-\frac{1}{2\sigma^2}\sum_{i=1}^n \left(y_i-\frac{\alpha x_i}{\beta + x_i}\right)^2\right\}.
$$
We will assume that the prior for $(\alpha,\beta)$ is 
$$
p(\alpha,\beta)=p_N(\alpha|\mu_\alpha,\sigma_\alpha^2)p(\beta|\mu_\beta,\sigma_\beta^2),
$$
where $p_N(\cdot|\theta,\tau^2)$ is the density of Gaussian distribution with mean $\theta$ and variance $\tau^2$,  Here we assume that $\mu_\alpha=2$, $\sigma_\alpha^2=1$, $\mu_\beta=3$ and $\sigma_\beta^2=1$.

```{r fig.width=6,fig.height=6,fig.align='center'}
# Hyperparameters of the independent Gaussian priors p(alpha) and p(beta).
mua   = 2
mub   = 3
siga  = 1
sigb  = 1
# Log prior and log posterior (up to constants) on the likelihood grid.
prior = matrix(0,200,200)
post  = matrix(0,200,200)
for (i in 1:200){
  lpa = dnorm(alpha.g[i],mua,siga,log=TRUE)   # alpha term, constant in j
  for (j in 1:200){
    prior[i,j] = lpa + dnorm(beta.g[j],mub,sigb,log=TRUE)
    post[i,j]  = like[i,j] + prior[i,j]
  }
}
contour(alpha.g,beta.g,exp(prior),xlab=expression(alpha),ylab=expression(beta),drawlabels=FALSE)
contour(alpha.g,beta.g,exp(post),drawlabels=FALSE,col=2,add=TRUE)
legend("topleft",legend=c("Prior","Posterior"),col=1:2,lwd=2,bty="n",lty=1)
```

## Sampling importance resampling (SIR)

Here, we will use the output of the previous nonlinear least squares estimation to feed the  hyperparameters for the proposal densities:
$$
q(\alpha,\beta) = q(\alpha)q(\beta) \equiv p_N(\alpha|\alpha_{nls},9\sigma_{\alpha,nls}^2)
p_N(\beta|\beta_{nls},9\sigma_{\beta,nls}^2).
$$

```{r fig.width=7,fig.height=7,fig.align='center'}
# SIR proposal hyperparameters: Gaussians centered at the nls estimates with
# standard deviations inflated by 3 (variance 9x) to overdisperse the proposal.
M     = 10000
ma    = alpha.nls
sa    = 3*sa.nls
mb    = beta.nls
sb    = 3*sb.nls


set.seed(12345)
alphas = rnorm(M,ma,sa)
betas  = rnorm(M,mb,sb)
w      = rep(0,M)
# Log importance weights: log-likelihood + log-prior - log-proposal.
for (i in 1:M)
w[i] = sum(dnorm(y,alphas[i]*x/(betas[i]+x),sigma,log=TRUE))+
       dnorm(alphas[i],mua,siga,log=TRUE)+
       dnorm(betas[i],mub,sigb,log=TRUE)-
       dnorm(alphas[i],ma,sa,log=TRUE)-
       dnorm(betas[i],mb,sb,log=TRUE)
# BUG FIX: the beta prior term above was previously evaluated WITHOUT log=TRUE,
# mixing a raw density into a sum of log densities and corrupting the weights.
w = exp(w-max(w))   # subtract the max before exponentiating for stability
ind = sample(1:M,size=M,replace=TRUE,prob=w)   # resampling step
alphas1 = alphas[ind]
betas1 = betas[ind]
draws.sir = cbind(alphas1,betas1)

par(mfrow=c(1,1))
contour(alpha.g,beta.g,exp(prior),xlab=expression(alpha),ylab=expression(beta),
        drawlabels=FALSE,col=2)
points(alphas,betas,pch=16)
points(draws.sir[,1],draws.sir[,2],pch=16,col=3)
contour(alpha.g,beta.g,exp(post),add=TRUE,col=4)
legend("topleft",legend=c("Prior density (grid)","Proposal draws","Posterior (grid)",
       "Posterior draws (SIR)"),col=c(2,1,4,3),pch=16,bty="n")
```



## MCMC1 

In this first MCMC scheme, we use a random-walk Metropolis step for $\beta$ and a Gibbs sampler step for $\alpha$.  The Gibbs sampler for $\alpha$ takes advantage of the fact that conditional on $\beta$ the nonlinear model becomes a linear model as far as $\alpha$ is concerned so the full conditional $p(\alpha|\beta,\mbox{data})$ is an easily derived Gaussian distribution.

* Random walk Metropolis step: $\beta \sim N(\beta^{(i)},v_\beta^2)$ 

* Gibbs sampler step: $\alpha|\beta,\mbox{data} \sim N(\gamma,\omega^2)$, where
$$
\omega^2 = \left( \frac{1}{\sigma_\alpha^2} + \frac{\sum_{i=1}^n z_i^2}{\sigma^2} \right)^{-1} \ \ \ \mbox{and} \ \ \ \gamma = \omega^2\left( \frac{\mu_\alpha}{\sigma_\alpha^2} + 
\frac{\sum_{i=1}^n y_iz_i}{\sigma^2}\right),
$$
where $z_i = x_i/(\beta+x_i)$, for $i=1,\ldots,n$.

```{r fig.width=10,fig.height=7,fig.align='center'}
# MCMC setup
M0    = 10000    # burn-in iterations
M     = 20000    # retained draws
L     = 1        # thinning interval
niter = M0+M*L
v.b   = 0.5      # random-walk proposal sd for beta
beta  = beta.nls # initialize the chain at the nls estimate
draws = matrix(0,niter,2)
for (iter in 1:niter){
  # sampling alpha (Gibbs step): conditional on beta the model is linear in
  # alpha, so p(alpha|beta,data) is Gaussian with the moments below.
  z     = x/(beta+x)
  var   = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean  = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha = rnorm(1,mean,sqrt(var))
  # sampling beta (random-walk Metropolis step): the Gaussian proposal is
  # symmetric, so the log acceptance ratio is just log prior + log likelihood
  # at the proposed beta1 minus the same at the current beta.
  beta1     = rnorm(1,beta,v.b)
  logaccept = min(0,dnorm(beta1,mub,sigb,log=TRUE)+sum(dnorm(y,alpha*x/(beta1+x),sigma,log=TRUE))-
                    dnorm(beta,mub,sigb,log=TRUE)-sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE)))
  if (log(runif(1))<logaccept){
    beta = beta1
  }
  draws[iter,] = c(alpha,beta)
}
# Discard burn-in and thin (L=1 keeps every post-burn-in draw).
draws.mcmc1 = draws[seq(M0+1,niter,by=L),]


# Trace plot, autocorrelation function and histogram for each parameter.
names = c("alpha","beta")
par(mfrow=c(2,3))
for (i in 1:2){
  ts.plot(draws.mcmc1[,i],ylab="",main=names[i])
  acf(draws.mcmc1[,i],main="")
  hist(draws.mcmc1[,i],xlab="",main="",prob=TRUE)
}
```


## MCMC2

Here, we decided to sample $(\alpha,\beta)$ jointly in the MCMC scheme.  In order to do that, we implement an independent Metropolis algorithm for $(\alpha,\beta)$ where the proposal $q(\alpha,\beta)$ is the same one used in our implementation of the above SIR scheme.

```{r fig.width=10,fig.height=7,fig.align='center'}
# Independent Metropolis: propose (alpha,beta) jointly from the same Gaussian
# proposal q used in the SIR scheme; accept with the Metropolis-Hastings ratio.
alpha = alpha.nls
beta  = beta.nls
niter = M0+M*L
draws = matrix(0,niter,2)
for (iter in 1:niter){
  alpha1 = rnorm(1,ma,sa)
  beta1  = rnorm(1,mb,sb)
  # log posterior (up to a constant) at the proposal and at the current state
  post.n = dnorm(alpha1,mua,siga,log=TRUE)+dnorm(beta1,mub,sigb,log=TRUE)+
           sum(dnorm(y,alpha1*x/(beta1+x),sigma,log=TRUE))
  post.d = dnorm(alpha,mua,siga,log=TRUE)+dnorm(beta,mub,sigb,log=TRUE)+
           sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE))
  # log proposal densities. BUG FIX: these were previously computed on the raw
  # density scale (log=TRUE missing) yet combined with log quantities, so the
  # acceptance probability was wrong.
  prop.n = dnorm(alpha,ma,sa,log=TRUE)+dnorm(beta,mb,sb,log=TRUE)
  prop.d = dnorm(alpha1,ma,sa,log=TRUE)+dnorm(beta1,mb,sb,log=TRUE)
  logaccept = min(0,post.n-post.d+prop.n-prop.d)
  if (log(runif(1))<logaccept){
    alpha = alpha1
    beta = beta1
  }
  draws[iter,] = c(alpha,beta)
}
draws.mcmc2 = draws[seq(M0+1,niter,by=L),]

par(mfrow=c(2,3))
for (i in 1:2){
  ts.plot(draws.mcmc2[,i],ylab="",main=names[i])
  acf(draws.mcmc2[,i],main="")
  hist(draws.mcmc2[,i],xlab="",main="",prob=TRUE)
}
```

## MC

Here, we integrate out $\alpha$ analytically in order to obtain $p(\beta|\mbox{data})$. Then we sample $\beta$ from this marginal posterior distribution via SIR.  Then, the $\beta$ draws are used to directly sample $\alpha$ from $p(\alpha|\beta,\mbox{data})$.  Therefore, an MCMC is not needed! 

1) Sample $\{\beta^{(1)},\ldots,\beta^{(M)}\}$ from $p(\beta|\mbox{data})$ via SIR with proposal $q(\beta)=p_N(\beta|\beta_{nls},\sigma_{nls}^2)$.  To implement SIR we need the integrated likelihood for $\beta$ given $y=(y_1,\ldots,y_n)'$:
$$
p(y|\beta) = \int_{-\infty}^{\infty} p(y|\alpha,\beta)p(\alpha)d\alpha = 
p_N(y|\mu_\alpha z,\sigma^2 I_n + \sigma_\alpha^2 zz'),
$$
where $z=(z_1,\ldots,z_n)'$ and $z_i=x_i/(\beta+x_i)$, for $i=1,\ldots,n$.  Notice that, by integrating $\alpha$ out, the marginal $p(y|\beta)$ is a multivariate normal whose non-diagonal covariance matrix is a function of the $(n \times n)$ matrix $zz'$.

2) Sample $\alpha^{(i)}$ from $p(\alpha|\beta^{(i)},\mbox{data})$, similar to the Gibbs sampler step of the above MCMC1 algorithm.

```{r fig.width=10,fig.height=7,fig.align='center'}
# sampling beta from p(beta|data) via SIR, using the integrated likelihood
# p(y|beta) (alpha marginalized out analytically).
M    = 10000
beta = rnorm(M,mb,sb)
w    = rep(0,M)   # FIX: initialize weights here (code previously relied on
                  # the leftover w vector from the earlier SIR section)
for (i in 1:M){
 xx = x/(beta[i]+x)
 mean = mua*xx                             # E[y|beta], alpha integrated out
 var = siga^2*xx%*%t(xx)+diag(sigma^2,n)   # Cov[y|beta] = sigma^2 I + siga^2 zz'
 w[i] = dmvnorm(y,mean=mean,sigma=var,log=TRUE)+
        dnorm(beta[i],mub,sigb,log=TRUE)-
        dnorm(beta[i],mb,sb,log=TRUE)
}
w     = exp(w-max(w))
beta1 = sample(beta,size=M,replace=TRUE,prob=w)
beta  = beta1

# sampling alpha from p(alpha|beta,data): Gaussian full conditional, as in the
# Gibbs step of MCMC1.
alpha = rep(0,M)
for (i in 1:M){
  z        = x/(beta[i]+x)
  var      = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean     = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha[i] = rnorm(1,mean,sqrt(var))
}
draws.mc = cbind(alpha,beta)

par(mfrow=c(2,2))
for (i in 1:2){
  ts.plot(draws.mc[,i],ylab="",main=names[i])
  hist(draws.mc[,i],xlab="",main="",prob=TRUE)
}
```

## Comparing MC and MCMC algorithms

```{r fig.width=10,fig.height=8,fig.align='center'}
# Posterior scatterplots from the four samplers on a common axis range, each
# overlaid with the grid-based posterior contours for visual comparison.
par(mfrow=c(2,2))
xrange = range(draws.sir[,1],draws.mcmc1[,1],draws.mcmc2[,1],draws.mc[,1])
yrange = range(draws.sir[,2],draws.mcmc1[,2],draws.mcmc2[,2],draws.mc[,2])
plot(draws.sir[,1],draws.sir[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("SIR")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mcmc1[,1],draws.mcmc1[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MCMC 1")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mcmc2[,1],draws.mcmc2[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MCMC 2")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
plot(draws.mc[,1],draws.mc[,2],xlab=expression(alpha),ylab=expression(beta),pch=16,ylim=yrange,xlim=xrange)
title("MC")
contour(alpha.g,beta.g,exp(post),add=TRUE,col=5,drawlabels=FALSE)
```

```{r fig.width=10,fig.height=6,fig.align='center'}
# Marginal posterior densities of alpha and beta from the four samplers.
par(mfrow=c(1,2))
plot(density(draws.sir[,1]),xlab="",main=expression(alpha))
lines(density(draws.mcmc1[,1]),col=2)
lines(density(draws.mcmc2[,1]),col=3)
lines(density(draws.mc[,1]),col=4)
plot(density(draws.sir[,2]),xlab="",main=expression(beta))
lines(density(draws.mcmc1[,2]),col=2)
lines(density(draws.mcmc2[,2]),col=3)
lines(density(draws.mc[,2]),col=4)
```

```{r}
# 5%/25%/50%/75%/95% posterior quantiles of alpha (tab.a) and beta (tab.b)
# for each of the four sampling schemes. do.call(rbind, <named list>) keeps
# the list names as rownames, replacing the explicit rownames() calls.
probs     = c(0.05,0.25,0.5,0.75,0.95)
draw.list = list(SIR   = draws.sir,
                 MCMC1 = draws.mcmc1,
                 MCMC2 = draws.mcmc2,
                 MC    = draws.mc)
tab.a = do.call(rbind, lapply(draw.list, function(d) quantile(d[,1], probs)))
tab.b = do.call(rbind, lapply(draw.list, function(d) quantile(d[,2], probs)))

round(tab.a,2)
round(tab.b,2)
```

## Posterior predictive distribution

A final exercise is to obtain a Monte Carlo approximation to the posterior predictive of $y$ for a new value of $x$, say ${\widetilde x}$:
$$
p({\widetilde y}|{\widetilde x}) = \int_{-\infty}^{\infty}\int_{-\infty}^{\infty} 
p_N\left({\widetilde y}|\frac{\alpha{\widetilde x}}{\beta+{\widetilde x}},\sigma^2\right)
p(\alpha,\beta|\mbox{data})d\alpha d\beta,
$$
which can be approximated by
$$
p_{MC}({\widetilde y}|{\widetilde x}) = \frac1M\sum_{i=1}^M p_N\left({\widetilde y}|\frac{\alpha^{(i)}{\widetilde x}}{\beta^{(i)}+{\widetilde x}},\sigma^2\right)
$$
where $\{(\alpha,\beta)^{(1)},\ldots,(\alpha,\beta)^{(M)}\}$ is a Monte Carlo sample from $p(\alpha,\beta|\mbox{data})$, which could be from any of the previous 4 schemes we already implemented, i.e. SIR, MCMC1, MCMC2 or MC.  

```{r fig.width=8,fig.height=4,fig.align='center'}
# Posterior predictive for xnew=5 and xnew=10
# Posterior predictive density p(y~|x~) on a grid of y values, approximated
# by averaging the normal density over the MC draws of (alpha,beta); sigma is
# the fixed value used throughout this section.
# Fixes: partial argument match `length=` -> `length.out=`; the duplicated
# hard-coded code for x=5 and x=10 is generalized to a vector of prediction
# points (same final `postpred`/`postpred1` as before).
alpha.d  = draws.mc[,1]
beta.d   = draws.mc[,2]
xnew     = c(5, 10)                      # prediction points
obs      = c(1.4, 1.65)                  # observed rates at x=5 and x=10
N        = 200
yy       = seq(1, 2.5, length.out = N)   # grid of y~ values
postpred = matrix(0, N, length(xnew))
for (j in seq_along(xnew)) {
  for (i in seq_len(N)) {
    # p_MC(y~|x~) = (1/M) sum_m dnorm(y~ | alpha_m x~/(beta_m + x~), sigma)
    postpred[i,j] = mean(dnorm(yy[i], alpha.d*xnew[j]/(beta.d+xnew[j]), sigma))
  }
}
par(mfrow=c(1,length(xnew)))
for (j in seq_along(xnew)) {
  plot(yy, postpred[,j], ylab="Posterior predictive density",
       xlab="Rate of a reaction (μmol)", type="l")
  title(paste("x =", xnew[j]))
  points(obs[j], 0, pch=16, cex=2)       # observed rate marked at density 0
}
postpred1 = postpred
```


Similarly, given ${\widetilde x}$, a sample from $p(y|{\widetilde x})$ can be obtained as follows
$$
{\widetilde y}^{(i)} \sim N\left(\frac{\alpha^{(i)}{\widetilde x}}{\beta^{(i)}+{\widetilde x}},\sigma^2\right), \qquad\qquad i=1,\ldots,M.
$$
Therefore, $\{{\widetilde y}^{(1)},\ldots,{\widetilde y}^{(M)}\}$ is a sample from $p(y|{\widetilde x})$.

```{r fig.width=7,fig.height=7,fig.align='center'}
# Posterior predictive 90% band and median over a grid of substrate values,
# sampling y~ ~ N(alpha x~/(beta + x~), sigma^2) for each posterior draw
alpha.d = draws.mc[,1]
beta.d  = draws.mc[,2]
N       = 20
xx      = seq(min(x), max(x), length.out = N)
yy      = matrix(0, N, M)
for (i in seq_len(N)) {
  mu.i   = alpha.d*xx[i]/(beta.d+xx[i])
  yy[i,] = rnorm(M, mu.i, sigma)
}
quant = t(apply(yy, 1, quantile, probs = c(0.05, 0.5, 0.95)))

par(mfrow=c(1,1))
plot(x, y, ylab="Rate of a reaction (μmol)",
     xlab="Concentration of a substrate (mM)",
     ylim=range(quant, y), pch=16, cex=0.75)
lines(xx, quant[,1], lty=2, col=2, lwd=2)  # 5% limit
lines(xx, quant[,2], col=2, lwd=2)         # median
lines(xx, quant[,3], lty=2, col=2, lwd=2)  # 95% limit
quant1 = quant
```


# Learning $(\alpha,\beta,\sigma^2)$
As a final task, let us now consider unknown the error variance, $\sigma^2$, and assume its prior is an inverse-gamma distribution with parameters $a_0$ and $b_0$, denoted by $\sigma^2 \sim IG(a_0,b_0)$ with density
$$
p(\sigma^2|a_0,b_0) \propto (\sigma^2)^{-(a_0+1)}\exp\{-b_0/\sigma^2\}.
$$
Therefore, the full conditional posterior distribution of $\sigma^2$ can be easily derived by noticing that 
$$
z_i \equiv y_i-\frac{\alpha x_i}{\beta+x_i} \sim N(0,\sigma^2),
$$
has likelihood 
$$
L(\sigma^2|\mbox{data}) \propto (\sigma^2)^{-n/2}\exp\left\{-\frac{1}{2\sigma^2}\sum_{i=1}^n z_i^2\right\}.
$$
Combining $p(\sigma^2|a_0,b_0)$ and $L(\sigma^2|\mbox{data})$, leads to 
$$
p(\sigma^2|\alpha,\beta,\mbox{data}) \propto (\sigma^2)^{-(a_0+n/2+1)}\exp\left\{-\frac{b_0+\sum_{i=1}^n z_i^2/2}{\sigma^2}\right\},
$$
which is an inverse-gamma with parameters $a_0+n/2$ and $b_0+\sum_{i=1}^n z_i^2/2$.  Here we will consider a fairly informative prior for $\sigma^2$ with $a_0=10$ and $b_0=0.018$. The prior mode for $\sigma^2$ is $b_0/(a_0+1)\approx 0.04^2$ (the prior mean is $b_0/(a_0-1)=0.002$).  The 95% prior credibility interval (C.I.) for $\sigma^2$ is $(0.001054,0.003754)$. It is worth noting that the 95% prior C.I. for $\sigma$ is 
$(0.0324,0.0613)$.  Recall that we fixed $\sigma=0.04305$ in the previous analyses.

Therefore, we only need to add another layer in the MCMC scheme that was designed to sample from $p(\alpha,\beta|\mbox{data},\sigma^2)$.  The compositional aspect of the Gibbs sampler or, more generally, of all MCMC schemes is one of its main strengths, and one can argue that it is the main reason for the massive success of MCMC for modern Bayesian inference, which spread quickly to virtually all areas of applied science.

```{r fig.width=10,fig.height=10,fig.align='center'}
# Hyperparameters of the inverse-gamma prior: sigma^2 ~ IG(a0, b0)
a0 = 10
b0 = 0.018

# MCMC setup: Gibbs for alpha and sigma^2, random-walk Metropolis for beta
set.seed(12566)
M0    = 2000       # burn-in iterations (discarded)
M     = 5000       # number of retained draws after thinning
L     = 40         # thinning interval
niter = M0+M*L     # total number of iterations
v.b   = 0.5        # sd of the random-walk proposal for beta
beta  = beta.nls   # initialize beta (NLS estimate from earlier in the file)
sigma = 0.04305 
draws = matrix(0,niter,3)
for (iter in 1:niter){
  # sampling alpha (Gibbs step): conjugate normal full conditional given
  # z_i = x_i/(beta+x_i); mua/siga are the prior mean/sd of alpha (set earlier)
  z     = x/(beta+x)
  var   = 1/(1/siga^2+sum(z^2)/sigma^2)
  mean  = var*(mua/siga^2+sum(z*y)/sigma^2)
  alpha = rnorm(1,mean,sqrt(var))
  # sampling beta (random-walk Metropolis step); prior is N(mub, sigb^2)
  beta1     = rnorm(1,beta,v.b)
  # log acceptance: (log prior + log likelihood) at proposal minus at current
  logaccept = min(0,dnorm(beta1,mub,sigb,log=TRUE)+sum(dnorm(y,alpha*x/(beta1+x),sigma,log=TRUE))-
                    dnorm(beta,mub,sigb,log=TRUE)-sum(dnorm(y,alpha*x/(beta+x),sigma,log=TRUE)))
  if (log(runif(1))<logaccept){
    beta = beta1
  }
  # sampling sigma^2 (Gibbs step): full conditional is IG(a0+n/2, b0+sum(e^2)/2);
  # an IG draw is obtained as 1/gamma, and sigma (the sd) is stored
  e    = y-alpha*x/(beta+x)
  par1 = a0 + n/2
  par2 = b0 + sum(e^2)/2
  sigma = sqrt(1/rgamma(1,par1,par2))
  draws[iter,] = c(alpha,beta,sigma)
}
# discard the burn-in and keep every L-th draw (M draws in total)
draws.full = draws[seq(M0+1,niter,by=L),]


# Trace plot, autocorrelation and histogram for each retained chain
names = c("alpha","beta","sigma")
par(mfrow=c(3,3))
for (i in 1:3){
  ts.plot(draws.full[,i],ylab="",main=names[i])
  acf(draws.full[,i],main="")
  hist(draws.full[,i],xlab="",main="",prob=TRUE)
}
```

## Bivariate marginal posterior distributions

```{r fig.width=10,fig.height=4,fig.align='center'}
# Pairwise scatterplots of the joint posterior draws of (alpha, beta, sigma)
par(mfrow=c(1,3))
labs = expression(alpha, beta, sigma)
pair.idx = list(c(1,2), c(1,3), c(2,3))
for (p in pair.idx) {
  plot(draws.full[,p[1]], draws.full[,p[2]],
       xlab=labs[p[1]], ylab=labs[p[2]])
}

# Posterior 5%, 50% and 95% quantiles for each parameter
tab = t(apply(draws.full, 2, quantile, probs = c(0.05, 0.5, 0.95)))
rownames(tab) = c("alpha","beta","sigma")
tab
```

## Posterior predictive distribution
```{r fig.width=10,fig.height=6,fig.align='center'}
# Posterior predictive bands and density when sigma is also learned:
# each predictive draw uses the corresponding posterior draw of sigma
alpha.d = draws.full[,1]
beta.d  = draws.full[,2]
sigma.d = draws.full[,3]
N       = 20
xx      = seq(min(x), max(x), length.out = N)
yy      = matrix(0, N, M)
for (i in seq_len(N)) {
  yy[i,] = rnorm(M, alpha.d*xx[i]/(beta.d+xx[i]), sigma.d)
}
quant = t(apply(yy, 1, quantile, probs = c(0.05, 0.5, 0.95)))

par(mfrow=c(1,2))
plot(x, y, ylab="Rate of a reaction (μmol)",
     xlab="Concentration of a substrate (mM)",
     ylim=range(quant, y), pch=16, cex=0.75)
# red: unknown-sigma band; then blue: known-sigma band (quant1, earlier chunk)
for (k in 1:3) lines(xx, quant[,k],  lty=c(2,1,2)[k], col=2, lwd=2)
for (k in 1:3) lines(xx, quant1[,k], lty=c(2,1,2)[k], col=4, lwd=2)
legend("topleft", legend=c("known sigma","unknown sigma"),
       col=c(4,2), lty=1, lwd=2, bty="n")

# Predictive density at x~ = 10, averaging over (alpha, beta, sigma) draws
N        = 200
yy       = seq(1, 2.5, length.out = N)
postpred = vapply(yy, function(yt)
  mean(dnorm(yt, alpha.d*10/(beta.d+10), sigma.d)), numeric(1))

plot(yy, postpred, ylab="Posterior predictive density",
     xlab="Rate of a reaction (μmol)",
     type="l", col=2, ylim=c(0,8.5), lwd=2)
title("x = 10")
points(1.65, 0, pch=16, cex=2)           # observed rate at x = 10
lines(yy, postpred1[,2], col=4, lwd=2)   # known-sigma density for comparison
```