1 The mcyle dataset from MASS

A data frame giving a series of measurements of head acceleration in a simulated motorcycle accident, used to test crash helmets:

Source: Silverman (1985) Some aspects of the spline smoothing approach to non-parametric curve tting. JRSS-B, 47, 1-52.

library(MASS) 
n = nrow(mcycle) 
x = mcycle$times 
y = mcycle$accel 
x = x/max(x) 
y = y/max(y) 
x =-1+2*x 
xt = x 
yt = y 
xt = x[1:132] 
yt = y[1:132] 
n = length(xt) 
x = (xt-mean(xt))/sqrt(var(xt)) 
y = (yt-mean(yt))/sqrt(var(yt))

par(mfrow=c(1,1))
plot(x,y,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)")

2 Gaussian polynomial regression

graus <- c(1,2,3,4,5,11,12,13,14,15)
par(mfrow=c(2,5))
for (d in 1:length(graus)){
   fit = lm(y ~ poly(x, graus[d]))
   r2adj = summary(fit)$adj.r.squared
   plot(x,y,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)")
   lines(x, fit$fit, col = 2)
   title(paste("d = ", graus[d] , sep=""))
   legend("bottomright", legend = c(paste("R2adj = ",round(100*r2adj,1),"%", sep = "")),bty="n")
}

2.1 Adjusted \(R^2\) and BIC

dmax = 15
ds    = 1:dmax
N     = length(ds)
r2adj = rep(0,N)
bic   = rep(0,N)
l     = 0


for (d in ds){
  fit      <- lm(y ~ poly(x, d))
  sig2hat  <- summary(fit)$sigma^2
  RSS      <- sum(fit$residuals^2)
  r2adj[d] <- summary(fit)$adj.r.squared
  bic[d]   <- 1/n*(RSS + log(n)*(d + 1)*sig2hat)
}

par(mfrow=c(1,2))
plot(ds, r2adj,xlab = "Order of polynomial", ylab = "Adjusted R2", type = "b", 
     lwd = 2, main = "In-sample Adjusted R2")
points(ds[r2adj == max(r2adj)], max(r2adj), lty = 2, pch = 20, col = 2)

plot(ds, bic, xlab = "Order of polynomial", ylab = "BIC", type="b", 
     main = "In-sample BIC", lwd = 2)
points(ds[bic == min(bic)], min(bic), lty = 2, pch = 20, col =2)

3 Cross-validation: training and testing

set.seed(2718282)
perc = c(0.1,0.15,0.2)
par(mfrow=c(1,3))
for (j in 1:3){
train = sample(1:length(x),length(x)*(1-perc[j]))
test  = (-train)
ytr   = y[train]
xtr   = x[train]
yte   = y[test]
xte   = x[test]
N     = length(ds)
RMSE  = rep(0,N)
xxtr  = NULL
xxte  = NULL

for (d in ds){
  xxtr  = cbind(xxtr,xtr^d)
  xxte  = cbind(xxte,xte^d)
  fit = lm(ytr~xxtr)
  ehat = yte - cbind(1,xxte)%*%fit$coef
  RMSE[d] = sqrt(mean(ehat^2))
}

plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b")
abline(v = ds[RMSE==min(RMSE)], lty = 2)
title(paste("Cross validation (",1-perc[j],",",perc[j],")",sep=""))
}

4 Cross-validation: training and testing - 200 replications

set.seed(56789)
perc = c(0.1,0.15,0.2,0.25)
S     = 200
N     = length(ds)
RMSE  = matrix(0,S,N)

par(mfrow=c(2,2))
for (j in 1:4){
freq  = rep(0,N)
for (s in 1:S){
  train = sample(1:length(x),length(x)*(1-perc[j]))
  test  = (-train)
  ytr   = y[train]
  xtr   = x[train]
  yte   = y[test]
  xte   = x[test]
  l     = 0
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    l     = l + 1
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    RMSE[s,l] = sqrt(mean(ehat^2))
  }
  freq[ds[RMSE[s,]==min(RMSE[s,],na.rm=TRUE)][1]] = freq[ds[RMSE[s,]==min(RMSE[s,],na.rm=TRUE)][1]] + 1
}
boxplot(RMSE,outline=FALSE,xlab="Order of polynomial", ylab = "Root MSE",ylim=c(0,2))  
for (i in 1:N)
  text(i,0,paste(round(100*freq[i]/S,1),"%",sep=""),cex=0.75)
title(paste("Cross validation\n Training = ",100*(1-perc[j]),"% - ",S," replications",sep=""))
}

5 Cross-validation: leave-one-out (LOO)

set.seed(2718282)
RMSE  = rep(0,N)
for (i in 1:n){
  test = i
  ytr = y[-i]
  xtr = x[-i]
  N     = length(ds)
  xxtr  = NULL
  xxte = NULL
  for (d in ds){
    xxtr   = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,x[i]^d)
    fit      = lm(ytr~xxtr)
    RMSE[d] = RMSE[d] + (y[i] - cbind(1,xxte)%*%fit$coef)^2
  }
}
par(mfrow=c(1,1))
plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b",main="Leave-one-out (LOO)")
abline(v = ds[RMSE==min(RMSE)], lty = 2)

6 10-fold cross-validation

ind = sample(1:n,size=n,replace=FALSE)
x1 = x[ind]
y1 = y[ind]
fold = 10
L = trunc(n/fold)
left = seq(1,n,by=L)[1:fold]
right = seq(L,n,by=L)
right[fold]=n
MSE = rep(0,dmax)
for (f in 1:fold){
  test = left[f]:right[f]
  ytr = y1[-test]
  xtr = x1[-test]
  yte = y1[test]
  xte = x1[test]
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    MSE[d] = MSE[d] + sum(ehat^2)
  }
}
RMSE = MSE/n

par(mfrow=c(1,1))
plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b",main="")
title(paste(fold,"-fold cross validation",sep=""))
abline(v = ds[RMSE==min(RMSE)], lty = 2)

7 10-fold cross-validation - 200 replications

fold = 10
S = 200
RMSE = matrix(0,S,dmax)
order = rep(0,S)
for (s in 1:S){
ind = sample(1:n,size=n,replace=FALSE)
x1 = x[ind]
y1 = y[ind]
L = trunc(n/fold)
left = seq(1,n,by=L)[1:fold]
right = seq(L,n,by=L)
right[fold]=n
MSE = rep(0,dmax)
for (f in 1:fold){
  test = left[f]:right[f]
  ytr = y1[-test]
  xtr = x1[-test]
  yte = y1[test]
  xte = x1[test]
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    MSE[d] = MSE[d] + sum(ehat^2)
  }
}
RMSE[s,] = MSE/n
order[s] = ds[MSE==min(MSE)]
}

freq = rep(0,dmax)
for (s in 1:S){
  freq[order[s]]=  freq[order[s]]+1
}
freq = 100*freq/S

par(mfrow=c(1,1))
boxplot(RMSE,outline=FALSE,xlab = "Order of polynomial", ylab = "Root MSE",ylim=c(0.1,1))
title(paste(fold,"-fold cross validation\n",S," replications",sep=""))
for (d in 1:dmax)
  text(d,0.1,paste(freq[d],"%",sep=""),cex=0.75)

8 Bootstrap for the best model

dbest = ds[freq==max(freq)]
B = 1000
fit = matrix(0,B,n)
plot(x,y,pch=16,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)",ylim=c(-4,4))
title(paste("Bootstraping the best model\n B=",B," - best model = ",dbest,sep=""))
for (b in 1:B){
  ind = sample(1:n,size=n,replace=TRUE)
  xx = x[ind]
  yy = y[ind]
  X = NULL
  for (d in 1:dbest)
    X  = cbind(X,xx^d)
  fit[b,] = lm(yy~X)$fit
  ord = order(x[ind])
  lines(xx[ord],fit[b,ord],col=grey(0.8))
}
points(x,y,pch=16)

---
title: "Gaussian polynomial regression"
subtitle: "Revisiting the motorcycle data"
author: "Hedibert Freitas Lopes"
date: "28/10/2025"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_collapsed: true
    code_download: true
    number_sections: true
  pdf_document:
    toc: true
    toc_depth: '3'
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# The mcyle dataset from MASS 

A data frame giving a series of measurements of head acceleration 
in a simulated motorcycle accident, used to test crash helmets:

* $x$: time in miliseconds after impact 

* $y$: head accelaration (in g) 

*Source:* Silverman (1985) Some aspects of the spline smoothing approach to non-parametric curve tting. JRSS-B, 47, 1-52.

```{r fig.align='center', fig.width=8, fig.height=6}
library(MASS) 
n = nrow(mcycle) 
x = mcycle$times 
y = mcycle$accel 
x = x/max(x) 
y = y/max(y) 
x =-1+2*x 
xt = x 
yt = y 
xt = x[1:132] 
yt = y[1:132] 
n = length(xt) 
x = (xt-mean(xt))/sqrt(var(xt)) 
y = (yt-mean(yt))/sqrt(var(yt))

par(mfrow=c(1,1))
plot(x,y,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)")
```

# Gaussian polynomial regression

```{r fig.align='center', fig.width=15, fig.height=8}
graus <- c(1,2,3,4,5,11,12,13,14,15)
par(mfrow=c(2,5))
for (d in 1:length(graus)){
   fit = lm(y ~ poly(x, graus[d]))
   r2adj = summary(fit)$adj.r.squared
   plot(x,y,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)")
   lines(x, fit$fit, col = 2)
   title(paste("d = ", graus[d] , sep=""))
   legend("bottomright", legend = c(paste("R2adj = ",round(100*r2adj,1),"%", sep = "")),bty="n")
}
```

## Adjusted $R^2$ and BIC

```{r fig.align='center', fig.width=10, fig.height=5}
dmax = 15
ds    = 1:dmax
N     = length(ds)
r2adj = rep(0,N)
bic   = rep(0,N)
l     = 0


for (d in ds){
  fit      <- lm(y ~ poly(x, d))
  sig2hat  <- summary(fit)$sigma^2
  RSS      <- sum(fit$residuals^2)
  r2adj[d] <- summary(fit)$adj.r.squared
  bic[d]   <- 1/n*(RSS + log(n)*(d + 1)*sig2hat)
}

par(mfrow=c(1,2))
plot(ds, r2adj,xlab = "Order of polynomial", ylab = "Adjusted R2", type = "b", 
     lwd = 2, main = "In-sample Adjusted R2")
points(ds[r2adj == max(r2adj)], max(r2adj), lty = 2, pch = 20, col = 2)

plot(ds, bic, xlab = "Order of polynomial", ylab = "BIC", type="b", 
     main = "In-sample BIC", lwd = 2)
points(ds[bic == min(bic)], min(bic), lty = 2, pch = 20, col =2)
```

# Cross-validation: training and testing 

```{r fig.align='center', fig.width=12, fig.height=4}
set.seed(2718282)
perc = c(0.1,0.15,0.2)
par(mfrow=c(1,3))
for (j in 1:3){
train = sample(1:length(x),length(x)*(1-perc[j]))
test  = (-train)
ytr   = y[train]
xtr   = x[train]
yte   = y[test]
xte   = x[test]
N     = length(ds)
RMSE  = rep(0,N)
xxtr  = NULL
xxte  = NULL

for (d in ds){
  xxtr  = cbind(xxtr,xtr^d)
  xxte  = cbind(xxte,xte^d)
  fit = lm(ytr~xxtr)
  ehat = yte - cbind(1,xxte)%*%fit$coef
  RMSE[d] = sqrt(mean(ehat^2))
}

plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b")
abline(v = ds[RMSE==min(RMSE)], lty = 2)
title(paste("Cross validation (",1-perc[j],",",perc[j],")",sep=""))
}
```

# Cross-validation: training and testing - 200 replications
```{r fig.align='center', fig.width=15, fig.height=10}
set.seed(56789)
perc = c(0.1,0.15,0.2,0.25)
S     = 200
N     = length(ds)
RMSE  = matrix(0,S,N)

par(mfrow=c(2,2))
for (j in 1:4){
freq  = rep(0,N)
for (s in 1:S){
  train = sample(1:length(x),length(x)*(1-perc[j]))
  test  = (-train)
  ytr   = y[train]
  xtr   = x[train]
  yte   = y[test]
  xte   = x[test]
  l     = 0
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    l     = l + 1
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    RMSE[s,l] = sqrt(mean(ehat^2))
  }
  freq[ds[RMSE[s,]==min(RMSE[s,],na.rm=TRUE)][1]] = freq[ds[RMSE[s,]==min(RMSE[s,],na.rm=TRUE)][1]] + 1
}
boxplot(RMSE,outline=FALSE,xlab="Order of polynomial", ylab = "Root MSE",ylim=c(0,2))  
for (i in 1:N)
  text(i,0,paste(round(100*freq[i]/S,1),"%",sep=""),cex=0.75)
title(paste("Cross validation\n Training = ",100*(1-perc[j]),"% - ",S," replications",sep=""))
}
```

# Cross-validation: leave-one-out (LOO)

```{r fig.align='center', fig.width=8, fig.height=6}
set.seed(2718282)
RMSE  = rep(0,N)
for (i in 1:n){
  test = i
  ytr = y[-i]
  xtr = x[-i]
  N     = length(ds)
  xxtr  = NULL
  xxte = NULL
  for (d in ds){
    xxtr   = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,x[i]^d)
    fit      = lm(ytr~xxtr)
    RMSE[d] = RMSE[d] + (y[i] - cbind(1,xxte)%*%fit$coef)^2
  }
}
par(mfrow=c(1,1))
plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b",main="Leave-one-out (LOO)")
abline(v = ds[RMSE==min(RMSE)], lty = 2)
```

# 10-fold cross-validation
```{r fig.align='center', fig.width=8, fig.height=6}
ind = sample(1:n,size=n,replace=FALSE)
x1 = x[ind]
y1 = y[ind]
fold = 10
L = trunc(n/fold)
left = seq(1,n,by=L)[1:fold]
right = seq(L,n,by=L)
right[fold]=n
MSE = rep(0,dmax)
for (f in 1:fold){
  test = left[f]:right[f]
  ytr = y1[-test]
  xtr = x1[-test]
  yte = y1[test]
  xte = x1[test]
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    MSE[d] = MSE[d] + sum(ehat^2)
  }
}
RMSE = MSE/n

par(mfrow=c(1,1))
plot(ds, RMSE, xlab = "Order of polynomial", ylab = "Root MSE",type="b",main="")
title(paste(fold,"-fold cross validation",sep=""))
abline(v = ds[RMSE==min(RMSE)], lty = 2)
```

# 10-fold cross-validation - 200 replications

```{r fig.align='center', fig.width=8, fig.height=6}
fold = 10
S = 200
RMSE = matrix(0,S,dmax)
order = rep(0,S)
for (s in 1:S){
ind = sample(1:n,size=n,replace=FALSE)
x1 = x[ind]
y1 = y[ind]
L = trunc(n/fold)
left = seq(1,n,by=L)[1:fold]
right = seq(L,n,by=L)
right[fold]=n
MSE = rep(0,dmax)
for (f in 1:fold){
  test = left[f]:right[f]
  ytr = y1[-test]
  xtr = x1[-test]
  yte = y1[test]
  xte = x1[test]
  xxtr  = NULL
  xxte  = NULL
  for (d in ds){
    xxtr  = cbind(xxtr,xtr^d)
    xxte  = cbind(xxte,xte^d)
    fit = lm(ytr~xxtr)
    ehat = yte - cbind(1,xxte)%*%fit$coef
    MSE[d] = MSE[d] + sum(ehat^2)
  }
}
RMSE[s,] = MSE/n
order[s] = ds[MSE==min(MSE)]
}

freq = rep(0,dmax)
for (s in 1:S){
  freq[order[s]]=  freq[order[s]]+1
}
freq = 100*freq/S

par(mfrow=c(1,1))
boxplot(RMSE,outline=FALSE,xlab = "Order of polynomial", ylab = "Root MSE",ylim=c(0.1,1))
title(paste(fold,"-fold cross validation\n",S," replications",sep=""))
for (d in 1:dmax)
  text(d,0.1,paste(freq[d],"%",sep=""),cex=0.75)
```

# Bootstrap for the best model

```{r fig.align='center', fig.width=8, fig.height=6}
dbest = ds[freq==max(freq)]
B = 1000
fit = matrix(0,B,n)
plot(x,y,pch=16,xlab="Time after impact (standardized)",ylab="Head accelaration (standardized)",ylim=c(-4,4))
title(paste("Bootstraping the best model\n B=",B," - best model = ",dbest,sep=""))
for (b in 1:B){
  ind = sample(1:n,size=n,replace=TRUE)
  xx = x[ind]
  yy = y[ind]
  X = NULL
  for (d in 1:dbest)
    X  = cbind(X,xx^d)
  fit[b,] = lm(yy~X)$fit
  ord = order(x[ind])
  lines(xx[ord],fit[b,ord],col=grey(0.8))
}
points(x,y,pch=16)
```
