# Nonlinear mean function psi = function(x){45*tanh(x/1.9-7)+57} # Simulating some data set.seed(12345) sigma = 4 n = 30 x = sort(runif(n,8,18)) y = rnorm(n,psi(x),sigma) plot(x,y,pch=16,cex=1.5,xlab="x",ylab="y",las=1) curve(psi,lwd=4,col="magenta",add=TRUE) title(paste("n=",n," - sigma=",sigma,sep="")) # Four regression functions dfs = c(2,7,0.8*n) ndf = length(dfs) for (i in 1:ndf){ df = dfs[i] psi_hat=smooth.spline(x,y,df=df, all.knots = TRUE) lines(psi_hat, lwd = 3, col=1+i) } legend("topleft",legend=c(expression(psi),dfs),col=c("magenta",2:(ndf+1)),lty=1,bty="n",lwd=3) # Simulação do perde-ganha viés-variância (1) bias2_variance = function(x, psi, sigma, n, df, N) { x_tr = matrix(runif(N*n, 8, 18), nrow = N) epsilon = matrix(rnorm(N*n, sd = sigma), nrow = N) y_tr = t(apply(x_tr, 1, psi)) + epsilon prediction = numeric(N) for (j in 1:N) { psi_hat=smooth.spline(x_tr[j,], y_tr[j,], df = df, all.knots = TRUE)$fit prediction[j] = predict(psi_hat, x)$y } list(bias2 = (mean(prediction) - psi(x))^2, variance = var(prediction)) } set.seed(1234) par(mfrow=c(3,3)) for (n in c(30,40,50)){ for (sigma in c(2,4,6)){ N = 1000 dfs = 4:20 ndf = length(dfs) bias2 = numeric(ndf) variance = numeric(ndf) test_error = numeric(ndf) x = 12 for (i in 1:ndf) { df = dfs[i] trade_off = bias2_variance(x, psi, sigma, n, df, N) bias2[i] = trade_off$bias2 variance[i] = trade_off$variance test_error[i] = trade_off$bias2 + trade_off$variance + sigma^2 } plot(dfs, bias2, ylim = c(0, max(test_error)), type = "l", lwd = 3,col = "blue", las = 1, xlab = "Spline's df", ylab = "") lines(dfs, variance, type = "l", lwd = 3, col = "red") lines(dfs, test_error, type = "l", lwd = 3, col = "dark green") abline(h = sigma^2, lty = 2) title(paste("n=",n," - sigma=",sigma,"\nOptimal df=", dfs[test_error==min(test_error)],sep="")) if (i==1){ legend("topright", c("Erro de predição", "Viés", "Variância", "Erro irredutível"), lty = c(1, 1, 1, 2), lwd = c(3, 3, 3, 1), col = c("dark green", "blue", "red", "black"), cex = 1.5) } }}