########################################################################################## # # Predicting human development index (hdi) for Brazilian municipalities via # # OLS: Linear regression via ordinary least squares # Horseshoe: Linear regression with regularizing horseshoe prior # Normal-Gamma: Linear regression with regularizing normal-gamma prior # CART: Classification and regression tree # RF: Random forest # BOOST: Boosting trees # BART: Bayesian additive regression trees # # # Copyright by Hedibert Freitas Lopes, PhD # Date: June 19th 2026 # ########################################################################################## ########################################################################################## # # DADOS MUNICIPAIS BRASILEIROS (ipeadata.gov.br) # ########################################################################################## # # 1. Estado # 2. Codigo # 3. Municipio # 4. Renda domiciliar per capita - media do 1o quintil (R$ por mes) # 5. Renda domiciliar per capita - media do 2o quintil (R$ por mes) # 6. Renda domiciliar per capita - media do 3o quintil (R$ por mes) # 7. Renda domiciliar per capita - media do 4o quintil (R$ por mes) # 8. Renda domiciliar per capita - media do 5o quintil (R$ por mes) # 9. Razao entre a renda dos 10% mais ricos e 40% mais pobres # 10. Mulheres chefes de familia sem conjuge e com filhos menores de 15 anos (%) # 11. Medicos residentes (por mil habitantes) # 12. Enfermeiros residentes com curso superior (%) # 13. Alfabetizados - pessoas 15 anos e mais (%) # 14. Indice de Desenvolvimento Humano # 15. Indice de Desenvolvimento Humano - renda # 16. Indice de Desenvolvimento Humano - longevidade # 17. Indice de Desenvolvimento Humano - educacao # 18. Mortalidade ate cinco anos de idade (por mil nascidos vivos) # 19. Probabilidade de sobrevivencia ate 40 anos (%) # 20. Probabilidade de sobrevivencia ate 60 anos (%) # 21. Taxa de fecundidade (%) # 22. Pessoas 65 anos ou mais - morando sozinhas (%) # 23. Pessoas 10 e 14 anos - mulheres com filhos (%) # 24. Professores do fundamental residentes com curso superior (%) # 25. Esperanca de vida ao nascer # 26. Mortalidade infantil (por mil nascidos vivos) # 27. Domicilios - com agua encanada - pessoas (%) # 28. Domicilios - com servico de coleta de lixo - pessoas (%) # ########################################################################################## #install.packages("MASS") #install.packages("monomvn") #install.packages("tree") #install.packages("randomForest") #install.packages("gbm") #install.packages("BART") library("MASS") library("monomvn") library("tree") library("randomForest") library("gbm") library("BART") # IBGE dataset data = read.csv("http://hedibert.org/wp-content/uploads/2014/04/dadosmunicipais-ipea.csv", header=TRUE) attach(data) n = nrow(data) norte <- c("AC", "AM", "AP", "PA", "RO", "RR", "TO") nordeste <- c("AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE") centrooeste <- c("DF", "GO", "MS", "MT") sudeste <- c("ES", "MG", "RJ", "SP") sul <- c("PR", "RS", "SC") D.N <- as.integer(data[, 1] %in% norte) D.NE <- as.integer(data[, 1] %in% nordeste) D.CO <- as.integer(data[, 1] %in% centrooeste) D.SE <- as.integer(data[, 1] %in% sudeste) D.S <- as.integer(data[, 1] %in% sul) regioes = c("Norte","Nordeste","Centro-Oeste","Sudeste","Sul") regiao = rep(1,n) regiao[D.NE==1]=2 regiao[D.CO==1]=3 regiao[D.SE==1]=4 regiao[D.S==1]=5 par(mfrow=c(2,3)) x = rendapc3 x = log10(razao1040) x = log10(medico+0.001) x = log10(enfermeiro+0.045) x = log10(maes10a14+0.001) x = log10(professores+0.001) plot(x,idh) dens = kde2d(x,idh, n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) plot(x[D.N==1],idh[D.N==1],xlim=range(x),ylim=range(idh)) dens = kde2d(x[D.N==1],idh[D.N==1], n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) plot(x[D.NE==1],idh[D.NE==1],xlim=range(x),ylim=range(idh)) dens = kde2d(x[D.NE==1],idh[D.NE==1], n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) plot(x[D.CO==1],idh[D.CO==1],xlim=range(x),ylim=range(idh)) dens = kde2d(x[D.CO==1],idh[D.CO==1], n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) plot(x[D.SE==1],idh[D.SE==1],xlim=range(x),ylim=range(idh)) dens = kde2d(x[D.SE==1],idh[D.SE==1], n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) plot(x[D.S==1],idh[D.S==1],xlim=range(x),ylim=range(idh)) dens = kde2d(x[D.S==1],idh[D.S==1], n = 200) contour(dens, add = TRUE, col = "red", lwd = 1.5) y = scale(data[,14]) X = scale(as.matrix(data[,c(6,10:13,18:22,24,25,27,28)])) X = scale(as.matrix(data[,c(4:13,18:28)])) Z = cbind(D.SE,D.S,D.CO,D.NE,D.N) tab = round(rbind(summary(y[regiao==1]), summary(y[regiao==2]), summary(y[regiao==3]), summary(y[regiao==4]), summary(y[regiao==5]), summary(y[regiao>0])),3) rownames(tab) = c(regioes,"Todas") tab idhdata = data.frame(y,X,Z) ols = lm(y~.-1,data=idhdata) summary(ols) tree = tree(y~.-1,data=idhdata) summary(tree) tree par(mfrow=c(1,1)) plot(tree,type="uniform") text(tree,pretty=0) # Computing Mean square and absolute errors for various methods # ------------------------------------------------------------- msemae = matrix(0,7,2) # OLS ols = lm(y~.-1,data=idhdata) pred.ols = ols$fit err = y-pred.ols msemae[1,] = c(mean(err^2),mean(abs(err))) # Blasso horse shoe hs = blasso(X,y,T=10000,thin=1,RJ=FALSE,case="hs",icept=FALSE,verb=0) pred.hs = X%*%apply(hs$beta,2,mean) err = y-pred.hs msemae[2,] = c(mean(err^2),mean(abs(err))) # Blasso normal-gamma ng = blasso(X,y,T=10000,thin=1,RJ=FALSE,case="ng",icept=FALSE,verb=0) pred.ng = X%*%apply(ng$beta,2,mean) err = y-pred.ng msemae[3,] = c(mean(err^2),mean(abs(err))) # CART tree.fit = tree(y~.-1,data=idhdata) summary(tree.fit) tree.fit plot(tree.fit,type="uniform") text(tree.fit,pretty=0) pred.cart = predict(tree.fit,data= idhdata) err = y-pred.cart msemae[4,] = c(mean(err^2),mean(abs(err))) # Random forest rf = randomForest(y~.-1,data= idhdata,mtry=7,importance=TRUE) pred.rf = rf$predicted err = y-pred.rf msemae[5,] = c(mean(err^2),mean(abs(err))) # Boosting boost = gbm(y~.-1, data=idhdata, distribution="gaussian",n.trees=5000, interaction.depth=4,shrinkage=0.01, cv.folds=5) best = gbm.perf(boost, method="cv", plot.it=FALSE) # optimal # of trees pred.boost = predict(boost, newdata=idhdata, n.trees=best) err = y - pred.boost msemae[6,] = c(mean(err^2), mean(abs(err))) # BART for classification bart = wbart(X,y) pred.bart = apply(bart$yhat.train,2,mean) err = y-pred.bart msemae[7,] = c(mean(err^2),mean(abs(err))) rownames(msemae) = c("OLS","Horseshoe","Normal-Gamma","CART","RF","BOOST","BART") colnames(msemae) = c("MSE","MAE") round(msemae,4) tab = cor(cbind(y,pred.ols,pred.hs,pred.ng,pred.cart,pred.rf,pred.boost,pred.bart)) rownames(tab) = c("Y","OLS","Horseshoe","Normal-Gamma","CART","RF","BOOST","BART") colnames(tab) = c("Y","OLS","Horseshoe","Normal-Gamma","CART","RF","BOOST","BART") round(tab,3) # Replication of out-of-sample MSE and MAE # ---------------------------------------- set.seed(1243253) Rep = 20 n = nrow(X) gsize = trunc(n/3) n2 = n-gsize mse = array(0,c(Rep,7)) mae = array(0,c(Rep,7)) for (rep in 1:Rep){ ind = sample(1:n) y1 = y[ind] X1 = X[ind,] test = sort(sample(1:n,size=gsize,replace=FALSE)) train = setdiff(1:n,test) ytrain = y1[train] Xtrain = X1[train,] ytest = y1[test] Xtest = X1[test,] traindata = data.frame(ytrain,Xtrain) testdata = data.frame(ytest,Xtest) ols = lm(ytrain~.-1,data=traindata) fit = Xtest%*%ols$coef err = ytest-fit mse[rep,1] = mean(err^2) mae[rep,1] = mean(abs(err)) hs = blasso(Xtrain,ytrain,T=10000,thin=1,RJ=FALSE,case="hs",icept=FALSE,verb=0) fit = Xtest%*%apply(hs$beta,2,mean) err = ytest-fit mse[rep,2] = mean(err^2) mae[rep,2] = mean(abs(err)) ng = blasso(Xtrain,ytrain,T=10000,thin=1,RJ=FALSE,case="ng",icept=FALSE,verb=0) fit = Xtest%*%apply(ng$beta,2,mean) err = ytest-fit mse[rep,3] = mean(err^2) mae[rep,3] = mean(abs(err)) tree = tree(ytrain~.,data=traindata) fit = predict(tree,newdata=testdata) err = ytest-fit mse[rep,4] = mean(err^2) mae[rep,4] = mean(abs(err)) rf = randomForest(ytrain~.,data=traindata,mtry=7,importance=TRUE) fit = predict(rf,newdata=testdata) err = ytest-fit mse[rep,5] = mean(err^2) mae[rep,5] = mean(abs(err)) boost = gbm(ytrain~.-1, data=traindata, distribution="gaussian",n.trees=5000, interaction.depth=4,shrinkage=0.01, cv.folds=5) best = gbm.perf(boost, method="cv", plot.it=FALSE) # optimal # of trees fit = predict(boost, newdata=testdata, n.trees=best) err = ytest-fit mse[rep,6] = mean(err^2) mae[rep,6] = mean(abs(err)) bart = wbart(x.train=Xtrain,y.train=ytrain,x.test=Xtest) fit = apply(bart$yhat.test,2,mean) err = ytest-fit mse[rep,7] = mean(err^2) mae[rep,7] = mean(abs(err)) } method = c("OLS","HS","NG","CART","RF","BOOST","BART") par(mfrow=c(1,2)) boxplot(sqrt(mse),names=method,xlab="Root mean square error",horizontal=TRUE) boxplot(mae,names=method,xlab="Mean absolute error",horizontal=TRUE) par(mfrow=c(1,2)) boxplot(mse[,c(1:3,5:7)],names=method[c(1:3,5:7)],xlab="Root mean square error",horizontal=TRUE) boxplot(mae[,c(1:3,5:7)],names=method[c(1:3,5:7)],xlab="Mean absolute error",horizontal=TRUE) par(mfrow=c(1,2)) boxplot(100*(mse[,c(1:3,5,7)]/mse[,6]-1),names=c("OLS","HS","NG","RF","BART"),xlab="Relative root mean square error (percentage)",ylab="Statistical method",horizontal=TRUE) boxplot(100*(mae[,c(1:3,5,7)]/mae[,6]-1),names=c("OLS","HS","NG","RF","BART"),xlab="Relative mean absolute error (percentage)",ylab="Statistical method",horizontal=TRUE)