##########################################################################################
#
# Machine learning algorithms and Bayesian statistical learning
# for predicting the human development index (HDI) of Brazilian municipalities.
#
# Copyright by Hedibert Freitas Lopes, PhD
# Date: March 19th, 2019.
#
##########################################################################################
##########################################################################################
#
# BRAZILIAN MUNICIPAL DATA (ipeadata.gov.br)
#
##########################################################################################
#
#  1. State
#  2. Municipality code
#  3. Municipality
#  4. Per capita household income - mean of the 1st quintile (R$ per month)
#  5. Per capita household income - mean of the 2nd quintile (R$ per month)
#  6. Per capita household income - mean of the 3rd quintile (R$ per month)
#  7. Per capita household income - mean of the 4th quintile (R$ per month)
#  8. Per capita household income - mean of the 5th quintile (R$ per month)
#  9. Ratio of the income of the richest 10% to that of the poorest 40%
# 10. Female heads of household without a spouse and with children under 15 (%)
# 11. Resident physicians (per thousand inhabitants)
# 12. Resident nurses with a college degree (%)
# 13. Literate - persons aged 15 and over (%)
# 14. Human Development Index (HDI)
# 15. Human Development Index - income
# 16. Human Development Index - longevity
# 17. Human Development Index - education
# 18. Under-five mortality (per thousand live births)
# 19. Probability of survival to age 40 (%)
# 20. Probability of survival to age 60 (%)
# 21. Fertility rate (%)
# 22. Persons aged 65 and over living alone (%)
# 23. Females aged 10 to 14 with children (%)
# 24. Resident elementary school teachers with a college degree (%)
# 25. Life expectancy at birth
# 26. Infant mortality (per thousand live births)
# 27. Households - with piped water - persons (%)
# 28. Households - with garbage collection service - persons (%)
#
##########################################################################################
# IPEA municipal dataset
data = read.csv("http://hedibert.org/wp-content/uploads/2014/04/dadosmunicipais-ipea.csv",header=TRUE)
# The code below refers to columns such as sigla, idh, rendapc4 and alfabetizado directly;
# attaching the raw data frame makes them visible (assumes these are the CSV column names).
attach(data)

# Response (idh) plus 21 candidate predictors
dados = data[,c(14,4:13,18:28)]
n  = nrow(dados)
p  = ncol(dados)-1
ps = 1:p

# Standardizing the response and all predictors
for (i in 1:(p+1)){
  m = mean(dados[,i])
  d = sd(dados[,i])
  dados[,i] = (dados[,i]-m)/d
}

# OLS REGRESSION
reg1 = lm(idh~.,data=dados)
rmse.reg1 = sqrt(mean((dados[,1]-reg1$fit)^2))

pdf(file="pca-graph1.pdf",width=8,height=6)
plot(reg1$coef[2:(p+1)],axes=FALSE,xlab="",ylab="OLS coefficients")
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
abline(h=0,lty=2)
title(paste("RMSE=",round(rmse.reg1,3),sep=""))
dev.off()

# TOY EXAMPLE
S = var(dados[,2:6])
round(S,3)
# Since the data were standardized, S is the correlation matrix of rendapc1-rendapc5:
#          rendapc1 rendapc2 rendapc3 rendapc4 rendapc5
# rendapc1    1.000    0.969    0.937    0.892    0.755
# rendapc2    0.969    1.000    0.987    0.957    0.821
# rendapc3    0.937    0.987    1.000    0.986    0.854
# rendapc4    0.892    0.957    0.986    1.000    0.888
# rendapc5    0.755    0.821    0.854    0.888    1.000

svd(S)
# $d
# [1] 4.62446396 0.28228144 0.07769785 0.01265673 0.00290002
#
# $u
#            [,1]       [,2]         [,3]        [,4]        [,5]
# [1,] -0.4411378  0.4941891  0.608777172 -0.42710055  0.09027785
# [2,] -0.4586014  0.2724745 -0.007324998  0.66916141 -0.51731208
# [3,] -0.4613133  0.1018765 -0.368647838  0.23092054  0.76654138
# [4,] -0.4571075 -0.1142474 -0.575268235 -0.55792360 -0.36849418
# [5,] -0.4163232 -0.8112358  0.403115279  0.07214636  0.02940206
#
# $v is identical to $u because S is symmetric.

# PRINCIPAL COMPONENT ANALYSIS
pca = princomp(dados[,2:(p+1)])

pdf(file="pca-graph2.pdf",width=10,height=6)
par(mfrow=c(1,2))
plot(pca$sdev,ylab="Standard deviation",xlab="Principal component")
plot(pca$sdev^2/sum(pca$sdev^2),xlab="Principal component",ylab="Proportion of variance explained")
dev.off()

pdf(file="pca-graph3.pdf",width=8,height=6)
par(mfrow=c(1,1))
plot(pca$loadings[,1],axes=FALSE,xlab="",ylab="Loadings")
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
abline(h=0,lty=2)
dev.off()

# Regression of HDI on the first principal component
reg = lm(dados[,1]~pca$scores[,1])
rmse.reg = sqrt(mean((dados[,1]-reg$fit)^2))

pdf(file="pca-graph4.pdf",width=10,height=6)
par(mfrow=c(1,2))
plot(cor(dados[,1],pca$scores)[1,],xlab="Principal component",ylab="Correlation with HDI",pch=16)
abline(h=0,lty=2)
plot(pca$scores[,1],dados[,1],xlab="1st principal component",ylab="HDI")
abline(reg$coef,col=2)
dev.off()

pdf(file="pca-graph5.pdf",width=10,height=6)
par(mfrow=c(1,2))
plot(reg1$coef[2:(p+1)],axes=FALSE,xlab="",ylab="OLS coefficients")
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
abline(h=0,lty=2)
title(paste("RMSE=",round(rmse.reg1,3),sep=""))
plot(pca$loadings[,1]*reg$coef[2],axes=FALSE,xlab="",ylab="Loadings",ylim=range(reg1$coef[2:(p+1)]))
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
abline(h=0,lty=2)
title(paste("RMSE=",round(rmse.reg,3),sep=""))
dev.off()

# In-sample RMSE as a function of the number of principal components used as regressors
rmse.reg2 = rep(0,p)
for (i in 1:p){
  reg2 = lm(dados[,1]~pca$scores[,1:i])
  rmse.reg2[i] = sqrt(mean((dados[,1]-reg2$fit)^2))
}

pdf(file="pca-graph6.pdf",width=8,height=6)
par(mfrow=c(1,1))
plot(rmse.reg2,xlab="Principal components",ylab="RMSE",axes=FALSE)
axis(2);axis(1,at=1:p);abline(h=rmse.reg1,lty=2);box()
dev.off()
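# Sanity check (a minimal sketch, with helper objects eig and eigall introduced only for
# this check): because S is symmetric and positive semi-definite, svd(S) coincides with
# its eigendecomposition, and princomp's variances can be recovered from eigen().
# Note that princomp() uses divisor n while var() uses n-1, hence the (n-1)/n rescaling.
eig = eigen(S)
max(abs(eig$values - svd(S)$d))             # ~0: singular values equal eigenvalues
max(abs(abs(eig$vectors) - abs(svd(S)$u)))  # ~0: same axes, up to sign
eigall = eigen(var(dados[,2:(p+1)])*(n-1)/n)
max(abs(eigall$values - pca$sdev^2))        # ~0: eigenvalues match pca$sdev^2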
# Regression of HDI on the first 15 principal components, mapped back to the predictor scale
reg3 = lm(dados[,1]~pca$scores[,1:15])
reg3coef = pca$loadings[,1:15]%*%reg3$coef[2:16]
rmse.reg3 = sqrt(mean((dados[,1]-reg3$fit)^2))

# Regression on all principal components reproduces the OLS fit
reg4 = lm(dados[,1]~pca$scores)
reg4coef = pca$loadings%*%reg4$coef[2:(p+1)]

pdf(file="pca-graph7.pdf",width=10,height=6)
par(mfrow=c(1,2))
plot(reg1$coef[2:(p+1)],axes=FALSE,xlab="",ylab="coefficients",ylim=range(reg1$coef[2:(p+1)],reg3coef))
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
points(1:p,reg3coef,col=2)
abline(h=0,lty=2)
title("15 principal components")
legend("topleft",legend=c(paste("rmse(ols)=",round(rmse.reg1,3),sep=""),
                          paste("rmse(pca)=",round(rmse.reg3,3),sep="")),col=1:2,pch=16)
plot(reg1$coef[2:(p+1)],axes=FALSE,xlab="",ylab="coefficients",ylim=range(reg1$coef[2:(p+1)],reg4coef))
axis(2);axis(1,at=1:p,lab=names(dados[,2:(p+1)]),las=2,cex.axis=0.8);box()
points(1:p,reg4coef,col=2)
abline(h=0,lty=2)
title("ALL principal components")
dev.off()

# One-factor model, for comparison with the first principal component
factanal(dados[,2:(p+1)],factors=1)

# Region indicator (1=Norte/Nordeste, 2=Centro-Oeste, 3=Sul, 4=Sudeste),
# built from the state abbreviation column sigla
regiao = rep(1,n)
regiao[(sigla=="MT")|(sigla=="MS")|(sigla=="GO")]=2
regiao[(sigla=="RS")|(sigla=="PR")|(sigla=="SC")]=3
regiao[(sigla=="RJ")|(sigla=="SP")|(sigla=="ES")|(sigla=="MG")]=4

pdf(file="eda1.pdf",width=15,height=10)
regiao.names = c("Norte/Nordeste","Centro-Oeste","Sul","Sudeste")
par(mfrow=c(2,4))
for (i in 1:4)
  plot(rendapc4[regiao==i],idh[regiao==i],xlab="rendapc4",ylab="idh",xlim=range(rendapc4),ylim=range(idh),main=regiao.names[i])
for (i in 1:4)
  plot(alfabetizado[regiao==i],idh[regiao==i],xlab="alfabetizado",ylab="idh",xlim=range(alfabetizado),ylim=range(idh),main=regiao.names[i])
dev.off()

# Region dummies (named regiao1-regiao3 so they do not clobber the lm objects reg1 and reg3)
regiao1 = rep(0,n);regiao1[regiao==1]=1
regiao2 = rep(0,n);regiao2[regiao==2]=1
regiao3 = rep(0,n);regiao3[regiao==3]=1
dados = cbind(dados,regiao1,regiao2,regiao3)
n  = nrow(dados)
p  = ncol(dados)-1
ps = 1:p

# Standardizing the data (first 22 columns only; the 0/1 region dummies are left as is)
for (i in 1:22){
  m = mean(dados[,i])
  d = sd(dados[,i])
  dados[,i] = (dados[,i]-m)/d
}

# Correlation matrix
corr = round(cor(dados),2)

pdf(file="eda2.pdf",width=9,height=6)
plot(0,1,xlim=c(0,p+2),ylim=c(0,p+2),col=0,axes=FALSE,xlab="",ylab="")
for (i in 1:(p+1))
  for (j in 1:(p+1))
    points(i,p+2-j,cex=1.5*abs(corr[i,j]),pch=16)
axis(1,at=1:(p+1),lab=names(dados),las=2,cex.axis=0.6)
axis(2,at=(p+1):1,lab=names(dados),las=1,cex.axis=0.6)
dev.off()

# Training and testing samples
set.seed(31415)
train  = sample(1:n,size=trunc(n/2),replace=FALSE)
ytrain = dados[train,1]
ytest  = dados[-train,1]
Xtrain = as.matrix(cbind(1,dados[train,2:(p+1)]))
Xtest  = as.matrix(cbind(1,dados[-train,2:(p+1)]))

# ORDINARY LEAST SQUARES (OLS)
ols = lm(idh~.,data=dados[train,])
summary(ols)
yhat.ols.in  = ols$fit
yhat.ols.out = Xtest%*%ols$coef
rmse.ols.in  = sqrt(mean((ytrain-yhat.ols.in)^2))
rmse.ols.out = sqrt(mean((ytest-yhat.ols.out)^2))
c(rmse.ols.in,rmse.ols.out)

# OLS SUBSET REGRESSION (best subset by BIC)
install.packages("leaps")
library("leaps")
regs = regsubsets(idh~.,data=dados[train,],nvmax=p)
regs.summary = summary(regs)
phat = ps[regs.summary$bic==min(regs.summary$bic)]
plot(regs.summary$bic,pch=16,xlab="Number of predictors",ylab="BIC")
points(phat,regs.summary$bic[phat],pch=16,col=2,cex=2)
coef(regs,phat)
yhat.regs.in  = Xtrain[,regs.summary$which[phat,]]%*%coef(regs,phat)
yhat.regs.out = Xtest[,regs.summary$which[phat,]]%*%coef(regs,phat)
rmse.regs.in  = sqrt(mean((ytrain-yhat.regs.in)^2))
rmse.regs.out = sqrt(mean((ytest-yhat.regs.out)^2))
c(rmse.regs.in,rmse.regs.out)
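# Before turning to glmnet, a closed-form ridge fit for intuition (a minimal sketch;
# ridge.solve is a helper written only for this illustration). glmnet scales its penalty
# differently (its loss divides the RSS by n) and standardizes predictors internally by
# default, so coefficients need not match this formula for the same nominal lambda.
ridge.solve = function(X,y,lambda){
  # X: predictor matrix without the intercept column; the intercept is not penalized
  Xc = scale(X,center=TRUE,scale=FALSE)
  yc = y - mean(y)
  beta = solve(t(Xc)%*%Xc + lambda*diag(ncol(Xc)), t(Xc)%*%yc)
  c(mean(y) - colMeans(X)%*%beta, beta)   # intercept, then slopes
}
round(ridge.solve(Xtrain[,2:(p+1)],ytrain,lambda=1),4)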
install.packages("glmnet")
library(glmnet)

### RIDGE REGRESSION
grid = 10^seq(10,-2,length=100)
ridge.mod = glmnet(Xtrain[,2:(p+1)],ytrain,alpha=0,lambda=grid)
plot(log(1/grid),coef(ridge.mod)[2,],type="l",ylim=range(coef(ridge.mod)[2:(p+1),]),xlab="log(1/lambda)",ylab="Coefficients")
for (i in 3:(p+1))
  lines(log(1/grid),coef(ridge.mod)[i,],col=i)

# Choosing lambda by cross-validation
cv.out = cv.glmnet(Xtrain[,2:(p+1)],ytrain,alpha=0)
plot(cv.out)
bestlam = cv.out$lambda.min
yhat.ridge.in  = predict(ridge.mod,s=bestlam,newx=Xtrain[,2:(p+1)])
yhat.ridge.out = predict(ridge.mod,s=bestlam,newx=Xtest[,2:(p+1)])
rmse.ridge.in  = sqrt(mean((yhat.ridge.in-ytrain)^2))
rmse.ridge.out = sqrt(mean((yhat.ridge.out-ytest)^2))
c(rmse.ridge.in,rmse.ridge.out)

out = glmnet(Xtrain[,2:(p+1)],ytrain,alpha=0)
ridge.coef = predict(out,type="coefficients",s=bestlam)
cbind(ols$coef,ridge.coef)

### LASSO REGRESSION
grid = 10^seq(10,-2,length=100)
lasso.mod = glmnet(Xtrain[,2:(p+1)],ytrain,alpha=1,lambda=grid)
plot(log(1/grid),coef(lasso.mod)[2,],type="l",ylim=range(coef(lasso.mod)[2:(p+1),]),xlab="log(1/lambda)",ylab="Coefficients")
for (i in 3:(p+1))
  lines(log(1/grid),coef(lasso.mod)[i,],col=i)

cv.out = cv.glmnet(Xtrain[,2:(p+1)],ytrain,alpha=1)
plot(cv.out)
bestlam = cv.out$lambda.min
yhat.lasso.in  = predict(lasso.mod,s=bestlam,newx=Xtrain[,2:(p+1)])
yhat.lasso.out = predict(lasso.mod,s=bestlam,newx=Xtest[,2:(p+1)])
rmse.lasso.in  = sqrt(mean((yhat.lasso.in-ytrain)^2))
rmse.lasso.out = sqrt(mean((yhat.lasso.out-ytest)^2))
c(rmse.lasso.in,rmse.lasso.out)

out = glmnet(Xtrain[,2:(p+1)],ytrain,alpha=1)
lasso.coef = predict(out,type="coefficients",s=bestlam)
coefs = round(cbind(ols$coef,ridge.coef,lasso.coef),4)
colnames(coefs) = c("ols","ridge","lasso")
coefs

# REGRESSION TREE
install.packages("tree")
library(tree)
tree.fit  = tree(idh~.,mindev=.0001,data=dados[train,])
cvtree    = cv.tree(tree.fit)
prunetree = prune.tree(tree.fit,best=8)
yhat.tree.in  = predict(prunetree,newdata=dados[train,])
yhat.tree.out = predict(prunetree,newdata=dados[-train,])
rmse.tree.in  = sqrt(mean((yhat.tree.in-ytrain)^2))
rmse.tree.out = sqrt(mean((yhat.tree.out-ytest)^2))
c(rmse.tree.in,rmse.tree.out)

pdf("tree1.pdf",width=30,height=15)
plot(tree.fit,type="u")
text(tree.fit,pretty=0,cex=0.25)
dev.off()

pdf("tree2.pdf",width=10,height=7)
plot(prunetree,type="u")
text(prunetree,pretty=0)
dev.off()

# BAGGING TREES
install.packages("randomForest")
library(randomForest)
set.seed(31415)
bagtree = randomForest(Xtrain[,2:(p+1)],ytrain,mtry=p)   # mtry=p: all predictors tried at each split (bagging)
yhat.bag.in  = predict(bagtree,newdata=dados[train,2:(p+1)])
yhat.bag.out = predict(bagtree,newdata=dados[-train,2:(p+1)])
rmse.bag.in  = sqrt(mean((yhat.bag.in-ytrain)^2))
rmse.bag.out = sqrt(mean((yhat.bag.out-ytest)^2))
c(rmse.bag.in,rmse.bag.out)

# RANDOM FORESTS
set.seed(31415)
rftree = randomForest(Xtrain[,2:(p+1)],ytrain,mtry=5)   # mtry=5: five randomly chosen predictors per split
yhat.rf.in  = predict(rftree,newdata=dados[train,2:(p+1)])
yhat.rf.out = predict(rftree,newdata=dados[-train,2:(p+1)])
rmse.rf.in  = sqrt(mean((yhat.rf.in-ytrain)^2))
rmse.rf.out = sqrt(mean((yhat.rf.out-ytest)^2))
c(rmse.rf.in,rmse.rf.out)

pdf("tree3.pdf",width=10,height=7)
par(mfrow=c(1,2))
varImpPlot(bagtree,main="Bagging")
varImpPlot(rftree,main="Random forest")
dev.off()

# BOOSTING
install.packages("gbm")
library(gbm)
boost = gbm(idh~.,data=dados[train,],distribution="gaussian",n.trees=5000,interaction.depth=6)
yhat.boost.in  = predict(boost,newdata=dados[train,],n.trees=5000)
yhat.boost.out = predict(boost,newdata=dados[-train,],n.trees=5000)
rmse.boost.in  = sqrt(mean((yhat.boost.in-ytrain)^2))
rmse.boost.out = sqrt(mean((yhat.boost.out-ytest)^2))
c(rmse.boost.in,rmse.boost.out)
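# Rather than fixing n.trees=5000, gbm can pick the number of trees by cross-validation
# (a minimal sketch; boost.cv and best.iter are helper names, and cv.folds=5 and
# shrinkage=0.01 are arbitrary choices for this illustration):
boost.cv = gbm(idh~.,data=dados[train,],distribution="gaussian",n.trees=5000,
               interaction.depth=6,shrinkage=0.01,cv.folds=5)
best.iter = gbm.perf(boost.cv,method="cv")   # CV-optimal number of trees
yhat.boost.cv = predict(boost.cv,newdata=dados[-train,],n.trees=best.iter)
sqrt(mean((yhat.boost.cv-ytest)^2))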
# BART (Bayesian additive regression trees)
install.packages("BART")
library(BART)
bart.fit = wbart(Xtrain[,2:(p+1)],ytrain,x.test=Xtest[,2:(p+1)])
yhat.bart.in  = bart.fit$yhat.train.mean
yhat.bart.out = bart.fit$yhat.test.mean
rmse.bart.in  = sqrt(mean((yhat.bart.in-ytrain)^2))
rmse.bart.out = sqrt(mean((yhat.bart.out-ytest)^2))
c(rmse.bart.in,rmse.bart.out)

# In-sample and out-of-sample RMSE for all nine models
rmse = rbind(c(rmse.ols.in,rmse.ols.out),
             c(rmse.regs.in,rmse.regs.out),
             c(rmse.ridge.in,rmse.ridge.out),
             c(rmse.lasso.in,rmse.lasso.out),
             c(rmse.tree.in,rmse.tree.out),
             c(rmse.bag.in,rmse.bag.out),
             c(rmse.rf.in,rmse.rf.out),
             c(rmse.boost.in,rmse.boost.out),
             c(rmse.bart.in,rmse.bart.out))
colnames(rmse) = c("RMSE.in","RMSE.out")
rownames(rmse) = c("ols","subset","ridge","lasso","tree","bagging","random forest","boosting","bart")
round(rmse,3)
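# A quick visual comparison of the table above (a minimal sketch; the output file name
# rmse-comparison.pdf is arbitrary):
pdf(file="rmse-comparison.pdf",width=10,height=6)
barplot(t(rmse),beside=TRUE,las=2,ylab="RMSE",legend.text=c("in-sample","out-of-sample"))
abline(h=rmse["ols","RMSE.out"],lty=2)   # OLS out-of-sample RMSE as a benchmark
dev.off()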