########################################################################################## # # Machine learning algorithms and Bayesian statistical learning # for predicting human development index (hdi) for Brazilian municipalities. # # Copyright by Hedibert Freitas Lopes, PhD # Date: March 23rd 2020. # ########################################################################################## ########################################################################################## # # DADOS MUNICIPAIS BRASILEIROS (ipeadata.gov.br) # ########################################################################################## # # 1. Estado # 2. Codigo # 3. Municipio # 4. Renda domiciliar per capita - media do 1o quintil (R$ por mes) # 5. Renda domiciliar per capita - media do 2o quintil (R$ por mes) # 6. Renda domiciliar per capita - media do 3o quintil (R$ por mes) # 7. Renda domiciliar per capita - media do 4o quintil (R$ por mes) # 8. Renda domiciliar per capita - media do 5o quintil (R$ por mes) # 9. Razao entre a renda dos 10% mais ricos e 40% mais pobres # 10. Mulheres chefes de familia sem conjuge e com filhos menores de 15 anos (%) # 11. Medicos residentes (por mil habitantes) # 12. Enfermeiros residentes com curso superior (%) # 13. Alfabetizados - pessoas 15 anos e mais (%) # 14. Indice de Desenvolvimento Humano # 15. Indice de Desenvolvimento Humano - renda # 16. Indice de Desenvolvimento Humano - longevidade # 17. Indice de Desenvolvimento Humano - educacao # 18. Mortalidade ate cinco anos de idade (por mil nascidos vivos) # 19. Probabilidade de sobrevivencia ate 40 anos (%) # 20. Probabilidade de sobrevivencia ate 60 anos (%) # 21. Taxa de fecundidade (%) # 22. Pessoas 65 anos ou mais - morando sozinhas (%) # 23. Pessoas 10 e 14 anos - mulheres com filhos (%) # 24. Professores do fundamental residentes com curso superior (%) # 25. Esperanca de vida ao nascer # 26. Mortalidade infantil (por mil nascidos vivos) # 27. Domicilios - com agua encanada - pessoas (%) # 28. Domicilios - com servico de coleta de lixo - pessoas (%) # ########################################################################################## # IBGE dataset data = read.csv("http://hedibert.org/wp-content/uploads/2014/04/dadosmunicipais-ipea.csv", header=TRUE) #################################################### # A few scaterplotts #################################################### attach(data) # Notice the nonlinear relationship between "idh" and "Renda domiciliar per capita" par(mfrow=c(2,2)) plot(rendapc2,idh,xlab="Renda domiciliar per capita \n media do 2o quintil (R$ por mes)") plot(rendapc3,idh,xlab="Renda domiciliar per capita \n media do 3o quintil (R$ por mes)") plot(rendapc4,idh,xlab="Renda domiciliar per capita \n media do 4o quintil (R$ por mes)") plot(rendapc5,idh,xlab="Renda domiciliar per capita \n media do 5o quintil (R$ por mes)") # A quasi-linear relationship arises when we use the LOG10 transformation par(mfrow=c(1,1)) plot(log10(rendapc1),idh,xlim=c(0,3.5),xlab="Log10 da renda domiciliar per capita \n media dos quintis (R$ por mes)") points(log10(rendapc2),idh,col=2) points(log10(rendapc3),idh,col=3) points(log10(rendapc4),idh,col=4) points(log10(rendapc5),idh,col=5) #################################################### # Principal component and factor analyses #################################################### X = log10(data[,5:8]) pairs(X) cor(X) # Principal component analysis # ---------------------------- pca = princomp(X) pca summary(pca) # Correlation between variables and principal componentes cor(X,pca$scores) # Variance decomposition par(mfrow=c(1,2)) plot(pca$sdev^2/sum(pca$sdev^2),xlab="Principal component",ylab="% of explained variance",type="b") plot(cumsum(pca$sdev^2/sum(pca$sdev^2)),xlab="Principal component",ylab="% of explained variance (cumulative)",type="b") abline(h=0.95,lty=2) abline(h=0.75,lty=2) # Comparing the original variables with the principal component variables par(mfrow=c(1,2)) plot(log10(rendapc2),idh,xlim=c(0,3.5),xlab="Log10 da renda domiciliar per capita \n media dos quintis (R$ por mes)") points(log10(rendapc3),idh,col=2) points(log10(rendapc4),idh,col=3) points(log10(rendapc5),idh,col=4) legend("topleft",legend=c("Q2","Q3","Q4","Q5"),col=1:4,lty=1,lwd=2) plot(2+pca$scores[,1],idh,col=5,xlab="Principal components") points(2+pca$scores[,2],idh,col=6) points(2+pca$scores[,3],idh,col=7) points(2+pca$scores[,4],idh,col=8) legend("topleft",legend=c("PC1","PC2","PC3","PC4"),col=5:8,lty=1,lwd=2) # Factor analysis # --------------- fac = factanal(X,factors=1,scores="regression") fac par(mfrow=c(1,3)) plot(pca$scores[,1],idh,col=5,xlab="1st principal component") title("Principal component analysis (PCA)") plot(fac$scores[,1],idh,col=5,xlab="1st common factor") title("Factor analysis (FA)") plot(pca$scores[,1],fac$scores[,1],xlab="PCA",ylab="FA") fac$loadings fac$uniquenesses 1 - fac$uniquenesses # ------------------------------------------------------------------------------------------ # PCA and FA with 17 variables/attributes/characteristics # ------------------------------------------------------------------------------------------ # 6. Renda domiciliar per capita - media do 3o quintil (R$ por mes) # 9. Razao entre a renda dos 10% mais ricos e 40% mais pobres # 10. Mulheres chefes de familia sem conjuge e com filhos menores de 15 anos (%) # 11. Medicos residentes (por mil habitantes) # 12. Enfermeiros residentes com curso superior (%) # 13. Alfabetizados - pessoas 15 anos e mais (%) # 18. Mortalidade ate cinco anos de idade (por mil nascidos vivos) # 19. Probabilidade de sobrevivencia ate 40 anos (%) # 20. Probabilidade de sobrevivencia ate 60 anos (%) # 21. Taxa de fecundidade (%) # 22. Pessoas 65 anos ou mais - morando sozinhas (%) # 23. Pessoas 10 e 14 anos - mulheres com filhos (%) # 24. Professores do fundamental residentes com curso superior (%) # 25. Esperanca de vida ao nascer # 26. Mortalidade infantil (por mil nascidos vivos) # 27. Domicilios - com agua encanada - pessoas (%) # 28. Domicilios - com servico de coleta de lixo - pessoas (%) X = as.matrix(cbind(log10(data[,6]),data[,c(9:13,18:28)])) dim(X) pca = princomp(X) par(mfrow=c(1,2)) plot(pca$sdev^2/sum(pca$sdev^2),xlab="Principal component",ylab="% of explained variance",type="b") plot(cumsum(pca$sdev^2/sum(pca$sdev^2)),xlab="Principal component",ylab="% of explained variance (cumulative)",type="b") abline(h=0.95,lty=2) abline(h=0.75,lty=2) fac = factanal(X,factors=3,scores="regression") fac round(cbind(fac$load^2,fac$uniq),3) round(cbind(apply(fac$load^2,1,sum),fac$uniq),3) round(cor(X,fac$scores),3) par(mfrow=c(1,1)) ts.plot(t(cor(X,fac$scores)),type="b",xlab="Common factors",ylab="Correlations",col=1:17) par(mfrow=c(1,3)) plot(fac$scores[,1],idh,col=5,xlab="1st common factor") title("1st common factor") plot(fac$scores[,1],idh,col=5,xlab="1st common factor") title("2nd common factor") plot(fac$scores[,1],idh,col=5,xlab="1st common factor") title("3rd common factor") # ------------------------------------------------------------------------------------------ # Linear regression: variables vs principal components # ------------------------------------------------------------------------------------------ reg.ols = lm(idh~X) summary(reg.ols) r2adj.pca = rep(0,17) for (i in 1:17) r2adj.pca[i] = summary(lm(idh~pca$scores[,1:i]))$adj.r par(mfrow=c(1,2)) plot(cumsum(pca$sdev^2/sum(pca$sdev^2)),xlab="Principal components", ylab="% of explained variance (cumulative)",type="b") title("Explained variability of X\n 17 variables") abline(h=0.95,lty=2) plot(r2adj.pca,ylab="Adjusted R2",xlab="Principal components") abline(h=summary(reg1)$adj.r,lty=2,col=2) title("Explained variability of y\n idh") abline(h=0.95,lty=2) X1 = pca$scores[,1:7] summary(lm(idh~X1))