##########################################################################################
#
# Dataset: More on wages
#
##########################################################################################
#
# Summary: Data on monthly earnings, education, several demographic 
# variables, and IQ scores for 935 men in 1980.
#
#  1. wage                   monthly earnings
#  2. hours                  average weekly hours
#  3. iq                     IQ score
#  4. kww                    knowledge of world work score
#  5. educ                   years of education
#  6. exper                  years of work experience
#  7. tenure                 years with current employer
#  8. age                    age in years
#  9. married                =1 if married
# 10. black                  =1 if black
# 11. south                  =1 if live in south
# 12. urban                  =1 if live in SMSA
# 13. sibs                   number of siblings
# 14. brthord                birth order
# 15. meduc                  mother's education
# 16. feduc                  father's education
# 17. lwage                  natural log of wage
#
##########################################################################################
#
# Source: Wooldridge (2012) 
# Introductory Econometrics: A Modern Approach (5th edition)
# South-Western, Cengage Learning
#
# Basic reference: Blackburn and Neumark (1992) "Unobserved ability, 
# efficiency wages, and interindustry wage differentials", Quarterly 
# Journal of Economics, 107, 1421-36.
#
##########################################################################################
#
# Copyright of R code by:
# Hedibert Freitas Lopes 
# Professor of Statistics and Econometrics
# Insper - Institute for Education and Research
#
##########################################################################################
#
# datafile = "https://hedibert.org/wp-content/uploads/2014/02/wage2-wooldridge.txt"
# data     = read.table(datafile,header=TRUE)
# n        = nrow(data)
#
#########################################################################################

# Clear the global workspace, then read the wage data from the course site.
rm(list = ls())

# NOTE(review): the header comment of this file reads the same URL with
# header=TRUE, whereas this call leaves `header` to read.table()'s
# auto-detection -- confirm which one matches the file layout.
datafile <- "https://hedibert.org/wp-content/uploads/2014/02/wage2-wooldridge.txt"
data <- read.table(datafile)

# Number of rows read; reused below for plot titles and for subsampling.
n <- nrow(data)

# Preview of the first rows.
head(data)

# Variables used in the exploratory plots, taken by column position:
# column 3 (iq), column 5 (educ) and the log of column 1 (wage), following
# the variable list in the header of this file.
iq <- data[[3]]
educ <- data[[5]]
salary <- log(data[[1]])

# Three scatter plots side by side; each legend shows the sample correlation
# of the plotted pair, rounded to three decimals.
par(mfrow = c(1, 3))

plot(educ, iq, xlab = "Years of education", ylab = "IQ score")
title(paste0(n, " observations"))
legend("topleft", legend = paste0("cor=", round(cor(educ, iq), 3)))

plot(educ, salary, xlab = "Years of education", ylab = "Monthly earnings (log)")
legend("topleft", legend = paste0("cor=", round(cor(educ, salary), 3)))

plot(iq, salary, xlab = "IQ score", ylab = "Monthly earnings (log)")
legend("topleft", legend = paste0("cor=", round(cor(iq, salary), 3)))

# Let us start with a random subsample of n1 = 20 observations
# ------------------------------------------------------------
# Fix the RNG seed so that the same subsample is drawn on every run.
set.seed(2718282)
n1 <- 20

# n1 distinct row indices drawn from 1..n, stored in increasing order.
ind <- sort(sample.int(n, size = n1, replace = FALSE))

# Same three variables as for the full sample, restricted to the drawn rows.
iq1 <- data[[3]][ind]
educ1 <- data[[5]][ind]
salary1 <- log(data[[1]][ind])

# Candidate split points: midpoints between consecutive distinct values
# observed in the subsample, along education (cutoff.x) and IQ (cutoff.y).
# Each cutoff vector has one element fewer than the vector of distinct
# values; negative indexing keeps that true even when there is only one
# distinct value (the result is then empty).
x <- sort(unique(educ1))
nx <- length(x)
cutoff.x <- x[-nx] + diff(x) / 2
y <- sort(unique(iq1))
ny <- length(y)
cutoff.y <- y[-ny] + diff(y) / 2

# Scatter plot of the subsample with the grid of candidate splits on top.
par(mfrow = c(1, 1))
plot(educ1, iq1, ylab = "IQ score", xlab = "Years of education",
     cex = 0.75, pch = 16, col = 2)
title(paste0(n1, " observations"))

# abline() accepts whole vectors for v and h, so each set of cutoffs is drawn
# in a single call. Looping over 1:nx / 1:ny would index one position past
# the end of the cutoff vectors (length nx - 1 and ny - 1).
if (length(cutoff.x) > 0) {
  abline(v = cutoff.x, lty = 2)
}
if (length(cutoff.y) > 0) {
  abline(h = cutoff.y, lty = 2)
}

# Hard-coded overlay: a vertical segment at education = 15.5, a horizontal
# segment at IQ = 102 running from education 10 to 15.5, and the labels
# A, B and C placed in the three resulting regions.
# NOTE(review): these coordinates are fixed, presumably chosen for the
# subsample produced by the current seed and n1 -- revisit if either changes.
segments(15.5, 70, 15.5, 130, col = 6, lwd = 3)
segments(10, 102, 15.5, 102, col = 6, lwd = 3)
text(13, 85, "A", col = 6, cex = 2)
text(13, 108, "B", col = 6, cex = 2)
text(17, 95, "C", col = 6, cex = 2)


# Regression tree for log earnings on education and IQ, fitted to the
# subsample only.
# NOTE(review): the object name `fitted` masks stats::fitted() in the global
# environment; it is kept because the rest of the script uses that name.
library(tree)
fitted <- tree(salary1 ~ educ1 + iq1)

# Fit summary, followed by the node-by-node listing of the tree.
summary(fitted)
fitted

# Tree diagram with branches of uniform length; all = TRUE labels interior
# nodes as well as leaves, pretty = 0 leaves split labels unabbreviated.
par(mfrow = c(1, 1))
plot(fitted, type = "uniform")
text(fitted, pretty = 0, all = TRUE, cex = 0.75)


# Same regression tree specification, now fitted to the full sample.
# This overwrites the `fitted` object from the subsample fit.
library(tree)
fitted <- tree(salary ~ educ + iq)

# Fit summary, followed by the node-by-node listing of the tree.
summary(fitted)
fitted

# Tree diagram with branches of uniform length; all = TRUE labels interior
# nodes as well as leaves, pretty = 0 leaves split labels unabbreviated.
par(mfrow = c(1, 1))
plot(fitted, type = "uniform")
text(fitted, pretty = 0, all = TRUE, cex = 0.75)


# Response and predictors for the larger models, taken by column position
# (see the variable list in the header of this file).
# Note that `wage` holds the LOG of column 1, not the raw monthly earnings.
wage <- log(data[[1]])
iq <- data[[3]]
educ <- data[[5]]
exper <- data[[6]]
tenure <- data[[7]]
age <- data[[8]]
married <- data[[9]]
black <- data[[10]]
meduc <- data[[15]]

# Linear regression and regression tree, both using the same formula.
lm.fit <- lm(wage ~ iq + educ + exper + tenure + age + married + black + meduc)
cart.fit <- tree(wage ~ iq + educ + exper + tenure + age + married + black + meduc)

# Summaries of both fits, then the node-by-node listing of the tree.
summary(lm.fit)
summary(cart.fit)
cart.fit

# Tree diagram with branches of uniform length; all = TRUE labels interior
# nodes as well as leaves, pretty = 0 leaves split labels unabbreviated.
par(mfrow = c(1, 1))
plot(cart.fit, type = "uniform")
text(cart.fit, pretty = 0, all = TRUE, cex = 0.75)