##########################################################################################
#
# ANOTHER HOUSE PRICE DATASET
#
##########################################################################################
#
# The data set has 546 observations and were gathered by the authors from the 
# Multiple Listing Service for houses sold in Windsor, Ontario, Canada.  The 
# variables are defined as follows:
#
# sell  = sale price of a house
# lot   = the lot size of a property in square feet
# bdms  = the number of bedrooms
# fb    = the number of full bathrooms
# sty   = the number of stories excluding basement
# drv   = 1 if the house has a driveway
# rec   = 1 if the house has a recreational room
# ffin  = 1 if the house has a full finished basement
# ghw   = 1 if the house uses gas for hot water heating
# ca    = 1 if there is central air conditioning
# gar   = the number of garage places
# reg   = 1 if the house is located in the preferred neighbourhood of the city
#
# Data: http://qed.econ.queensu.ca/jae/1996-v11.6/anglin-gencay/readme.ag.txt
# 
# Source: Paul Anglin and Ramazan Gencay, "Semiparametric Estimation of a Hedonic
#         Price Function", Journal of Applied Econometrics, Vol. 11, No. 6, 1996, 
#         pp. 633-648.
#
##########################################################################################
#
# Breusch-Pagan test = 112.93 and White test = 44.97.  chi-square(9) has a 5% critical 
# value is 16.92.  Since both test statistics are greater than critical value, both of 
# these tests indicate that heteroskedasticity is present.  Take logs of all variables: 
# BP=12.96 and W=11.45 which are both less than the 5% critical value.  
#
##########################################################################################
rm(list=ls())

names = c("Sale price (in 1000 reals)","Lot size (in square meters)","Number of bedrooms",
"Number of full bathrooms","Number of stories excluding basement","1=house has a driveway",
"1=house has a recreational room","1=house has a full finished basement",
"1=house uses gas for hot water heating","1=central air conditioning",
"the number of garage places","1=located in the preferred neighbourhood of the city")

lnames = c("Log sale price (in 1000 reals)","Log lot size (in square meters)",
"Log number of bedrooms","Log number of full bathrooms",
"Log number of stories excluding basement","1=house has a driveway",
"1=house has a recreational room","1=house has a full finished basement",
"1=house uses gas for hot water heating","1=central air conditioning",
"the number of garage places","1=located in the preferred neighbourhood of the city")

names1 = c("sell","lot","bdms","fb","sty","drv","rec","ffin","ghw","ca","gar","reg")

lnames1 = c("logsell","loglot","logbdms","logfb","logsty","drv","rec","ffin","ghw","ca","gar","reg")

data = read.table("houseprice-canada.txt",header=TRUE)

n = nrow(data)

# Transformations: from sq-feet to sq-meters and dollars to reals
data[,1] = data[,1]*2.21500132/1000
data[,2] = data[,2]*0.09290304 

attach(data)

pdf(file="houseprice-canada.pdf",width=10,height=8)

par(mfrow=c(1,1))
for (i in 2:12){
  reg1  = lm(data[,1]~data[,i])
  X     = cbind(1,data[,i])
  y     = data[,1]
  coef  = round(reg1$coef,3)
  se    = round(sqrt(diag(sum(reg1$res^2)/(n-2)*solve(t(X)%*%X))),2)
  plot(data[,i],data[,1],xlab=names[i],ylab=names[1])
  abline(coef,col=2,lwd=3)
  title(paste("sell = ",coef[1],"+",coef[2],"*",names1[i],"\n (",se[1],")    (",se[2],")",sep=""))
}

# Full model
reg2   = lm(sell ~ lot+bdms+fb+sty+drv+rec+ffin+ghw+ca+gar+reg)
yhat   = reg2$fit
yhat2  = yhat^2
ehat   = reg2$res
ehat2  = ehat^2

par(mfrow=c(1,2))
plot(yhat,ehat,xlab="Fitted",ylab="Residuals",main="Full model")
abline(h=0,lty=2)
plot(yhat,ehat2,xlab="Fitted",ylab="Square residuals")
abline(lm(ehat2~yhat),col=2,lwd=2)

# Pagan homocedasticity test
summary(lm(ehat2 ~ lot+bdms+fb+sty+drv+rec+ffin+ghw+ca+gar+reg))

# White (simplified) homocedasticity test
summary(lm(ehat2 ~ yhat+yhat2))


################################
# Log model
################################
for (i in 1:5)
  data[,i] = log(data[,i])
lsell = log(sell)
llot  = log(lot)
lbdms = log(bdms)
lfb   = log(fb)
lsty  = log(sty)

par(mfrow=c(1,1))
for (i in 2:12){
  reg1  = lm(data[,1]~data[,i])
  X     = cbind(1,data[,i])
  y     = data[,1]
  coef  = round(reg1$coef,3)
  se    = round(sqrt(diag(sum(reg1$res^2)/(n-2)*solve(t(X)%*%X))),2)
  plot(data[,i],data[,1],xlab=names[i],ylab=lnames[1])
  abline(coef,col=2,lwd=3)
  title(paste("logsell = ",coef[1],"+",coef[2],"*",lnames1[i],"\n (",se[1],")    (",se[2],")",sep=""))
}

reg3   = lm(lsell ~ llot+lbdms+lfb+lsty+drv+rec+ffin+ghw+ca+gar+reg)
yhat   = reg3$fit
yhat2  = yhat^2
ehat   = reg3$res
ehat2  = ehat^2

par(mfrow=c(1,2))
plot(yhat,ehat,xlab="Fitted",ylab="Residuals",main="Full model (log)")
abline(h=0,lty=2)
plot(yhat,ehat2,xlab="Fitted",ylab="Square residuals")
abline(lm(ehat2~yhat),col=2,lwd=2)

# Pagan homocedasticity test
summary(lm(ehat2 ~ llot+lbdms+lfb+lsty+drv+rec+ffin+ghw+ca+gar+reg))

# White (simplified) homocedasticity test
summary(lm(ehat2 ~ yhat+yhat2))

dev.off()