#######################################################################
#
# TOYOTA COROLLA
#
#######################################################################
#
# Source: Ledolter (2013) Data Mining and Business Analytics with R
#
# Description: The data set includes sale prices and vehicle characteristics
# of 1436 used Toyota Corollas. The objective here is to predict the sale
# price of a used automobile.
#
# Price:     Offer price in EUROs
# Age:       Age in months as in August 2004
# KM:        Accumulated kilometers on odometer
# FuelType:  Fuel type (petrol, diesel, CNG)
# HP:        Horsepower
# MetColor:  Metallic color (Yes=1, No=0)
# Automatic: Automatic (Yes=1, No=0)
# CC:        Cylinder volume in cubic centimeters
# Doors:     Number of doors
# Weight:    Weight in kilograms
#
# Additional references:
#
# Shmueli, Patel and Bruce (2010) Data Mining for Business Intelligence.
# Second edition. Hoboken, NJ: John Wiley & Sons, Inc.
#
#######################################################################

# Read and validate the data before opening the graphics device, so that a
# missing file or column does not leave a half-written PDF device open.
data <- read.csv("toyotacorolla.csv", header = TRUE)
n <- nrow(data)

required_cols <- c("Price", "Age", "KM", "FuelType",
                   "Automatic", "Doors", "Weight")
missing_cols <- setdiff(required_cols, names(data))
if (length(missing_cols) > 0) {
  stop("toyotacorolla.csv is missing column(s): ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Copy the columns used by the analysis into plain vectors. This replaces
# attach(data): the same bare names (Age, Doors, ...) are available to the
# rest of the script, but they can no longer be masked by stale workspace
# objects, so the workspace does not have to be wiped with rm(list = ls()).
Price <- data[["Price"]]
Age <- data[["Age"]]
KM <- data[["KM"]]
FuelType <- data[["FuelType"]]
Automatic <- data[["Automatic"]]
Doors <- data[["Doors"]]
Weight <- data[["Weight"]]

# Every figure drawn from here on goes to this PDF.
pdf(file = "toyotacorolla.pdf", width = 15, height = 10)

# Data transformation
y <- log(Price)          # response: log offer price
km <- KM / 1000          # odometer reading in thousands of kilometers
weight <- Weight / 1000  # weight in thousands of kilograms

# Exploratory data analysis
par(mfrow = c(1, 2))
plot(Age, y,
     xlab = "Age in months as in August 2004", ylab = "Log price ")
plot(Age, km,
     xlab = "Age in months as in August 2004",
     ylab = "Kilometers (in thousands)")

# Boxplots by category; the red horizontal line marks the overall median.
par(mfrow = c(1, 4))

# Automatic is coded Yes=1, No=0, so the first box (Automatic == 0) is the
# non-automatic group.
boxplot(y[Automatic == 0], y[Automatic == 1],
        names = c("Not automatic", "Automatic"), ylab = "Log Price")
abline(h = median(y), col = 2)

boxplot(y[Doors == 3], y[Doors == 4], y[Doors == 5],
        names = c("3 doors", "4 doors", "5 doors"), ylab = "Log Price")
abline(h = median(y), col = 2)

boxplot(y[FuelType == "CNG"], y[FuelType == "Diesel"],
        y[FuelType == "Petrol"],
        names = c("CNG", "Diesel", "Petrol"), ylab = "Log Price")
abline(h = median(y), col = 2)

boxplot(km[FuelType == "CNG"], km[FuelType == "Diesel"],
        km[FuelType == "Petrol"],
        names = c("CNG", "Diesel", "Petrol"),
        ylab = "Kilometers (in thousands)")
abline(h = median(km), col = 2)
#################################################################################
# HELPERS SHARED BY THE REGRESSION SECTIONS
#################################################################################

age_lab <- "Age in months as in August 2004"
price_lab <- "Log offer price in EUROs"

# R-squared of `fit` in percent (one decimal), measured against the total sum
# of squares of `response`.
r2_percent <- function(fit, response) {
  sst <- sum((response - mean(response))^2)
  round(100 - 100 * sum(residuals(fit)^2) / sst, 1)
}

# Write "R2=<value>%" on the current plot at position (at_x, at_y).
add_r2_label <- function(fit, response, at_x, at_y) {
  text(at_x, at_y, paste0("R2=", r2_percent(fit, response), "%"))
}

# Residuals-versus-fitted panel; all panels share the same vertical range so
# that models can be compared side by side.
plot_residuals <- function(fit, main) {
  plot(fitted(fit), residuals(fit),
       xlab = "Fitted", ylab = "Residuals", ylim = c(-1.2, 1.2))
  abline(h = 0, col = 2, lwd = 2)
  title(main)
}

# Empty kilometers-versus-age frame with a fitted mean line and a band of
# +/- 2 standard deviations, where log(variance) = logvar_coef[1] +
# logvar_coef[2] * age.
plot_km_band <- function(age, km_obs, intercept, slope, logvar_coef, main) {
  plot(age, km_obs, type = "n", ylim = c(-10, 250),
       xlab = "Age in months as in August 2004",
       ylab = "Kilometers (in thousands)")
  title(main)
  grid <- seq(0, 80, length.out = 1000)
  mean_km <- intercept + slope * grid
  sd_km <- exp((logvar_coef[1] + logvar_coef[2] * grid) / 2)
  lines(grid, mean_km, lwd = 2)
  lines(grid, mean_km + 2 * sd_km, lty = 2, lwd = 2)
  lines(grid, mean_km - 2 * sd_km, lty = 2, lwd = 2)
}

#################################################################################
# REGRESSION OF LOG PRICE ON AGE AND DOORS
#################################################################################

par(mfrow = c(1, 4))

# Regression of log price on Age
fit_age <- lm(y ~ Age)
summary(fit_age)
plot(Age, y, xlab = age_lab, ylab = price_lab)
abline(fit_age, col = 2, lwd = 3)
title("Log price on age")
add_r2_label(fit_age, y, 70, 10.4)

# Regression of log price on Age and Doors
# Doors is considered a quantitative variable
fit_doors_num <- lm(y ~ Age + Doors)
summary(fit_doors_num)
b <- coef(fit_doors_num)
plot(Age, y, col = Doors - 1, xlab = age_lab, ylab = price_lab)
abline(b[1] + 3 * b[3], b[2], col = 2, lwd = 3)
abline(b[1] + 4 * b[3], b[2], col = 3, lwd = 3)
abline(b[1] + 5 * b[3], b[2], col = 4, lwd = 3)
legend(0, 8.5, legend = c("3 Doors", "4 Doors", "5 Doors"),
       col = 2:4, lwd = 2, bty = "n")
title("Log price on age+doors\nDoors = quantitative variable")
add_r2_label(fit_doors_num, y, 70, 10.4)

# Dummy variables for Doors as a qualitative variable.
# D3 = 1 for cars with 2 or 3 doors, D4 = 1 for cars with 4 doors; cars with
# both dummies equal to 0 (the 5-door cars of the legend) are the baseline.
D3 <- as.numeric(Doors %in% c(2, 3))
D4 <- as.numeric(Doors %in% 4)

# Regression of log price on Age and Doors
# Doors is considered a qualitative variable
fit_doors_cat <- lm(y ~ Age + D3 + D4)
summary(fit_doors_cat)
b <- coef(fit_doors_cat)
plot(Age, y, col = Doors - 1, xlab = age_lab, ylab = price_lab)
# Line colours follow the legend and the point colours (Doors - 1):
# the D3 line is "3 Doors" (col 2), the D4 line is "4 Doors" (col 3) and the
# baseline line is "5 Doors" (col 4).
abline(b[1] + b[3], b[2], col = 2, lwd = 3)
abline(b[1] + b[4], b[2], col = 3, lwd = 3)
abline(b[1], b[2], col = 4, lwd = 3)
legend(0, 8.5, legend = c("3 Doors", "4 Doors", "5 Doors"),
       col = 2:4, lwd = 2, bty = "n")
title("Log price on age+doors\nDoors = qualitative variable")
add_r2_label(fit_doors_cat, y, 70, 10.4)

# Regression of log price on Age, Doors and interactions
# Doors is considered a qualitative variable
# Coefficient order: (Intercept), Age, D3, D4, Age:D3, Age:D4
fit_doors_int <- lm(y ~ Age + D3 + D4 + Age:D3 + Age:D4)
summary(fit_doors_int)
b <- coef(fit_doors_int)
plot(Age, y, col = Doors - 1, xlab = age_lab, ylab = price_lab)
# Same colour convention as above: D3 -> col 2, D4 -> col 3, baseline -> col 4.
abline(b[1] + b[3], b[2] + b[5], col = 2, lwd = 3)
abline(b[1] + b[4], b[2] + b[6], col = 3, lwd = 3)
abline(b[1], b[2], col = 4, lwd = 3)
legend(0, 8.5, legend = c("3 Doors", "4 Doors", "5 Doors"),
       col = 2:4, lwd = 2, bty = "n")
title("Log price on age*doors\nDoors = qualitative variable")
add_r2_label(fit_doors_int, y, 70, 10.4)

par(mfrow = c(1, 4))
plot_residuals(fit_age, "Log price on age")
plot_residuals(fit_doors_num,
               "Log price on age+doors\nDoors = quantitative variable")
plot_residuals(fit_doors_cat,
               "Log price on age+doors\nDoors = qualitative variable")
plot_residuals(fit_doors_int,
               "Log price on age*doors\nDoors = qualitative variable")

#################################################################################
# REGRESSION OF LOG PRICE ON AGE AND AUTOMATIC
#################################################################################

par(mfrow = c(1, 3))

# Regression of log price on Age
summary(fit_age)
plot(Age, y, xlab = age_lab, ylab = price_lab)
abline(fit_age, lwd = 4)
title("Log price on age")
add_r2_label(fit_age, y, 70, 10.4)

# Regression of log price on Age+Automatic
fit_auto <- lm(y ~ Age + Automatic)
summary(fit_auto)
b <- coef(fit_auto)
plot(Age, y, col = Automatic + 2, xlab = age_lab, ylab = price_lab)
abline(b[1], b[2], col = 2, lwd = 3)
abline(b[1] + b[3], b[2], col = 3, lwd = 3)
legend(0, 8.5, legend = c("Automatic=0", "Automatic=1"),
       col = 2:3, lwd = 2, bty = "n")
title("Log price on age+automatic")
add_r2_label(fit_auto, y, 70, 10.4)

# Regression of log price on Age*Automatic
# Coefficient order: (Intercept), Age, Automatic, Age:Automatic
fit_auto_int <- lm(y ~ Age + Automatic + Age:Automatic)
summary(fit_auto_int)
b <- coef(fit_auto_int)
plot(Age, y, col = Automatic + 2, xlab = age_lab, ylab = price_lab)
abline(b[1], b[2], col = 2, lwd = 3)
abline(b[1] + b[3], b[2] + b[4], col = 3, lwd = 3)
legend(0, 8.5, legend = c("Automatic=0", "Automatic=1"),
       col = 2:3, lwd = 2, bty = "n")
title("Log price on age*automatic")
add_r2_label(fit_auto_int, y, 70, 10.4)

par(mfrow = c(1, 3))
plot_residuals(fit_age, "Log price on age")
plot_residuals(fit_auto, "Log price on age+automatic")
plot_residuals(fit_auto_int, "Log price on age*automatic")

#################################################################################
# REGRESSION OF LOG PRICE ON AGE AND FUEL TYPE
#################################################################################

par(mfrow = c(1, 3))

# Numeric fuel code, also used as plotting colour:
# CNG = 2, Diesel = 3, Petrol = 4, anything else = 0 (nomatch -1, plus 1).
fuel <- match(FuelType, c("CNG", "Diesel", "Petrol"), nomatch = -1L) + 1

# Regression of log price on Age
summary(fit_age)
plot(Age, y, xlab = age_lab, ylab = price_lab)
abline(fit_age, lwd = 4)
title("Log price on age")
add_r2_label(fit_age, y, 70, 10.4)

# Regression of log price on Age+FuelType
# Coefficient order: (Intercept), Age, FuelTypeDiesel, FuelTypePetrol
# (CNG, the first level alphabetically, is the baseline).
fit_fuel <- lm(y ~ Age + FuelType)
summary(fit_fuel)
b <- coef(fit_fuel)
plot(Age, y, col = fuel, xlab = age_lab, ylab = price_lab)
abline(b[1], b[2], col = 2, lwd = 3)
abline(b[1] + b[3], b[2], col = 3, lwd = 3)
abline(b[1] + b[4], b[2], col = 4, lwd = 3)
legend(0, 8.5, legend = c("CNG", "Diesel", "Petrol"),
       col = 2:4, lwd = 2, bty = "n")
title("Log price on age+fueltype")
add_r2_label(fit_fuel, y, 70, 10.4)

# Regression of log price on Age*FuelType
# Coefficient order: (Intercept), Age, FuelTypeDiesel, FuelTypePetrol,
# Age:FuelTypeDiesel, Age:FuelTypePetrol
fit_fuel_int <- lm(y ~ Age * FuelType)
summary(fit_fuel_int)
b <- coef(fit_fuel_int)
plot(Age, y, col = fuel, xlab = age_lab, ylab = price_lab)
abline(b[1], b[2], col = 2, lwd = 3)
abline(b[1] + b[3], b[2] + b[5], col = 3, lwd = 3)
abline(b[1] + b[4], b[2] + b[6], col = 4, lwd = 3)
legend(0, 8.5, legend = c("CNG", "Diesel", "Petrol"),
       col = 2:4, lwd = 2, bty = "n")
title("Log price on age*fueltype")
add_r2_label(fit_fuel_int, y, 70, 10.4)

par(mfrow = c(1, 3))
plot_residuals(fit_age, "Log price on age")
plot_residuals(fit_fuel, "Log price on age+fueltype")
plot_residuals(fit_fuel_int, "Log price on age*fueltype")

#################################################################################
# REGRESSION OF KM ON AGE AND FUEL TYPE
#################################################################################

# Keep only Diesel and Petrol cars (fuel codes 3 and 4).
diesel_or_petrol <- fuel > 2
petrol <- fuel[diesel_or_petrol] - 3   # 0 = Diesel, 1 = Petrol
km_sub <- km[diesel_or_petrol]
age_sub <- Age[diesel_or_petrol]

# Coefficient order: (Intercept), age_sub, petrol, age_sub:petrol
fit_km <- lm(km_sub ~ age_sub * petrol)
km_coef <- coef(fit_km)
km_coef_lab <- round(km_coef, 3)       # rounded copy, for titles only

par(mfrow = c(1, 1))
plot(age_sub, km_sub, col = 1 + petrol,
     xlab = "Age in months as in August 2004", ylab = "Thousands of Km")
abline(km_coef[1], km_coef[2], col = 1, lwd = 3)
abline(km_coef[1] + km_coef[3], km_coef[2] + km_coef[4], col = 2, lwd = 3)
legend(0, 250, legend = c("Diesel", "Petrol"), col = 1:2, lwd = 2, bty = "n")
title(paste0("Diesel: km=", km_coef_lab[1], "+", km_coef_lab[2],
             "age\n Petrol: km=", km_coef_lab[1] + km_coef_lab[3], "+",
             km_coef_lab[2] + km_coef_lab[4], "age"))
add_r2_label(fit_km, km_sub, 5, 200)

# Sample variance of km at each distinct age, using only ages observed more
# than 5 times.
ages <- sort(unique(age_sub))
n_at_age <- vapply(ages, function(a) sum(age_sub == a), numeric(1))
ages <- ages[n_at_age > 5]
sig2 <- vapply(ages, function(a) var(km_sub[age_sub == a]), numeric(1))

# Residuals against age; the sloped blue lines are fixed, hand-chosen
# reference lines (intercept +/- 3, slope +/- 1.2).
par(mfrow = c(1, 1))
plot(age_sub, residuals(fit_km), col = 1 + petrol, ylim = c(-160, 160),
     xlab = "Age in months as in August 2004", ylab = "Residuals")
abline(3, 1.2, lwd = 2, col = 4)
abline(-3, -1.2, lwd = 2, col = 4)
abline(h = 0, lty = 2, lwd = 2, col = 4)

# Linear model for the log of the per-age variance.
par(mfrow = c(1, 1))
log_sig2 <- log(sig2)
plot(ages, log_sig2,
     xlab = "Age in months as in August 2004", ylab = "Log variance")
fit_logvar <- lm(log_sig2 ~ ages)
logvar_coef <- coef(fit_logvar)
logvar_coef_lab <- round(logvar_coef, 3)   # rounded copy, for titles only
abline(fit_logvar, col = 2, lwd = 2)
title(paste0("log(sig2)=", logvar_coef_lab[1], "+", logvar_coef_lab[2], "age"))
add_r2_label(fit_logvar, log_sig2, 10, 7)

# Fitted mean of km with +/- 2 standard deviation bands, per fuel type. The
# curves use the unrounded coefficients; only the titles show rounded values.
par(mfrow = c(1, 2))
plot_km_band(
  age_sub, km_sub,
  intercept = km_coef[1],
  slope = km_coef[2],
  logvar_coef = logvar_coef,
  main = paste0("Diesel: km=", km_coef_lab[1], "+", km_coef_lab[2],
                "age\n log(sig2)=", logvar_coef_lab[1], "+",
                logvar_coef_lab[2], "age")
)
plot_km_band(
  age_sub, km_sub,
  intercept = km_coef[1] + km_coef[3],
  slope = km_coef[2] + km_coef[4],
  logvar_coef = logvar_coef,
  main = paste0("Petrol: km=", km_coef_lab[1] + km_coef_lab[3], "+",
                km_coef_lab[2] + km_coef_lab[4],
                "age\n log(sig2)=", logvar_coef_lab[1], "+",
                logvar_coef_lab[2], "age")
)

dev.off()