# The data is related to direct marketing campaigns (phone calls) of a
# Portuguese banking institution.
# The classification goal is to predict if the client will subscribe to a
# term deposit (variable y).
#
# https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
#
# Input variables:
#    1 - age (numeric)
#    2 - job : type of job (categorical: "admin.","unknown","unemployed",
#        "management","housemaid","entrepreneur","student","blue-collar",
#        "self-employed","retired","technician","services")
#    3 - marital : marital status (categorical: "married","divorced","single";
#        note: "divorced" means divorced or widowed)
#    4 - education (categorical: "unknown","secondary","primary","tertiary")
#    5 - default: has credit in default? (binary: "yes","no")
#    6 - balance: average yearly balance, in euros (numeric)
#    7 - housing: has housing loan? (binary: "yes","no")
#    8 - loan: has personal loan? (binary: "yes","no")
#
#   related to the last contact of the current campaign:
#    9 - contact: contact communication type
#        (categorical: "unknown","telephone","cellular")
#   10 - day: last contact day of the month (numeric)
#   11 - month: last contact month of year
#        (categorical: "jan", "feb", "mar", ..., "nov", "dec")
#   12 - duration: last contact duration, in seconds (numeric)
#
#   other attributes:
#   13 - campaign: number of contacts performed during this campaign and for
#        this client (numeric, includes last contact)
#   14 - pdays: number of days that passed by after the client was last
#        contacted from a previous campaign
#        (numeric, -1 means client was not previously contacted)
#   15 - previous: number of contacts performed before this campaign and for
#        this client (numeric)
#   16 - poutcome: outcome of the previous marketing campaign
#        (categorical: "unknown","other","failure","success")
#
# Output variable (desired target):
#   17 - y - has the client subscribed a term deposit?
#        (binary: "yes","no")

# Exploratory summaries of the bank marketing data followed by a logistic
# regression for the outcome `y`, with and without stepwise AIC selection.

library("MASS") # provides stepAIC()

# Helpers ----

# Share of negative and of exactly-zero values in a vector of balances.
balance_shares <- function(balance) {
  c(negative = mean(balance < 0), zero = mean(balance == 0))
}

# Cross-tabulate a predictor `x` against `outcome` and return both
# normalisations of the counts, rounded to 3 decimals:
#   by_level   - within each level of `x`, the share of each outcome
#                (rows sum to 1)
#   by_outcome - within each outcome, the share of each level of `x`
#                (columns sum to 1)
outcome_rates <- function(x, outcome) {
  counts <- table(x, outcome, dnn = c("level", "y"))
  list(
    by_level = round(prop.table(counts, margin = 1), 3),
    by_outcome = round(prop.table(counts, margin = 2), 3)
  )
}

# Data ----

# stringsAsFactors = TRUE is required: since R 4.0.0 read.csv() keeps text
# columns as character, and glm(family = binomial) rejects a character
# response ("y values must be 0 <= y <= 1"). As factors, summary() also
# reports level counts for the categorical columns.
bank <- read.csv("bank.csv", sep = ";", stringsAsFactors = TRUE)
summary(bank)

# Balance ----

# Share of clients with a negative / zero balance: overall and by outcome.
# (Named `balance_table` so that base::table() is not shadowed.)
balance_table <- rbind(
  all = balance_shares(bank$balance),
  yes = balance_shares(bank$balance[bank$y == "yes"]),
  no = balance_shares(bank$balance[bank$y == "no"])
)
round(balance_table, 3)

# Balance by outcome, with and without outliers, side by side. The previous
# graphics layout is restored afterwards so later plots use the full device.
old_par <- par(mfrow = c(1, 2))
boxplot(balance ~ y, data = bank)
boxplot(balance ~ y, data = bank, outline = FALSE)
par(old_par)

# Categorical predictors vs. outcome ----

categorical_vars <- c(
  "job", "marital", "education", "housing", "loan", "contact", "month"
)
# Auto-prints a named list: one pair of rate tables per predictor.
lapply(bank[categorical_vars], outcome_rates, outcome = bank$y)

# Number of contacts in this campaign: plot the per-level outcome shares
# (plotting a table draws a mosaic plot), then print the per-outcome shares.
campaign_rates <- outcome_rates(bank$campaign, bank$y)
plot(campaign_rates$by_level)
campaign_rates$by_outcome

# Logistic regression ----

# Full model on all predictors. With `y` a factor with levels "no" < "yes",
# the model describes the probability of "yes".
glout <- glm(y ~ ., data = bank, family = binomial)
summary(glout)

# Stepwise selection by AIC, starting from the full model.
stepout <- stepAIC(glout)
stepout
length(coef(stepout))

# In-sample accuracy ----

# Rounding the fitted probabilities classifies at a 0.5 threshold.
yhat1 <- round(fitted(glout))
yhat2 <- round(fitted(stepout))
n <- nrow(bank)
y <- as.numeric(bank$y == "yes")
mean(yhat1 == y)
mean(yhat2 == y)