# Bank Marketing -- Prospect Profiling (response prediction & cost analysis)
#-------------------------------------------------------
## Business Understanding:- Prospect Profiling
#The business objective is to achieve 80% of total responders at the minimum
#possible cost.
#The total number of responders is the total number of prospects who responded,
#from the available data of about 45,000 data points.
#-------------------------------------------------------
#----------------------------------------------------------
# The process followed in this assignment is:
# 1. Data Understanding, Preparation and EDA
# 2. Model building, evaluation based on test data and
# finding probabilities of response for entire original
# dataset based on the same.
# 3. Creating a new dataframe with relevant variables and
# finding the average cost of call for each prospect
# 4. Find the number of top X% prospects you should target to meet the
#    business objective
# Report the average call duration for targeting the top X% prospects to the CMO
# 5. Creating a lift chart (no of prospects contacted vs response rate)
#----------------------------------------------------------
# 1. Data Understanding and Preparation ----
# Load the bank marketing dataset (~45k prospects) from a local path.
bank_data<- read.csv("D:\\PG_Diploma\\Elective\\bank_marketing.csv")
str(bank_data)
# Summary of dataset
summary(bank_data)
#-------------------------------------------------------
# Count of missing values across the whole data frame.
sum(is.na(bank_data))
#-------------------------------------------------------
# Box plot of age to inspect outliers.
boxplot(bank_data$age)
# Cap age outliers: any age above 71 is set to 71.
bank_data[(which(bank_data$age>71)),]$age <- 71
# Adding No.of_prospect
# NOTE(review): assumes a "binning.age" column and an "agg_age" data frame
# were created earlier (not visible in this chunk) -- confirm upstream code.
count <- data.frame(table(bank_data$binning.age))
count <- count[,-1]  # keep only the frequency column from table()
agg_age <- cbind(agg_age,count)  # append prospect counts per age bucket
agg_age
#-------------------------------------------------------
# Let's see the response rate of each age bucket in the plot
# Response rate of each age bucket.
# NOTE(review): Bank_data_age20 and plot_response() are defined in code not
# visible in this chunk -- confirm they exist upstream.
View(Bank_data_age20)
summary(Bank_data_age20)
##--------------------------------------------------------
# Job categories and their response rates.
levels(bank_data$job)
plot_response(bank_data$job, "job")
##--------------------------------------------------------
str(bank_data)
# Marital status and education vs response rate.
summary(bank_data$marital)
plot_response(bank_data$marital,"marital")
plot_response(bank_data$education,"Education")
plot_response(bank_data$education,"Education_levels")
#-------------------------------------------------------
# Let's see the default variable
table(bank_data$default)
plot_response(bank_data$default, "Default")
# Drop the 5th column -- presumably "default"; verify the column index.
bank_data <- bank_data[,-5]
#-------------------------------------------------------
summary(bank_data$housing)
plot_response(bank_data$housing, "Housing")
#-------------------------------------------------------
summary(bank_data$loan)
# Next variable is Contact, Let's see the response rate of each mode
summary(bank_data$contact)
plot_response(bank_data$contact,"Contact_mode")
#-------------------------------------------------------
plot_response(bank_data$month,"Contact_month")
#-------------------------------------------------------
plot_response(bank_data$day_of_week,"day_of_week")
#-------------------------------------------------------
# Distribution of call duration.
# NOTE(review): assumes ggplot2 was loaded earlier -- confirm.
ggplot(bank_data,aes(duration))+geom_histogram()
# Average duration
# Factor copy of response so aggregate() groups by it.
bank_data$response_1 <- as.factor(bank_data$response)
Avg_duration <- aggregate(duration~response_1,bank_data,mean)
quantile(bank_data$duration,seq(0,1,0.01))
ggplot(bank_data,aes(duration))+geom_histogram()
#-------------------------------------------------------
summary(bank_data$campaign)
boxplot(bank_data$campaign)
quantile(bank_data$campaign,seq(0,1,0.01))
# Cap campaign outliers at 14.
bank_data[which(bank_data$campaign>14),]$campaign <- 14
ggplot(bank_data,aes(campaign))+geom_histogram()
#-------------------------------------------------------
#-- Next variable is "pdays"
# Let's first convert this variable to factor type
bank_data$pdays<- as.factor(bank_data$pdays)
# Checking summary
summary(bank_data$pdays)
levels(bank_data$pdays)
# Reducing the levels of this variable to 3.
# NOTE(review): the actual level-collapsing code is not visible in this chunk.
# Fix: the column is "pdays" -- the original "$pday" returned NULL.
plot_response(bank_data$pdays,"Pday")
table(bank_data$pdays)
#-------------------------------------------------------
summary(bank_data$previous)
# Max=7, best is to convert this variable to factor
# Fix: convert to factor first -- levels() is NULL on a numeric vector, so the
# level-relabelling assignments below would not work as intended without this.
bank_data$previous <- as.factor(bank_data$previous)
levels(bank_data$previous)[1]<-"Never_contacted"
levels(bank_data$previous)[2:4] <- "Less_than_3_times"
levels(bank_data$previous)[3:6] <- "More_than_3_times"
summary(bank_data$previous)
plot_response(bank_data$previous,"Previous_contacts")
# Now, the next variable is "Poutcome" i.e outcome of the previous marketing campaign
# (categorical: 'failure','nonexistent','success')
summary(bank_data$poutcome)
plot_response(bank_data$poutcome,"Outcome_of_Previous_contacts")
#-------------------------------------------------------
# Required Packages
library(caret)
library(caTools)
library(dummies)  # NOTE(review): archived on CRAN; caret::dummyVars is the modern alternative
#---------------------------------------------------------
#creating dummy variables by converting all categorical variables into numeric ones
# NOTE(review): the dummy-variable creation and the sample.split() call that
# produce 'train' and 'test' are not visible in this chunk -- confirm upstream.
bank_data$response <- as.integer(bank_data$response)
k1 <- bank_data  # backup copy of the prepared data
set.seed(100)  # reproducible train/test split
nrow(train)/nrow(bank_data)  # check train proportion
nrow(test)/nrow(bank_data)   # check test proportion
#removing 'duration' column from the training and testing data sets as it is
#not required for model building
# Drop call duration (column 45): it is unknown before a call is placed, so it
# cannot be used as a predictor.
train_bank <- train[,-45]
test_bank <- test[,-45]
#duration is the 45th column of the training and testing data sets (in that order)
library(MASS)
library(car)
#model building
# NOTE(review): logistic_1 (full glm) and logistic_2 (after stepAIC) are
# fitted in code not visible in this chunk -- confirm upstream.
summary(logistic_1)
vif(logistic_2)
# stepAIC has removed some variables and only the following ones remain
# Fit the reduced logistic regression on the training data (duration excluded).
logistic_3 <- glm(
  formula = response ~ age + jobretired + loanno + contactcellular +
    monthaug + monthdec + monthjun + monthmar + monthmay + monthnov +
    day_of_weekfri + day_of_weekmon + day_of_weekthu + day_of_weektue +
    campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
    previousLess_than_3_times + poutcomefailure + emp.var.rate +
    cons.price.idx + cons.conf.idx + euribor3m + nr.employed +
    educationTertiary_Education + jobblue_collar + jobservices,
  family = "binomial", data = train_bank
)
# Check multicollinearity and coefficient significance.
vif(logistic_3)
summary(logistic_3)
# Iterative variable elimination: at each step, drop the predictor with the
# highest p-value, refit, and re-check VIF and p-values.
# NOTE(review): models logistic_4 .. logistic_17 are fitted in code not
# visible in this chunk -- confirm upstream.
# Fix: several comment lines below had been hard-wrapped by extraction,
# leaving bare (non-comment) text that is an R syntax error; merged back
# into comments. Code is unchanged.
vif(logistic_4)
summary(logistic_4)
vif(logistic_5)
summary(logistic_5)
vif(logistic_6)
summary(logistic_6)
#Now all variables have VIF <2, hence remove variables based on p-values
#cons.price.idx is removed since it has the highest p-value (0.253)
vif(logistic_7)
summary(logistic_7)
#Now all variables have VIF <2, hence remove variables based on p-values
#educationTertiary_Education is removed since it has the highest p-value (0.1643)
vif(logistic_8)
summary(logistic_8)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weekthu is removed since it has the highest p-value (0.1657)
vif(logistic_9)
summary(logistic_9)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weektue is removed since it has the highest p-value (0.2492)
vif(logistic_10)
summary(logistic_10)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weekfri is removed since it has the highest p-value (0.07824)
vif(logistic_11)
summary(logistic_11)
#Now all variables have VIF <2, hence remove variables based on p-values
#loanno is removed since it has the highest p-value (0.05279)
vif(logistic_12)
summary(logistic_12)
#Now all variables have VIF <2, hence remove variables based on p-values
#monthaug is removed since it has the highest p-value (0.0533)
vif(logistic_13)
summary(logistic_13)
#Now all variables have VIF <2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#monthdec is removed since it has the highest p-value (0.01676)
vif(logistic_14)
summary(logistic_14)
#Now all variables have VIF <2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#age is removed since it has the highest p-value (0.02)
vif(logistic_15)
summary(logistic_15)
vif(logistic_16)
summary(logistic_16)
#Now all variables have VIF<=2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#jobservices is removed since it has the highest p-value (0.00827)
vif(logistic_17)
summary(logistic_17)
#Now all variables have VIF<2 and p-values less than 0.001
#Hence, logistic_17 is chosen as the final model
#Stopping rule: every variable in the final model must have p-value < 0.001.
# Confusion matrix for the final model (computed in code not visible here).
conf
#---------------------------------------------------------
# Creating cutoff values from 0.01 to 0.99 for plotting and initializing a
# matrix of 100 x 3 (one row per cutoff: sensitivity, specificity, accuracy).
# 100 candidate probability cutoffs spread over (0.01, 0.99).
s = seq(.01,.99,length=100)
# One row per cutoff; filled by perform_fn (defined in code not visible in
# this chunk -- presumably returns sensitivity, specificity and accuracy).
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i])
}
#---------------------------------------------------------
# Plot sensitivity (red), specificity (dark green) and accuracy (blue)
# against the cutoff value.
# Fixes: "lwd=2" had been split across two lines by extraction (syntax
# error), and the legend supplied 4 colours/widths for only 3 labels.
plot(s, OUT[,1], xlab = "Cutoff", ylab = "Value", cex.lab = 1.5,
     cex.axis = 1.5, ylim = c(0, 1), type = "l", lwd = 2,
     axes = FALSE, col = 2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0, .50, col = c(2, "darkgreen", 4), lwd = c(2, 2, 2),
       c("Sensitivity", "Specificity", "Accuracy"))
#---------------------------------------------------------
# NOTE(review): acc/sens/spec are computed from confusion matrices in code
# not visible in this chunk (evaluated at two different cutoffs) -- confirm.
acc
#accuracy
#0.7635966
sens
#sensitivity
#0.6702586
spec
#specificity
#0.7754469
#sensitivity is less than 70%, so choose the cutoff slightly less than 7.93%
# Metrics recomputed at the revised (lower) cutoff:
acc
#accuracy
#0.7183555
sens
#sensitivity
#0.6975575
spec
#specificity
#0.720996
#accuracy, sensitivity and specificity all are around or more than 70%
#For model evaluation and for finding cutoff we used the test data frame test_bank.
#However we now have to again find the predicted response based on the entire
#dataset; hence the original data frame bank_data is involved in this step.
#Finding and analysing predicted response and probability of responses for the
#entire original data set
#3. Creating new dataframe "test_predictions" and calculating cost of call for
#each prospect
#Adding required columns from the original data set
#calculate the cost of call for each prospect using the below formula
#Cost per call (INR) = 0.033*(duration_in_seconds) + 0.8
# NOTE(review): test_predictions is assembled in code not visible here.
test_predictions$costOFcall <- 0.8 + 0.033 * test_predictions$duration
#4. Filtering top X% prospects to be targeted and determine the average call
#duration for the same
# Gain Chart: cumulative % of responders captured vs prospects contacted.
# Fix: the xlab string literal contained an embedded hard line break left
# over from document extraction.
plot(LG$Total_Respondents, LG$Gain, col = "red", type = "l",
     main = "Gain Chart", xlab = "total number of respondents",
     ylab = "% of positive Response")
#Thus the average call duration for top 50% respondents is 289.0375 seconds.
#The ratio (response rate using the model / response rate without using the
#model) is nothing but the cumulative lift.
# Lift Chart: cumulative lift vs number of prospects contacted.
# Fix: the main-title string literal contained an embedded hard line break
# left over from document extraction.
plot(LG$Total_Respondents, LG$Cumlift, col = "red", type = "l",
     main = "Lift Chart", xlab = "total number of respondents",
     ylab = "Cumulative Lift")