# Bank Marketing -- Prospect Profiling (response prediction & cost analysis)
#-------------------------------------------------------
## Business Understanding:- Prospect Profiling
#The business objective is to achieve 80% of total responders at the minimum
#possible cost.
#The total number of responders is the total number of prospects who responded,
#from the available data of about 45,000 data points.
#-------------------------------------------------------
#----------------------------------------------------------
# The process followed in this assignment is:
# 1. Data Understanding, Preparation and EDA
# 2. Model building, evaluation based on test data and
# finding probabilities of response for entire original
# dataset based on the same.
# 3. Creating a new dataframe with relevant variables and
# finding the average cost of call for each prospect
# 4. Find the number of top X% prospects you should target to meet the
#    business objective
# Report the average call duration for targeting the top X% prospects to the CMO
# 5. Creating a lift chart (no of prospects contacted vs response rate)
#----------------------------------------------------------
# 1. Data Understanding and Preparation ----
# Load the bank marketing dataset (~45k prospects) from a local path.
bank_data<- read.csv("D:\\PG_Diploma\\Elective\\bank_marketing.csv")
str(bank_data)
# Summary of dataset
summary(bank_data)
#-------------------------------------------------------
# Count of missing values across the whole data frame.
sum(is.na(bank_data))
#-------------------------------------------------------
# Box plot of age to inspect outliers.
boxplot(bank_data$age)
# Cap age outliers: any age above 71 is set to 71.
bank_data[(which(bank_data$age>71)),]$age <- 71
# Adding No.of_prospect
# NOTE(review): assumes a "binning.age" column and an "agg_age" data frame
# were created earlier (not visible in this chunk) -- confirm upstream code.
count <- data.frame(table(bank_data$binning.age))
count <- count[,-1]  # keep only the frequency column from table()
agg_age <- cbind(agg_age,count)  # append prospect counts per age bucket
agg_age
#-------------------------------------------------------
# Let's see the response rate of each age bucket in the plot
# Response rate of each age bucket.
# NOTE(review): Bank_data_age20 and plot_response() are defined in code not
# visible in this chunk -- confirm they exist upstream.
View(Bank_data_age20)
summary(Bank_data_age20)
##--------------------------------------------------------
# Job categories and their response rates.
levels(bank_data$job)
plot_response(bank_data$job, "job")
##--------------------------------------------------------
str(bank_data)
# Marital status and education vs response rate.
summary(bank_data$marital)
plot_response(bank_data$marital,"marital")
plot_response(bank_data$education,"Education")
plot_response(bank_data$education,"Education_levels")
#-------------------------------------------------------
# Let's see the default variable
table(bank_data$default)
plot_response(bank_data$default, "Default")
# Drop the 5th column -- presumably "default"; verify the column index.
bank_data <- bank_data[,-5]
#-------------------------------------------------------
summary(bank_data$housing)
plot_response(bank_data$housing, "Housing")
#-------------------------------------------------------
summary(bank_data$loan)
# Next variable is Contact, Let's see the response rate of each mode
summary(bank_data$contact)
plot_response(bank_data$contact,"Contact_mode")
#-------------------------------------------------------
plot_response(bank_data$month,"Contact_month")
#-------------------------------------------------------
plot_response(bank_data$day_of_week,"day_of_week")
#-------------------------------------------------------
# Distribution of call duration.
# NOTE(review): assumes ggplot2 was loaded earlier -- confirm.
ggplot(bank_data,aes(duration))+geom_histogram()
# Average duration
# Factor copy of response so aggregate() groups by it.
bank_data$response_1 <- as.factor(bank_data$response)
Avg_duration <- aggregate(duration~response_1,bank_data,mean)
quantile(bank_data$duration,seq(0,1,0.01))
ggplot(bank_data,aes(duration))+geom_histogram()
#-------------------------------------------------------
summary(bank_data$campaign)
boxplot(bank_data$campaign)
quantile(bank_data$campaign,seq(0,1,0.01))
# Cap campaign outliers at 14.
bank_data[which(bank_data$campaign>14),]$campaign <- 14
ggplot(bank_data,aes(campaign))+geom_histogram()
#-------------------------------------------------------
#-- Next variable is "pdays"
# Let's first convert this variable to factor type
bank_data$pdays<- as.factor(bank_data$pdays)
# Checking summary
summary(bank_data$pdays)
levels(bank_data$pdays)
# Reducing the levels of this variable to 3.
# NOTE(review): the actual level-collapsing code is not visible in this chunk.
# Fix: the column is "pdays" -- the original "$pday" returned NULL.
plot_response(bank_data$pdays,"Pday")
table(bank_data$pdays)
#-------------------------------------------------------
summary(bank_data$previous)
# Max=7, best is to convert this variable to factor
# Fix: convert to factor first -- levels() is NULL on a numeric vector, so the
# level-relabelling assignments below would not work as intended without this.
bank_data$previous <- as.factor(bank_data$previous)
levels(bank_data$previous)[1]<-"Never_contacted"
levels(bank_data$previous)[2:4] <- "Less_than_3_times"
levels(bank_data$previous)[3:6] <- "More_than_3_times"
summary(bank_data$previous)
plot_response(bank_data$previous,"Previous_contacts")
# Now, the next variable is "Poutcome" i.e outcome of the previous marketing campaign
# (categorical: 'failure','nonexistent','success')
summary(bank_data$poutcome)
plot_response(bank_data$poutcome,"Outcome_of_Previous_contacts")
#-------------------------------------------------------
# Required Packages
library(caret)
library(caTools)
library(dummies)  # NOTE(review): archived on CRAN; caret::dummyVars is the modern alternative
#---------------------------------------------------------
#creating dummy variables by converting all categorical variables into numeric ones
# NOTE(review): the dummy-variable creation and the sample.split() call that
# produce 'train' and 'test' are not visible in this chunk -- confirm upstream.
bank_data$response <- as.integer(bank_data$response)
k1 <- bank_data  # backup copy of the prepared data
set.seed(100)  # reproducible train/test split
nrow(train)/nrow(bank_data)  # check train proportion
nrow(test)/nrow(bank_data)   # check test proportion
#removing 'duration' column from the training and testing data sets as it is
#not required for model building
# Drop call duration (column 45): it is unknown before a call is placed, so it
# cannot be used as a predictor.
train_bank <- train[,-45]
test_bank <- test[,-45]
#duration is the 45th column of the training and testing data sets (in that order)
library(MASS)
library(car)
#model building
# NOTE(review): logistic_1 (full glm) and logistic_2 (after stepAIC) are
# fitted in code not visible in this chunk -- confirm upstream.
summary(logistic_1)
vif(logistic_2)
# stepAIC has removed some variables and only the following ones remain
# Fit the reduced logistic regression on the training data (duration excluded).
logistic_3 <- glm(
  formula = response ~ age + jobretired + loanno + contactcellular +
    monthaug + monthdec + monthjun + monthmar + monthmay + monthnov +
    day_of_weekfri + day_of_weekmon + day_of_weekthu + day_of_weektue +
    campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
    previousLess_than_3_times + poutcomefailure + emp.var.rate +
    cons.price.idx + cons.conf.idx + euribor3m + nr.employed +
    educationTertiary_Education + jobblue_collar + jobservices,
  family = "binomial", data = train_bank
)
# Check multicollinearity and coefficient significance.
vif(logistic_3)
summary(logistic_3)
# Iterative variable elimination: at each step, drop the predictor with the
# highest p-value, refit, and re-check VIF and p-values.
# NOTE(review): models logistic_4 .. logistic_17 are fitted in code not
# visible in this chunk -- confirm upstream.
# Fix: several comment lines below had been hard-wrapped by extraction,
# leaving bare (non-comment) text that is an R syntax error; merged back
# into comments. Code is unchanged.
vif(logistic_4)
summary(logistic_4)
vif(logistic_5)
summary(logistic_5)
vif(logistic_6)
summary(logistic_6)
#Now all variables have VIF <2, hence remove variables based on p-values
#cons.price.idx is removed since it has the highest p-value (0.253)
vif(logistic_7)
summary(logistic_7)
#Now all variables have VIF <2, hence remove variables based on p-values
#educationTertiary_Education is removed since it has the highest p-value (0.1643)
vif(logistic_8)
summary(logistic_8)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weekthu is removed since it has the highest p-value (0.1657)
vif(logistic_9)
summary(logistic_9)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weektue is removed since it has the highest p-value (0.2492)
vif(logistic_10)
summary(logistic_10)
#Now all variables have VIF <2, hence remove variables based on p-values
#day_of_weekfri is removed since it has the highest p-value (0.07824)
vif(logistic_11)
summary(logistic_11)
#Now all variables have VIF <2, hence remove variables based on p-values
#loanno is removed since it has the highest p-value (0.05279)
vif(logistic_12)
summary(logistic_12)
#Now all variables have VIF <2, hence remove variables based on p-values
#monthaug is removed since it has the highest p-value (0.0533)
vif(logistic_13)
summary(logistic_13)
#Now all variables have VIF <2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#monthdec is removed since it has the highest p-value (0.01676)
vif(logistic_14)
summary(logistic_14)
#Now all variables have VIF <2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#age is removed since it has the highest p-value (0.02)
vif(logistic_15)
summary(logistic_15)
vif(logistic_16)
summary(logistic_16)
#Now all variables have VIF<=2 and p-values less than 0.05, but the number of
#variables is still too large, so we keep removing variables until all
#p-values are less than 0.001.
#jobservices is removed since it has the highest p-value (0.00827)
vif(logistic_17)
summary(logistic_17)
#Now all variables have VIF<2 and p-values less than 0.001
#Hence, logistic_17 is chosen as the final model
#Stopping rule: every variable in the final model must have p-value < 0.001.
# Confusion matrix for the final model (computed in code not visible here).
conf
#---------------------------------------------------------
# Creating cutoff values from 0.01 to 0.99 for plotting and initializing a
# matrix of 100 x 3 (one row per cutoff: sensitivity, specificity, accuracy).
# 100 candidate probability cutoffs spread over (0.01, 0.99).
s = seq(.01,.99,length=100)
# One row per cutoff; filled by perform_fn (defined in code not visible in
# this chunk -- presumably returns sensitivity, specificity and accuracy).
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i])
}
#---------------------------------------------------------
# Plot sensitivity (red), specificity (dark green) and accuracy (blue)
# against the cutoff value.
# Fixes: "lwd=2" had been split across two lines by extraction (syntax
# error), and the legend supplied 4 colours/widths for only 3 labels.
plot(s, OUT[,1], xlab = "Cutoff", ylab = "Value", cex.lab = 1.5,
     cex.axis = 1.5, ylim = c(0, 1), type = "l", lwd = 2,
     axes = FALSE, col = 2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0, .50, col = c(2, "darkgreen", 4), lwd = c(2, 2, 2),
       c("Sensitivity", "Specificity", "Accuracy"))
#---------------------------------------------------------
# NOTE(review): acc/sens/spec are computed from confusion matrices in code
# not visible in this chunk (evaluated at two different cutoffs) -- confirm.
acc
#accuracy
#0.7635966
sens
#sensitivity
#0.6702586
spec
#specificity
#0.7754469
#sensitivity is less than 70%, so choose the cutoff slightly less than 7.93%
# Metrics recomputed at the revised (lower) cutoff:
acc
#accuracy
#0.7183555
sens
#sensitivity
#0.6975575
spec
#specificity
#0.720996
#accuracy, sensitivity and specificity all are around or more than 70%
#For model evaluation and for finding cutoff we used the test data frame test_bank.
#However we now have to again find the predicted response based on the entire
#dataset; hence the original data frame bank_data is involved in this step.
#Finding and analysing predicted response and probability of responses for the
#entire original data set
#3. Creating new dataframe "test_predictions" and calculating cost of call for
#each prospect
#Adding required columns from the original data set
#calculate the cost of call for each prospect using the below formula
#Cost per call (INR) = 0.033*(duration_in_seconds) + 0.8
# NOTE(review): test_predictions is assembled in code not visible here.
test_predictions$costOFcall <- 0.8 + 0.033 * test_predictions$duration
#4. Filtering top X% prospects to be targeted and determine the average call
#duration for the same
# Gain Chart: cumulative % of responders captured vs prospects contacted.
# Fix: the xlab string literal contained an embedded hard line break left
# over from document extraction.
plot(LG$Total_Respondents, LG$Gain, col = "red", type = "l",
     main = "Gain Chart", xlab = "total number of respondents",
     ylab = "% of positive Response")
#Thus the average call duration for top 50% respondents is 289.0375 seconds.
#The ratio (response rate using the model / response rate without using the
#model) is nothing but the cumulative lift.
# Lift Chart: cumulative lift vs number of prospects contacted.
# Fix: the main-title string literal contained an embedded hard line break
# left over from document extraction.
plot(LG$Total_Respondents, LG$Cumlift, col = "red", type = "l",
     main = "Lift Chart", xlab = "total number of respondents",
     ylab = "Cumulative Lift")