## You are on page 1 of 5 (page-navigation text, not R code)

## Load the semicolon-separated churn dataset from the GitHub repository.
## The URL is written on a single line: a line break inside the string
## literal becomes part of the URL and the download fails.
## NOTE(review): the file name is taken as "FINAL_MD" (the two pieces of the
## wrapped literal joined together) — confirm it against the repository.
## stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, where
## the default changed to FALSE; the as.numeric() recoding of the categorical
## columns later in this script relies on factor codes.
final <- read.csv(
  "https://raw.githubusercontent.com/VictorManuelGP/datamining/master/FINAL_MD",
  sep = ";",
  stringsAsFactors = TRUE
)

## Data exploration
head(final)
dim(final)
str(final)
summary(final)

## Histograms of the quantitative variables

## The name of the "Mayor65..." column contains a non-ASCII character whose
## spelling after read.csv() depends on the locale, so the column is looked
## up by its ASCII prefix instead of being typed out.
## NOTE(review): assumes exactly one column name starts with "Mayor65".
mayor65_col <- grep("^Mayor65", names(final), value = TRUE)
stopifnot(length(mayor65_col) == 1L)

par(mfrow = c(2, 2))
## NOTE(review): "pemanencia" is kept exactly as written — confirm that this
## is the column's real spelling in the dataset.
hist(final$pemanencia, col = 5)
hist(final$ImporteTotal, col = 2)
hist(final$CargoMensual, col = 3)
hist(
  final[[mayor65_col]],
  col = 4,
  main = paste("Histogram of", mayor65_col),
  xlab = mayor65_col
)

## Box-and-whisker plots of the quantitative variables
par(mfrow = c(2, 2))
boxplot(final$pemanencia, col = 3, main = "Pemanencia")
boxplot(final$ImporteTotal, col = 4, main = "ImporteTotal")
boxplot(final$CargoMensual, col = 2, main = "CargoMensual")
## The title is the column name as it was actually read from the file.
boxplot(final[[mayor65_col]], col = 1, main = mayor65_col)

## Imputation of missing values followed by pie charts of the categorical
## variables. The pie charts read final.imputacion, so the imputation that
## creates it runs first.

## Drop the id.cliente variable (first column)
library(VIM)
final.Vna <- final[, -1]

## Plot showing the missing values
aggr(final.Vna, numbers = TRUE)

## Data imputation (k-nearest neighbours, weighted average of 20 neighbours)
library(DMwR)
library(kknn)
final.imputacion <- knnImputation(
  final.Vna,
  k = 20,
  scale = TRUE,
  meth = "weighAvg",
  distData = NULL
)

## Plot after imputing the missing values
aggr(final.imputacion, numbers = TRUE)
summary(final.imputacion)
str(final.imputacion)

## Pie charts

## Draw a pie chart of the percentage share of each category.
##   values: vector whose categories are tabulated with table()
##   title:  main title of the chart
## Each slice is labelled with the category name and, on a second line, its
## percentage rounded to two decimals.
plot_category_pie <- function(values, title) {
  percentages <- prop.table(table(values)) * 100
  slice_labels <- paste(
    names(percentages), "\n", round(as.vector(percentages), 2), "%",
    sep = " "
  )
  pie(percentages, labels = slice_labels, main = title)
}

## Column name -> chart title, in drawing order.
pie_titles <- c(
  Genero = "Genero",
  Socio = "Socio",
  Dependientes = "Dependientes",
  MultipleLineas = "Multilineas",
  Internet = "Internet",
  SeguridadOnline = "Seguridad Online",
  Backup = "Backup",
  ProteccionDispositivo = "Proteccion Dispositivo",
  No.servicio.Internet = "No Servicio internet",
  TVPago = "Tv Pago",
  PeliculasStreaming = "Peliculas Streaming",
  TerminoContrato = "Termino Contrato",
  FactElectronica = "Facturacion Electronica",
  MetodoPago = "Metodo pago",
  FonoServicio = "Fono Servicio",
  Churn = "Churn"
)

## A 2 x 2 layout starts a new page after every fourth chart, so the sixteen
## charts come out as four pages of four.
par(mfrow = c(2, 2))
invisible(Map(
  plot_category_pie,
  final.imputacion[names(pie_titles)],
  pie_titles
))

## Logistic regression with the significant variables
## (Churn is modelled against every other column of the imputed data.)
modelo <- glm(
  formula = Churn ~ .,
  data = final.imputacion,
  family = "binomial"
)
summary(modelo)

## Recode the listed columns with as.numeric() and store Churn as a factor.
## The work is done on a local copy so no helper variables are left behind.
final.imputacion <- local({
  recoded <- final.imputacion
  to_numeric <- c(
    "ImporteTotal", "CargoMensual", "Genero", "Socio", "Dependientes",
    "MultipleLineas", "Internet", "SeguridadOnline", "FactElectronica",
    "MetodoPago", "FonoServicio", "TerminoContrato", "PeliculasStreaming",
    "TVPago", "No.servicio.Internet", "Backup", "ProteccionDispositivo"
  )
  recoded[to_numeric] <- lapply(recoded[to_numeric], as.numeric)
  recoded$Churn <- as.factor(recoded$Churn)
  recoded
})

summary(final.imputacion)
## Class balancing with SMOTE, variable selection and train / test split
library(unbalanced)

## Seed the random number generator before any sampling: the balancing and
## the split both draw random numbers, so without a seed the training and
## test sets change on every run.
set.seed(123)

n <- ncol(final.imputacion)
output <- final.imputacion$Churn
## Drops the last column as the response.
## NOTE(review): assumes Churn is the last column — confirm.
input <- final.imputacion[, -n]

## NOTE(review): check the ubSMOTE() documentation for the factor coding it
## expects for Y, and confirm that Churn is coded that way.
finalB <- ubSMOTE(X = input, Y = output)

## cbind() leaves the response column named `finalB$Y`; the model formulas
## further down refer to it by that name.
newData <- cbind(finalB$X, finalB$Y)

str(newData)
## Keep only the modelling columns; the others are dropped by position.
finalChurn <- newData[, -c(1:6, 8:9, 11:12, 16:17, 19)]
str(finalChurn)
dim(finalChurn)

## Data split
## The test-set size keeps the original 10866 / 25352 proportion but is
## derived from the actual number of rows, so a different balanced size
## cannot produce out-of-range indices (which would yield all-NA rows).
n_rows <- nrow(finalChurn)
muestra <- sample(n_rows, round(n_rows * 10866 / 25352))
train <- finalChurn[-muestra, ]
test <- finalChurn[muestra, ]

## ------------------------------------------------------------------- ##
## Model: classification tree (rpart)
library(rpart)
library(caret)
set.seed(123)

## Fit on the training set, then classify the test set.
modeloA <- rpart(
  `finalB$Y` ~ .,
  data = train,
  method = "class",
  cp = 0.0001
)
predA <- predict(modeloA, newdata = test, type = "class")
resulA <- confusionMatrix(data = predA, reference = test$`finalB$Y`)
resulA

## ROC curve
library(ROCR)
## Second column of the class-probability matrix is used as the score.
predAA <- predict(modeloA, newdata = test, type = "prob")[, 2]
predA1 <- prediction(predictions = predAA, labels = test$`finalB$Y`)
predA2 <- ROCR::performance(predA1, measure = "tpr", x.measure = "fpr")

plot(predA2, colorize = TRUE)
## Reference diagonals drawn over the curve.
lines(x = c(0, 1), y = c(0, 1), col = " blue", lwd = 1, lty = 3)
lines(x = c(1, 0), y = c(0, 1), col = "red", lwd = 1, lty = 4)

## Gini coefficient, from the AUC expressed as a percentage
AUROC <- round(
  100 * ROCR::performance(predA1, measure = "auc")@y.values[[1]],
  digits = 2
)
giniA <- 2 * AUROC - 100
giniA

## ------------------------------------------------------------------- ##
## Naive Bayes
library(e1071)
library(caret)
set.seed(123)

## Fit on the training set, then classify the test set.
## The former `method = "class"` argument was dropped: it is an rpart()
## option, not a naiveBayes() one, and was silently ignored.
modeloB <- naiveBayes(`finalB$Y` ~ ., data = train)
predB <- predict(modeloB, test, type = "class")
resulB <- confusionMatrix(predB, test$`finalB$Y`)
resulB

## ROC curve
library(ROCR)
## Second column of the posterior-probability matrix is used as the score.
predBB <- predict(modeloB, test, type = "raw")[, 2]
predB1 <- prediction(predBB, test$`finalB$Y`)
predB2 <- ROCR::performance(predB1, "tpr", "fpr")

plot(predB2, colorize = TRUE)
## Reference diagonals drawn over the curve.
lines(x = c(0, 1), y = c(0, 1), col = " blue", lwd = 1, lty = 3)
lines(x = c(1, 0), y = c(0, 1), col = "red", lwd = 1, lty = 4)

## Gini coefficient, from the AUC expressed as a percentage
ROCB <- round(
  ROCR::performance(predB1, measure = "auc")@y.values[[1]] * 100, 2
)
giniB <- (2 * ROCB - 100)
giniB
## ------------------------------------------------------------------- ##
## Neural network
library(nnet)
library(caret)
set.seed(123)

## Fit on the training set, then classify the test set.
modeloR <- nnet(
  `finalB$Y` ~ .,
  data = train, size = 10, trace = FALSE, maxit = 1000
)
predR <- predict(modeloR, test, type = "class")
## predict() returns the class labels as text. Build the factor with the
## reference levels so that a class the network never predicts is still a
## level, and the prediction and reference factors always agree.
predR <- factor(predR, levels = levels(test$`finalB$Y`))
resulR <- confusionMatrix(predR, test$`finalB$Y`)
resulR

## ROC curve
library(ROCR)
## Raw network output is used as the score.
predRN <- predict(modeloR, test, type = "raw")
predR1 <- prediction(predRN, test$`finalB$Y`)
predR2 <- ROCR::performance(predR1, "tpr", "fpr")

plot(predR2, colorize = TRUE)
## Reference diagonals drawn over the curve.
lines(x = c(0, 1), y = c(0, 1), col = " blue", lwd = 1, lty = 3)
lines(x = c(1, 0), y = c(0, 1), col = "red", lwd = 1, lty = 4)

## Gini coefficient, from the AUC expressed as a percentage
ROCRN <- round(
  ROCR::performance(predR1, measure = "auc")@y.values[[1]] * 100, 2
)
giniRN <- (2 * ROCRN - 100)
giniRN