/* Page 1 of 4 */

/* Import the loan CSV into WORK.LOAN, reading variable names from the header row. */
/* The path is kept as one unbroken quoted literal so the fileref names Loan.csv.  */
FILENAME REFFILE '/folders/myfolders/Loan.csv';

PROC IMPORT DATAFILE=REFFILE
    DBMS=CSV
    OUT=WORK.LOAN
    REPLACE;          /* overwrite WORK.LOAN when the script is re-run in the same session */
    GETNAMES=YES;     /* first row of the file holds the column names */
RUN;
/* Missing-value audit of the imported data: for every numeric variable report
   the non-missing count, missing count, mean, median and mode. */
proc means data=LOAN n nmiss mean median mode;
    var _numeric_;
run;
/* Univariate summaries and plots for the two income variables, run on the
   imported data (before rows with missing values are removed). */
proc univariate data=LOAN plots;
    var coAPPLICANTINCOME APPLICANTINCOME;
run;
/* Complete-case filter: keep only rows in which no numeric and no character
   variable is missing. */
data NOMISS;
    set LOAN;
    if nmiss(of _numeric_) = 0 and cmiss(of _character_) = 0;
run;
/* Re-run the missing-value audit on the filtered data; NMISS should now be 0
   for every numeric variable. */
proc means data=NOMISS n nmiss mean median mode;
    var _numeric_;
run;
/* Univariate summary and plots for the remaining continuous variable,
   LOANAMOUNT, on the complete-case data. */
proc univariate data=NOMISS plots;
    var LOANAMOUNT;
run;
/* Data distribution: histograms of DEPENDENTS and LOAN_AMOUNT_TERM, then a
   pie chart of MARRIED, all on the complete-case data. */

/* Histogram of DEPENDENTS. */
proc sgplot data=NOMISS;
    histogram DEPENDENTS;
run;

/* Histogram of LOAN_AMOUNT_TERM. */
proc sgplot data=NOMISS;
    histogram LOAN_AMOUNT_TERM;
run;

/* Pie chart of MARRIED; slice percentages are labelled with arrows. */
proc gchart data=NOMISS;
    pie MARRIED / percent=arrow;
run;
/* Cross-tabulation graphs: for each grouping variable, draw a paneled
   histogram of APPLICANTINCOME and then of COAPPLICANTINCOME.
   byvar     - variable passed to PANELBY
   panelopts - optional PANELBY options, including the leading slash */
%macro income_hist_by(byvar, panelopts);
    %local k incvar;
    %do k = 1 %to 2;
        %let incvar = %scan(APPLICANTINCOME COAPPLICANTINCOME, &k);
        proc sgpanel data=NOMISS;
            panelby &byvar &panelopts;
            histogram &incvar;
        run;
    %end;
%mend income_hist_by;

%income_hist_by(MARRIED,       / ONEPANEL)
%income_hist_by(DEPENDENTS,    / ONEPANEL)
%income_hist_by(CREDIT_HISTORY)
%income_hist_by(PROPERTY_AREA, / ONEPANEL)
/* Chi-square tests of independence between pairs of categorical variables.
   One PROC FREQ step requests all thirteen two-way tables; the grouped
   requests expand in the order listed. */
proc freq data=NOMISS;
    tables GENDER * (MARRIED EDUCATION SELF_EMPLOYED CREDIT_HISTORY PROPERTY_AREA) / chisq;
    tables MARRIED * (SELF_EMPLOYED CREDIT_HISTORY PROPERTY_AREA EDUCATION)        / chisq;
    tables EDUCATION * (SELF_EMPLOYED PROPERTY_AREA)                               / chisq;
    tables SELF_EMPLOYED * PROPERTY_AREA                                           / chisq;
    tables CREDIT_HISTORY * PROPERTY_AREA                                          / chisq;
run;

/* CREATING DUMMIES
   Build 0/1 indicator variables from the categorical text columns of NOMISS
   and drop the source columns.

   SAS character comparisons are case-sensitive, so each value is upper-cased
   before comparison. This keeps every indicator correct whether the CSV
   stores values as 'Male', 'MALE', 'Yes', 'YES', and so on.
   A comparison evaluates to 1 when true and 0 when false. */
DATA DUMMIES;
    SET NOMISS;

    Male_Code                = (UPCASE(Gender)        = 'MALE');
    Married_Code             = (UPCASE(Married)       = 'YES');

    /* Two indicators for Property_Area; any value other than Rural/Urban
       is the reference level (both indicators 0). */
    Property_Area_Code_Rural = (UPCASE(Property_Area) = 'RURAL');
    Property_Area_Code_Urban = (UPCASE(Property_Area) = 'URBAN');

    Grad_Code                = (UPCASE(Education)     = 'GRADUATE');
    self_emp_code            = (UPCASE(Self_Employed) = 'YES');

    DROP gender married property_area education self_employed;
RUN;
/* Response rate: one-way frequency table of LOAN_STATUS in the dummy-coded data. */
proc freq data=DUMMIES;
    tables LOAN_STATUS;
run;
/* OVERSAMPLE ADJUSTMENT
   Rebalance the two LOAN_STATUS classes by sampling down the 'Y' rows and
   stacking them with all remaining rows. */

/* Split DUMMIES: rows with LOAN_STATUS = 'Y' go to YES, all others to NO. */
DATA YES NO;
    SET DUMMIES;
    IF LOAN_STATUS = 'Y' THEN OUTPUT YES;
    ELSE OUTPUT NO;
RUN;

/* Draw a fixed-seed random sample of 148 rows from YES.
   NOTE(review): 148 is presumably the row count of NO - confirm against the
   response-rate table. */
PROC SURVEYSELECT DATA=YES SAMPSIZE=148 OUT=SUCCESS SEED=1234567;
RUN;

/* Stack the sampled 'Y' rows with the NO rows. SET with BY interleaves the
   two datasets in LOAN_STATUS order and keeps every row from both inputs;
   it never combines rows side by side the way MERGE does when BY values
   coincide. Both inputs must be ordered by LOAN_STATUS. */
DATA OVERSAMPLE;
    SET SUCCESS NO;
    BY LOAN_STATUS;
RUN;

/* Check the class balance after the adjustment. */
PROC FREQ DATA=OVERSAMPLE;
    TABLES LOAN_STATUS;
RUN;
/* SPLITTING IN TRAINING AND VALIDATION
   Randomly flag 70% of OVERSAMPLE as selected. OUTALL keeps every row and
   adds the SELECTED indicator. SEED= fixes the random stream so the split,
   the fitted model and the validation results are reproducible across runs. */
PROC SURVEYSELECT DATA=OVERSAMPLE OUTALL OUT=SPLIT SAMPRATE=0.7 SEED=1234567;
RUN;

/* Selected rows form the training set; the rest form the validation set. */
DATA TRAIN VALID;
    SET SPLIT;
    IF SELECTED = 1 THEN OUTPUT TRAIN;
    ELSE OUTPUT VALID;
RUN;

/* Class balance within each partition. */
PROC FREQ DATA=TRAIN;
    TABLES LOAN_STATUS;
RUN;
PROC FREQ DATA=VALID;
    TABLES LOAN_STATUS;
RUN;
/* LOGISTIC REGRESSION
   Fit LOAN_STATUS (modelled event: 'Y') on TRAIN using the numeric
   predictors and the dummy indicators, then score VALID.
   Outputs: LOGITS (parameter estimates), troc (ROC data for TRAIN),
            SCORED (scored VALID rows), Vroc (ROC data for VALID).
   Every variable name sits whole on one line; a name broken across lines is
   read as two separate, non-existent variables. */
PROC LOGISTIC DATA=TRAIN OUTEST=LOGITS;
    MODEL LOAN_STATUS(EVENT = 'Y') =
          dependents applicantincome coapplicantincome loanamount loan_amount_term
          credit_history male_code married_code grad_code self_emp_code
          property_area_code_rural property_area_code_urban
          / OUTROC=troc;
    ROC;
    SCORE DATA=VALID OUT=SCORED OUTROC=Vroc;
RUN;
/* Confusion matrix on the scored validation data: cross-tabulate
   F_LOAN_STATUS against I_LOAN_STATUS.
   NOTE(review): these are presumably the observed ("from") and predicted
   ("into") levels written by the SCORE statement - confirm. */
proc freq data=SCORED;
    tables F_LOAN_STATUS * I_LOAN_STATUS;
run;