Académique Documents
Professionnel Documents
Culture Documents
csv';
/* Read the CSV behind the REFFILE fileref into WORK.LOAN, taking variable
   names from the first row (GETNAMES=YES).
   REFFILE must already be assigned before this step runs.
   REPLACE is required so that re-running the program in the same session
   overwrites an existing WORK.LOAN; without it PROC IMPORT leaves the old
   table in place and every later step silently works on stale data.
   NOTE(review): column types/lengths are guessed from the default number of
   rows; consider GUESSINGROWS= if values come out truncated - confirm
   against the actual file. */
PROC IMPORT DATAFILE=REFFILE
    DBMS=CSV
    OUT=WORK.LOAN
    REPLACE;
    GETNAMES=YES;
RUN;
/* Missing-value check on the raw data: for every numeric variable report the
   non-missing count (N), the missing count (NMISS) and the mean, median and
   mode. Character variables are not covered by PROC MEANS. */
proc means data=LOAN
           n nmiss mean median mode;
    var _numeric_;   /* same variable set PROC MEANS uses when VAR is omitted */
run;
/* Univariate summaries (with the PLOTS option) for the two income variables,
   run on the raw LOAN table. One PROC UNIVARIATE step per variable. */

/* Co-applicant income */
proc univariate data=LOAN plots;
    var CoapplicantIncome;
run;

/* Applicant income */
proc univariate data=LOAN plots;
    var ApplicantIncome;
run;
/* Build NOMISS: the rows of LOAN that are complete in every variable.
   NMISS counts missing numeric values and CMISS counts missing character
   values on the current row; the subsetting IF keeps the row only when both
   counts are zero. */
data NOMISS;
    set LOAN;
    if nmiss(of _numeric_) = 0 and cmiss(of _character_) = 0;
run;
/* Re-check after the deletion: NMISS should now be 0 for every numeric
   variable in NOMISS. */
proc means data=NOMISS
           n nmiss mean median mode;
    var _numeric_;   /* same variable set PROC MEANS uses when VAR is omitted */
run;
/* Univariate analysis of the remaining continuous variable, LoanAmount,
   on the complete-case table. */
proc univariate data=NOMISS plots;
    var LoanAmount;
run;
/* Data distribution: one histogram per variable on the complete-case table.
   NOTE(review): HISTOGRAM needs numeric variables - confirm that Dependents
   was imported as numeric. */

/* Distribution of the number of dependents */
proc sgplot data=NOMISS;
    histogram Dependents;
run;

/* Distribution of the loan term */
proc sgplot data=NOMISS;
    histogram Loan_Amount_Term;
run;
/* Pie chart of marital status; PERCENT=ARROW labels each slice with its
   percentage via a connecting line.
   GCHART supports RUN-group processing, so RUN only submits the chart and
   the procedure stays active. QUIT ends it explicitly instead of relying on
   the next step boundary to do so. */
PROC GCHART DATA=NOMISS;
    PIE MARRIED / PERCENT=ARROW;
RUN;
QUIT;
/* Cross-tabulation graphs: histograms of each income variable, paneled by a
   classification variable. ONEPANEL places all levels of the PANELBY
   variable in a single panel layout. */

/* Applicant income by marital status */
proc sgpanel data=NOMISS;
    panelby Married / onepanel;
    histogram ApplicantIncome;
run;

/* Co-applicant income by marital status */
proc sgpanel data=NOMISS;
    panelby Married / onepanel;
    histogram CoapplicantIncome;
run;

/* Applicant income by number of dependents */
proc sgpanel data=NOMISS;
    panelby Dependents / onepanel;
    histogram ApplicantIncome;
run;

/* Co-applicant income by number of dependents */
proc sgpanel data=NOMISS;
    panelby Dependents / onepanel;
    histogram CoapplicantIncome;
run;
/* Create 0/1 indicator (dummy) variables from the categorical predictors in
   NOMISS and drop the original character columns.

   SAS character comparisons are case-sensitive. The original step compared
   Gender and Married against all-uppercase literals ('MALE', 'YES') while the
   other variables were compared against mixed-case literals ('Yes', 'Rural',
   ...), so any value not spelled exactly like the literal was coded 0.
   Every comparison now goes through UPCASE() so the coding does not depend
   on how the level is capitalised in the file.

   A logical expression evaluates to 1 when true and 0 when false, which
   gives the same 1/0 coding as the IF/ELSE pairs.

   Property_Area gets two indicators (Rural, Urban); any other level is the
   reference category with both indicators equal to 0. */
DATA DUMMIES;
    SET NOMISS;

    Male_Code                = (UPCASE(Gender)        = 'MALE');
    Married_Code             = (UPCASE(Married)       = 'YES');
    Property_Area_Code_Rural = (UPCASE(Property_Area) = 'RURAL');
    Property_Area_Code_Urban = (UPCASE(Property_Area) = 'URBAN');
    Grad_Code                = (UPCASE(Education)     = 'GRADUATE');
    self_emp_code            = (UPCASE(Self_Employed) = 'YES');

    /* The source columns are fully represented by the indicators above. */
    DROP Gender Married Property_Area Education Self_Employed;
RUN;
/* Response rate: one-way frequency table of the target, Loan_Status. */
proc freq data=DUMMIES;
    tables Loan_Status;
run;
/* Oversample adjustment, part 1.
   Route each row of DUMMIES to YES (Loan_Status = 'Y') or NO (everything
   else), then draw a fixed-size random sample of 148 rows from YES into
   SUCCESS. The seed makes the draw repeatable. */
data YES NO;
    set DUMMIES;
    select (Loan_Status);
        when ('Y') output YES;
        otherwise  output NO;
    end;
run;

proc surveyselect data=YES
                  out=SUCCESS
                  sampsize=148
                  seed=1234567;
run;
/* Stack the sampled approvals (SUCCESS) on top of the NO rows to form the
   rebalanced modelling table.
   The original used MERGE ... BY LOAN_STATUS, which is a match-merge: it
   requires both inputs to be sorted by the BY variable and would overlay
   rows if the two inputs ever shared a BY value. It only produced a stacked
   result because the two inputs have disjoint LOAN_STATUS values. SET
   concatenates the tables directly and has neither dependency.
   Only the row order differs (SUCCESS rows first, then NO rows). */
DATA OVERSAMPLE;
    SET SUCCESS NO;
RUN;
/* Verify the class balance of the rebalanced table. */
proc freq data=OVERSAMPLE;
    tables Loan_Status;
run;
/* Split OVERSAMPLE into training and validation rows.
   SAMPRATE=0.7 selects 70% of the rows; OUTALL keeps every input row in
   SPLIT and adds the Selected flag (1 = sampled, 0 = not sampled), so
   Selected = 1 can serve as the training set and Selected = 0 as the
   validation set.
   SEED= is set so the split is reproducible from run to run; without it the
   seed comes from the system clock and every run yields a different
   partition. The value matches the seed used for the earlier sampling step. */
PROC SURVEYSELECT DATA=OVERSAMPLE
                  OUT=SPLIT
                  OUTALL
                  SAMPRATE=0.7
                  SEED=1234567;
RUN;