/* In this example, X1 and X2 are highly correlated with each other and shouldn't be used in a regression. This program examines all the possible pairwise correlation and bivariate (correlations with dependent variable) in an effort to eliminate variables that are too highly correlated - i.e. above a threshold - say .7. The program looks at those variables above the threshold and determines which ones are more correlated with the dependent variable and eliminates the others. For variables where pairwise correlation is not a problem (below the threshold, they are included as part of the final listing of "good" variables. This program should let you eliminate pairwise correlation problems - the most common form of correlation in regression models. However, it does not eliminate instances where the combination of three or more variables causes a collinearity problem. This problem is a rarer occurance. In this example, X1 is slightly more correlated with the dependent variable (Y) than X2, so the listing at the end excludes X2, as it should. All other variables in this example do not have correlation problems, so they are included. Macro Parameters: 1st position = your dataset 2nd position = your correlation threshold - .7 is a safe bet. 3rd position = dependent variable 4th position = print correlation table (could be huge). Notes - the final correlation table is called HERE2, in case you are interested. - if there are NO pairwise correlations above the threshold, you will get an ERROR Message at end. In that case, check the top of HERE2 and look at variable ABS_PCORR. Example macro call... %correlation(MYDATA,.7,Y,Y); */ data MYDATA; input ACCOUNT $10. X1 X2 X3 X4 X5 X6 X7 X8 X9 Y; CARDS; ACCOUNT1 10 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7 ACCOUNT2 9 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3 ACCOUNT3 12 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0 ACCOUNT4 8 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2 ACCOUNT5 10 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0 ACCOUNT6 10 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4 ACCOUNT7 9 8.4 11.6 3.7 11.1 5.4 24.6 6.5 0.8 3.6 ACCOUNT8 9 9.5 4.9 2.7 33.7 5.8 26.3 5.1 1.0 1.4 ACCOUNT9 18 18.0 9.9 3.3 19.5 5.7 28.1 4.8 2.4 6.5 ACCOUNT10 9 10.2 3.0 2.8 17.6 5.9 41.7 2.2 7.8 6.5 ACCOUNT11 5 5.3 12.4 2.9 9.7 0.3 40.1 4.0 5.4 4.2 ACCOUNT12 12 13.9 10.0 4.7 25.8 2.2 24.0 6.2 1.6 2.9 ACCOUNT13 8 9.0 5.1 2.9 13.7 3.4 36.8 2.1 4.3 6.7 ACCOUNT14 8 9.5 13.6 3.6 23.4 2.5 22.4 4.2 1.8 3.7 ACCOUNT15 9 9.4 4.7 2.7 23.3 9.7 23.0 4.6 1.6 2.7 ACCOUNT16 7 6.9 10.2 2.7 19.3 3.0 36.1 5.9 2.0 6.6 ACCOUNT17 6 6.2 3.7 1.1 4.9 14.2 27.0 5.9 4.7 7.9 ACCOUNT18 6 6.2 6.3 1.5 11.1 1.0 49.6 3.1 5.3 2.8 ACCOUNT19 6 7.1 3.4 3.1 8.6 7.0 29.2 5.7 5.9 7.2 ACCOUNT20 9 9.9 7.8 3.5 4.7 7.5 19.5 3.7 1.4 2.0 ; RUN; %macro correlation(data,cut2,depv,printme); %let cut=0; OPTIONS NOTES SOURCE SOURCE2 MPRINT MLOGIC MERROR SYMBOLGEN; title1 Pairwise CORRELATION Analysis; data tmp; set &data; proc corr data=tmp noprint out=mycorr; run; proc corr data=&data outp=p noprint; data _null_; set p (firstobs=4 drop=_name_ _type_) end= last; array l(*) _numeric_; length name $32; file './lista.txt'; if _n_ = 1 then put '%macro lista;' ; do i=1 to dim(l); call vname (l(i), name); if _n_ =1 then put name; end; if last then put '%mend lista;'; file './listb.txt'; if _n_ = 1 then put '%macro listb;' ; do i=1 to dim(l)-1; call vname (l(i), name); if _n_ =1 then put "'" name "',"; end; do j=dim(l) to dim(l); call vname (l(j), name); if _n_ =1 then put "'" name "'"; end; if last then do; put '%mend listb;'; call symput ('nvar', _n_ ); end; run; %inc './lista.txt'; %inc './listb.txt'; ************ extract unique pairwise correlation coefficients *************; data getr (keep=VAR_1_LABEL VAR_2_LABEL p_corr abscorr); set p (firstobs=4); array pcorr {&nvar} %lista; array names {&nvar} $32 (%listb); if _type_='CORR' then do; do i=_n_ to dim (pcorr); if _n_ ^=i then do; p_corr=pcorr{i}; VAR_1_LABEL=_NAME_; VAR_2_LABEL=names{i}; abscorr=abs(p_corr); output; end; end; end; run; data getr; set getr; *if abscorr<&cut then delete; run; *Creating Bivariate data (Getting rid of Pairwise); data bivariate; set getr; if VAR_1_LABEL ^="&depv" AND VAR_2_LABEL ^="&depv" then delete; run; *Creating Only Pairwise table (not including depend. var); data getr2; set getr; if VAR_1_LABEL =&depv then delete; if VAR_2_LABEL =&depv then delete; run; proc sort data=getr out=getr(drop=abscorr); by descending abscorr; run; data answer; set getr2; run; data answer; set answer; run; data bivariate2(drop=abscorr var_2_label p_corr); set bivariate; bivar_var1=p_corr; if var_1_label="&depv" then do; var_1_label=var_2_label; end; n=_n_; run; proc sort data=answer; by var_1_label; run; proc sort data=bivariate2; by var_1_label; run; data here; merge answer bivariate2; by var_1_label; run; data bivariate2(drop=bivar_var1 VAR_1_LABEL); set bivariate2; bivar_var2=bivar_var1; VAR_2_LABEL=VAR_1_LABEL; run; proc sort data=bivariate2; by var_2_label; RUN; proc sort data=here; by var_2_label; run; data here2(keep=abs_pcorr var_1_label bivar_var1 var_2_label bivar_var2 p_corr); merge here bivariate2; by VAR_2_LABEL; ABS_PCORR=abs(p_corr); if p_corr=. then delete; if var_1_label="&depv" then delete; if var_2_label="&depv" then delete; if abs_pcorr<&cut then delete; run; proc sort data=here2; by descending ABS_PCORR; RUN; data here2; set here2; length A1 $32; length A2 $32; if abs(bivar_var1)>abs(bivar_var2) then A1=VAR_1_LABEL; if abs(bivar_var1)<=abs(bivar_var2) then A1=VAR_2_LABEL; if abs(bivar_var1)<=abs(bivar_var2) then A2=VAR_1_LABEL; if abs(bivar_var1)>abs(bivar_var2) then A2=VAR_2_LABEL; run; %if &printme=Y %then %do; title1 'BIVARIATE / PAIRWISE CORRELATIONS'; footnote; OPTIONS LS=145 ps=45 center ERROR=1 nodate nonumber; PROC PRINT DATA=here2 NOOBS SPLIT='/'; LABEL P_CORR = 'PAIRWISE/CORRELATION/=========' VAR_1_LABEL= 'NAME 1/=========' VAR_2_LABEL= 'NAME 2/=========' A1= 'SELECTED/VARIABLE/=========' A2= 'UNSELECTED/VARIABLE/=========' BIVAR_VAR1 = "NAME 1/BIVARIATE/CORRELATION/========" BIVAR_VAR2 = "NAME 2/BIVARIATE/CORRELATION/========"; VAR P_CORR VAR_1_LABEL VAR_2_LABEL A1 A2 BIVAR_VAR1 BIVAR_VAR2; RUN; %end; /* First collect collinear varlist that have pairwise greater than threshold and call it ADDME. These are the winners when compared to correlation to dependent variable. A1 is the column for winner vars. A2 is for loser vars. */ data addme(keep=A1); set here2; if abs_pcorr<=&cut2 then delete; run; /* Next collect varlist with no collinearity problems (lower than threshold). Call it KEEPME. We have to use both var_1_label and var_2_label to capture all the pairwise listings. We do this by appending and then deduping. */ data keepme(keep=var_1_label); set here2; if abs_pcorr>&cut2 then delete; run; data keepme2(keep=var_1_label); set here2; var_1_label=var_2_label; if abs_pcorr>&cut2 then delete; run; proc append base=keepme data=keepme2 force;run; /* Append these non collinear and winner variables together - use FORCE. */ proc append base=keepme data=addme force;run; /* Dedupe variables - there are alot of duplications cause of many combination of pairwise. */ proc sort nodup;by var_1_label;run; /* Now we must get rid of the column A2 var list - losers of the ones that were collinear - i.e. didn't win when compared to correlation with dependent variable. */ data dropme(keep=a2); set here2; if abs_pcorr<=&cut2 then delete; run; proc sort nodup;by a2;run; %macro last; OPTIONS NOTES SOURCE SOURCE2 MPRINT MLOGIC MERROR SYMBOLGEN; /* DROPME is from A2 and we use these vars to delete from our good vars from KEEPME. First we find out how many there are for looping process. */ data _null_; set dropme end=last; if last then call symput('nfiles',_n_); run; /* Start the looping process to delete. */ %do i=1 %to &nfiles; data _null_; set dropme; /* The line of code below is the key here */ if &i=_n_ then call symput('remove',trim(A2)); run; data keepme; set keepme; if var_1_label="&remove" then flag=1; run; %end; data keepme(drop=flag); set keepme; if var_1_label='' then delete; if flag=1 then delete; run; %mend last; %last; title Uncorrelated Variable List; proc print data=keepme;run; %mend correlation; *====== call macro ========; %correlation(MYDATA,.7,Y,Y); *==========================;