/*
   Variable clustering program.

   Groups the numeric variables of a dataset into homogeneous clusters
   according to their correlation with one another. No "dependent" variable
   is considered in this analysis. This is useful when you have many
   candidate variables and are trying to decide which to include in a
   regression model: the final listing shows, for each cluster, the variable
   to select if you were to include exactly one variable per cluster.

   Make sure the dataset you cluster does NOT contain your dependent
   variable. In other words, the variables within each cluster are
   attempting to explain the same thing.

   Macro example call:
      %varclustering(mydata,3);
         1st parameter = dataset name
         2nd parameter = number of clusters desired
*/

title Variable Clustering;

/* Sample data: one account identifier and nine numeric measures. */
data MYDATA;
   /* The colon modifier reads ACCOUNT up to the next blank (max 10 chars).
      A bare $10. informat always consumes 10 columns and would swallow the
      start of X1 whenever the identifier is shorter than 10 characters. */
   input ACCOUNT :$10. X1 X2 X3 X4 X5 X6 X7 X8 X9;
   datalines;
ACCOUNT1 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
ACCOUNT2 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
ACCOUNT3 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
ACCOUNT4 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2
ACCOUNT5 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
ACCOUNT6 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
ACCOUNT7 8.4 11.6 3.7 11.1 5.4 24.6 6.5 0.8 3.6
ACCOUNT8 9.5 4.9 2.7 33.7 5.8 26.3 5.1 1.0 1.4
ACCOUNT9 18.0 9.9 3.3 19.5 5.7 28.1 4.8 2.4 6.5
ACCOUNT10 10.2 3.0 2.8 17.6 5.9 41.7 2.2 7.8 6.5
ACCOUNT11 5.3 12.4 2.9 9.7 0.3 40.1 4.0 5.4 4.2
ACCOUNT12 13.9 10.0 4.7 25.8 2.2 24.0 6.2 1.6 2.9
ACCOUNT13 9.0 5.1 2.9 13.7 3.4 36.8 2.1 4.3 6.7
ACCOUNT14 9.5 13.6 3.6 23.4 2.5 22.4 4.2 1.8 3.7
ACCOUNT15 9.4 4.7 2.7 23.3 9.7 23.0 4.6 1.6 2.7
ACCOUNT16 6.9 10.2 2.7 19.3 3.0 36.1 5.9 2.0 6.6
ACCOUNT17 6.2 3.7 1.1 4.9 14.2 27.0 5.9 4.7 7.9
ACCOUNT18 6.2 6.3 1.5 11.1 1.0 49.6 3.1 5.3 2.8
ACCOUNT19 7.1 3.4 3.1 8.6 7.0 29.2 5.7 5.9 7.2
ACCOUNT20 9.9 7.8 3.5 4.7 7.5 19.5 3.7 1.4 2.0
;
run;

/*
   %varclustering(mydata, maxclusters)

   mydata      - name of the dataset whose variables are clustered
   maxclusters - number of clusters requested from PROC VARCLUS

   Side effects: creates/overwrites WORK datasets RSQ (the R-square table,
   with the cluster label filled in on every row and sorted by cluster and
   1-R**2 ratio) and GOT_IT (one row per cluster: the variable with the
   smallest 1-R**2 ratio). Both datasets are printed.
*/
%macro varclustering(mydata,maxclusters);

   /* Fail early with a readable message instead of a PROC syntax error. */
   %if %length(&mydata) = 0 or %length(&maxclusters) = 0 %then %do;
      %put ERROR: varclustering requires a dataset name and a cluster count.;
      %return;
   %end;

   /* MINC= is set equal to MAXCLUSTERS=, presumably so that only the
      requested solution is reported and the ODS OUTPUT capture below yields
      a single RSQ dataset (MATCH_ALL can create one dataset per table).
      The value is inlined rather than stored with %LET so that no macro
      variable outside this macro can be overwritten.
      A former PROC CORR NOPRINT step was dropped: it had no output dataset
      and printed nothing, so it only cost run time. */
   proc varclus data=&mydata minc=&maxclusters maxclusters=&maxclusters;
      ods output rsquare(match_all)=rsq;
      ods select rsquare;
   run;

   /* Fill in the CLUSTER value on rows where it was left blank.
      OLD_CLUSTER first appears as an assignment target so that it inherits
      the type and length of CLUSTER instead of relying on default typing. */
   data rsq;
      set rsq;
      if not missing(cluster) then old_cluster = cluster;
      else cluster = old_cluster;
      retain old_cluster;
      drop old_cluster;
   run;

   proc print data=rsq noobs;
   run;

   /* Pick out the variable with the smallest 1-R**2 ratio per cluster:
      sort ascending by ratio within cluster, then keep the first row. */
   proc sort data=rsq out=rsq;
      by cluster RSquareRatio;
   run;

   data got_it;
      set rsq;
      by cluster;
      if first.cluster;
   run;

   proc print data=got_it noobs;
   run;

%mend varclustering;

%varclustering(mydata,3);