/* *WARNING: This program has some pathing statements hardcoded; The easiest way to choose a proxy for missing information is to select the mean, median, or mode values for data that you do have information on. However, if you are developing a binary classification model such is often done in credit scoring or new product forecasting, then an additional method becomes available which may prove even better. For example, let's say you have a variable called BAD and X1 for about 300 observations. Let BAD represent the classification category and let X1 represent a predictor variable which contain some missing values. If there is information content (predictability) in the simple fact that you have missing information for X1, then you would want to capture it where possible. One way to do this is to examine the classification rates for X1. For example, maybe for the missing category, your classification rate for X1 is 7.2%. This means that 7.2% of your X1 variable with missing information has an event value of 1. (BAD=1 if event is present, otherwise 0). In your search for a proxy for missing, you may want to try to find a X1 value to use that has the closest event classification rate to your missing category. This method accomplishes this task by collapsing your X1 values into a smaller set of interval categories so a comparison can be made with your missing information and the event variable. This process is sometimes called discretizing. Discretized code is produced and used to transform your original data. Next, the closest proxy for your missing information is obtained for each variable in your dataset by making comparisons with the event variable. */ data mydata; input BAD VAR1 VAR2 VAR3; cards; 1 . 0.314821 0.099593 0 . 0.648579 0.950966 1 . 0.636076 0.145751 1 . 0.330113 0.801662 0 . 0.610691 0.7352 0 . 0.025624 0.676796 1 . 0.103856 0.85479 1 . 0.499845 0.501913 0 . 0.52121 0.404369 1 . 0.327999 0.866115 0 . 0.614016 0.345729 0 . 0.32601 0.569811 0 . 0.757832 0.631763 0 . 0.315479 0.416465 1 . 0.603023 0.714122 0 . 0.407301 0.643335 1 . 0.695868 0.04587 0 . 0.818007 0.939681 0 . 0.497257 0.791191 0 . 0.169324 0.23154 0 . 0.853136 0.88694 1 . 0.282407 0.207318 0 . 0.472449 0.416685 0 . 0.545364 0.723382 0 . 0.830659 0.540843 0 . 0.701782 0.192597 0 . 0.116723 0.554313 0 . 0.961239 0.985007 1 . 0.334251 0.433777 0 . 0.229602 0.201509 1 . 0.601339 0.561184 1 . 0.40107 0.139007 0 . 0.442432 0.695574 0 . 0.12529 0.552583 0 . 0.478303 0.180966 0 . 0.424646 0.406803 0 . 0.490991 0.48501 1 . 0.363494 0.152037 1 . 0.381093 0.25936 1 . 0.098726 0.95875 0 . 0.576553 0.98008 1 . 0.412747 0.874575 1 . 0.31668 0.033307 0 . 0.124703 0.320427 1 . 0.500406 0.185448 0 . 0.130126 0.65746 1 . 0.408636 0.980674 1 . 0.038087 0.276533 1 . 0.418588 0.258549 0 29 0.596229 0.389511 1 49 0.415939 0.886183 1 28 0.642911 0.531354 1 23 0.425187 0.992996 0 45 0.77542 0.954543 0 22 0.536117 0.323829 1 66 0.269605 0.512479 0 30 0.504115 0.05033 0 43 0.033432 0.326222 0 34 0.824359 0.073884 0 32 0.254308 0.911622 1 67 0.493281 0.460878 0 25 0.930062 0.539937 1 49 0.969525 0.197104 0 33 0.893668 0.076233 0 30 0.573458 0.650887 1 34 0.269346 0.39691 0 33 0.921761 0.947023 0 32 0.850215 0.049449 0 30 0.726416 0.089249 1 66 0.519936 0.460881 0 29 0.949457 0.666606 1 25 0.593561 0.706892 1 55 0.911603 0.428337 0 22 0.610379 0.974992 1 28 0.856064 0.799679 0 22 0.245071 0.562064 0 28 0.914548 0.183861 0 45 0.212329 0.69718 1 65 0.015549 0.695327 1 78 0.394434 0.660542 1 66 0.528462 0.603128 1 99 0.004284 0.276122 0 21 0.876159 0.840817 0 23 0.165018 0.353422 0 22 0.438785 0.050018 1 88 0.992341 0.530983 0 28 0.273392 0.292034 1 49 0.806084 0.716688 1 55 0.008057 0.207087 0 29 0.989654 0.741959 0 87 0.446867 0.560659 1 66 0.229361 0.568274 1 77 0.784757 0.625453 1 26 0.615188 0.539701 0 45 0.946377 0.154845 0 43 0.332143 0.393353 1 64 0.490094 0.475744 0 45 0.918481 0.06086 0 30 0.734093 0.665973 0 56 0.203579 0.347113 0 30 0.882788 0.712216 0 30 0.837742 0.094861 1 78 0.351134 0.099393 1 77 0.478132 0.500994 1 49 0.647013 0.844561 0 32 0.045798 0.694821 0 29 0.539531 0.446831 1 89 0.388354 0.207227 0 30 0.876641 0.959644 1 55 0.904474 0.103624 1 49 0.475321 0.885375 0 33 0.82756 0.542981 0 23 0.608773 0.943427 1 64 0.036046 0.614584 1 49 0.694035 0.177933 1 57 0.982775 0.506748 1 28 0.288058 0.141032 0 22 0.733673 0.56914 0 22 0.600035 0.379838 0 33 0.855031 0.606419 0 46 0.469211 0.04045 0 22 0.818988 0.588393 1 59 0.700961 0.908308 0 22 0.605883 0.496222 1 59 0.673742 0.888449 1 33 0.963604 0.285995 0 55 0.034625 0.456046 0 49 0.496689 0.356103 1 77 0.909181 0.730241 0 22 0.983117 0.624862 1 55 0.605892 0.174361 1 22 0.762288 0.729615 1 44 0.601913 0.94587 0 21 0.087109 0.016788 1 49 0.062094 0.970456 1 55 0.232084 0.458 0 33 0.088426 0.303928 0 30 0.072814 0.139687 0 29 0.661764 0.111134 0 49 0.386728 0.713489 1 21 0.022579 0.727911 1 55 0.113909 0.485175 0 22 0.064137 0.498407 0 28 0.004839 0.310233 0 25 0.581785 0.526257 1 98 0.090562 0.194978 1 43 0.336567 0.733543 0 49 0.822901 0.26979 1 76 0.782889 0.98173 0 30 0.095162 0.357284 0 32 0.033208 0.715184 1 59 0.705644 0.961663 0 21 0.023922 0.088974 0 55 0.568638 0.481046 1 77 0.052067 0.195178 0 34 0.601193 0.693859 1 59 0.994898 0.537601 1 65 0.130559 0.055892 1 78 0.108592 0.078597 0 19 0.280197 0.333782 1 65 0.263348 0.459836 1 59 0.161888 0.366251 1 55 0.798812 0.5498 0 29 0.007605 0.575379 1 49 0.437032 0.968051 0 23 0.195624 0.481008 1 34 0.445839 0.794648 0 30 0.650404 0.326183 1 59 0.597828 0.881721 1 22 0.315203 0.337145 0 22 0.193094 0.301088 1 78 0.394816 0.759208 1 66 0.040272 0.119522 0 25 0.460029 0.729584 0 43 0.69717 0.860043 1 66 0.778796 0.668523 0 21 0.601797 0.965929 0 23 0.941311 0.854977 0 29 0.65502 0.84558 1 56 0.620271 0.091275 0 24 0.009893 0.760391 1 55 0.365422 0.146742 1 59 0.724334 0.418849 0 34 0.249225 0.363205 0 34 0.721374 0.004328 1 66 0.153219 0.211174 0 34 0.201097 0.032582 0 32 0.726892 0.795593 0 33 0.688666 0.0000486 1 88 0.710448 0.997403 0 29 0.232084 0.722434 0 22 0.612402 0.236982 1 17 0.527581 0.282692 1 54 0.764135 0.738372 1 77 0.923658 0.103524 1 59 0.677911 0.254299 0 33 0.806426 0.367844 0 32 0.72313 0.087823 1 59 0.884718 0.781455 0 34 0.549183 0.388997 0 33 0.043759 0.475291 0 30 0.314277 0.491101 0 45 0.386056 0.73365 0 29 0.850467 0.504596 0 78 0.090033 0.090508 0 30 0.267298 0.936876 0 22 0.010279 0.349638 1 49 0.674207 0.503255 1 49 0.524574 0.702812 1 33 0.435955 0.234215 1 57 0.658304 0.569336 1 17 0.767949 0.127233 1 55 0.539106 0.37861 1 76 0.88163 0.222143 0 29 0.669774 0.256334 0 23 0.218532 0.278587 0 33 0.508075 0.913583 0 29 0.823546 0.100459 0 29 0.896003 0.345045 0 29 0.16764 0.182767 0 28 0.210887 0.813766 0 33 0.053346 0.176035 1 59 0.849553 0.651322 1 55 0.394241 0.136054 1 66 0.542956 0.337474 1 67 0.668135 0.63318 0 30 0.380535 0.216696 0 21 0.252422 0.13553 0 30 0.47547 0.549258 0 22 0.339097 0.019563 0 30 0.74235 0.31102 0 45 0.093139 0.200986 0 28 0.898954 0.153313 1 65 0.900928 0.980826 0 30 0.742124 0.177735 1 76 0.883943 0.563657 1 49 0.346703 0.793721 1 34 0.75462 0.546639 0 22 0.905003 0.300855 0 29 0.322278 0.612576 1 44 0.706533 0.816362 0 31 0.977669 0.891433 0 29 0.095139 0.824039 0 33 0.696932 0.866255 0 33 0.204931 0.77548 0 31 0.882071 0.63422 1 59 0.694104 0.636237 0 22 0.114065 0.548407 0 22 0.007479 0.400753 1 55 0.273739 0.250795 1 43 0.920749 0.898962 0 22 0.826653 0.048734 0 25 0.439581 0.812466 0 31 0.258062 0.596176 0 29 0.01674 0.55471 1 44 0.800906 0.49838 0 45 0.070814 0.6097 0 29 0.565107 0.614943 1 55 0.453773 0.092785 0 30 0.824868 0.160942 0 30 0.573673 0.167433 1 49 0.6281 0.956462 0 30 0.799247 0.585438 0 25 0.38251 0.057344 0 29 0.583994 0.765642 0 33 0.415387 0.983118 1 55 0.187298 0.625799 0 44 0.98875 0.508276 0 29 0.450917 0.882928 1 36 0.366209 0.651782 0 21 0.375269 0.302532 1 66 0.662063 0.597745 0 49 . . 0 36 . . 0 30 . . 1 88 . . 1 49 . . 0 45 . . 1 46 . . 0 21 . . 1 66 . . 0 23 . . 1 59 . . 0 28 . . 1 59 . . 0 29 . . 1 55 . . 0 25 . . 0 29 . . 0 55 . . 0 21 . . 0 28 . . %macro discretor_test(data, mybad,mybuckets); OPTIONS NOTES SOURCE SOURCE2 MPRINT MLOGIC MERROR SYMBOLGEN; *WARNING: This program has some pathing statements hardcoded; * First, we take the dataset and write it to a temp file; * as we will be changing the data values through a discretizing; * process. The code for the process will reside in discretor_code.txt; data tmp(drop=increments); set &data; file 'c:\test\discretor_code.txt'; increments=100/&mybuckets; call symput('increments',increments); run; * 2nd we get a listing of variable names; proc contents data=tmp out=test noprint; run; * Saving number and names of variables as a macro variable; data test; set test end=last; if last then call symput ('nvars',_n_); do; n+1; call symput('vname'||trim(left(n)),trim(name)); end; run; *This macro creates variable name list; %macro names (name, number); %do n=1 %to &number; &name&n %end; %mend names; *This is the main macro which does all the discretized crunching; %macro interative(nvars); %do n=1 %to &nvars; PROC UNIVARIATE data=tmp noprint; var &&vname&n; *Increments can be changed to collapse or expand the descritizing process; output out=xxx pctlpre=pct1 pctlpts=0 to 100 by &increments min=min; run; *Preparing the data for crunching process; data xxx2 (keep=group value); length value $12; set xxx end=last; if last then call symput ('minimum',min); array xyz{*} _numeric_; length group $32; do i = 1 to dim(xyz); if (xyz{i} ne .) then call vname (xyz{i},group); value=(xyz{i}); put group; output; end; run; data xxx2(keep=group value0 value1); set xxx2; value0=lag(value); value1=value; if value0=value1 then delete; run; data _null_; set xxx2 end=last; if last then call symput ('nobs',_n_); run; *Now we read off of table xxx2 to get the discretized components; data _null_; set xxx2 end=last; %if &nobs>1 %then %do; file 'c:\test\discretor.txt'; if _n_=1 then do; put '%macro discretor;'; put 'if ' "&&vname&n" '^=. and ' "&&vname&n" '<=' value1 ' then ' "&&vname&n" '='value1';'; end; if _n_>1 then do; put 'if ' "&&vname&n" ' >'value0 ' and ' "&&vname&n" '<='value1 ' then ' "&&vname&n" '='value1';'; end; if last then do; put '%mend discretor;'; end; run; data tmp; set tmp; * The following file will be overwritten each time as we process a new variable; %include 'c:\test\discretor.txt'; *Now we run the discretor program and it changes our data to the discretized form; %discretor; run; %end; ****************************************************************************; **** This is inefficient (repetitive), but appends a list of discretor code.; ****************************************************************************; data _null_; set xxx2 end=last; %if &nobs>1 %then %do; file 'c:\test\discretor_code.txt' mod; if _n_=1 then do; put '%macro discretor;'; put 'if ' "&&vname&n" '^=. and ' "&&vname&n" '<=' value1 ' then ' "&&vname&n" '='value1';'; end; if _n_>1 then do; put 'if ' "&&vname&n" ' >'value0 ' and ' "&&vname&n" '<='value1 ' then ' "&&vname&n" '='value1';'; end; if last then do; put '%mend discretor;'; end; run; %end; *********************************************************************************; %end; %mend interative; %interative(&nvars); quit; %mend discretor_test; *Now the data has been discretized so we can analyze it to determine the closest; * proxy for a missing value. Here is the code in a table; %macro discretor_proxy(mybad); data discretor_code(keep=code); *Bringing in discretor results for viewing; infile 'c:\test\discretor_code.txt' lrecl=32767 truncover; input Code $char32767.; mystr=index(Code,'%'); if mystr^=0 then Code='*--------------------------------------------------------------------;'; run; title Discretor Code; proc print data=discretor_code;run; *In order to use proc freq, we change missing (.) values to -999999; data tmp; * note the retain statement before the set command puts the bad variable first; * so we can skip over it as we cycle through the variables; retain &mybad; set tmp; array myMISSING(*) _numeric_; do i = 1 to dim(mymissing); if mymissing(i)=. then mymissing(i)=-999999; end; run; *Now we cycle through variables except we skip the first variable and the last variable; %let dsid=%sysfunc(open(tmp)); %let cnt=(%sysfunc(attrn(&dsid,nvars))); %do i = 2 %to &cnt-1; %let myname=%sysfunc(varname(&dsid,&i)); *Printing out frequencies to show we discretized data; Title Frequencies of Discretized data; proc freq data=tmp;tables &myname ;run; proc sort data=tmp out=testing;by &mybad ; proc freq data=testing noprint; tables &mybad*&myname/out=myout outpct sparse; run; data myout_&i (keep=varname myvalue count percent); retain varname myvalue count percent; set myout; length varname $32; varname="&myname"; myvalue=&myname; * We are only interested in the bad rate associated with a value of '1' below; if &mybad^=1 then delete; *TRATE = the target rate for missing values we wish to find a proxy for; if myvalue=-999999 then call symput('trate',percent); run; *Here we find the closest absolute difference to the target rate; data myout_&i; set myout_&i; trate="&trate"; Diff_rate=abs(trate-percent); run; proc sort;by diff_rate;run; *Here we create a separate table containing just the selected proxies; data proxy_&i; set myout_&i; if myvalue=-999999 then delete; run; data proxy_&i; set proxy_&i; if _n_>1 then delete; run; *Now we append the individual variable specific tables for each variable; %if &i=2 %then %do; data myappend; set myout_&i; run; data myproxy; set proxy_&i; run; %end; %if &i>2 %then %do; proc append base=myappend data=myout_&i; proc append base=myproxy data=proxy_&i; run; %end; %end; title Proxy for Missing using Discretor Method (where, &mybad = 1); proc print data=myproxy;run; proc print data=myappend;run; *Finally, we close down the sysfunc routine; %let rc=%sysfunc(close(&dsid)); %mend discretor_proxy; *====================================================================; libname perm 'c:\test\'; * where mydata=dataset; * where bad=your event variable; * where 5=the number of buckets desired for discretizing; %discretor_test(mydata,bad,5); * where bad=your event variable; %discretor_proxy(bad);