/*---------------------------------------------------------------------------
  Missing-value substitution.

  The %means_sub macro lets you use the mean, median, or mode of each numeric
  variable as a proxy for its missing values.  A small test dataset and an
  example call are included so the program can be run as-is.
---------------------------------------------------------------------------*/

/* First we read in a test data set for you. */
/* In-stream data must start on the line after CARDS, one record per line. */
data mydata;
  input Y VAR1 VAR2 VAR3;
  cards;
1 10 33 4
0 . 21 3
1 30 . 2
1 20 76 1
0 . 24 3
0 20 22 .
1 10 . 2
1 . 49 2
0 30 . 2
1 20 59 2
1 20 76 1
0 . 24 3
0 20 22 .
1 10 . 2
1 . 49 2
0 30 . 2
;

%macro means_sub(datain,dataout,mtype);
  /*=========================================================================
    Cleaning up missing values.

    Parameters
      datain  - dataset the proxies are ALWAYS derived from (the
                development dataset).
      dataout - dataset the proxies are applied to.  It is overwritten with
                the filled-in data.  It may be the same dataset as datain.
      mtype   - proxy type: mean, median, or mode (any letter case).

    Datasets created in the current default library
      CONTENTS  - variable names and types of datain
      MEANDAT   - one row holding the proxies as m1-m<n>
      MEANDAT2  - MEANDAT transposed, used for the summary listing
      MISS_SUB  - dataout with the proxies filled in

    Notes
      - Every numeric variable of datain is imputed.
      - dataout must not already contain variables named m1-m<n> or _ms_i;
        they are used as work variables and dropped.
      - Only block comments are used inside the macro: statement-style
        (asterisk) comments are tokenized by the macro processor, so quotes
        or macro triggers inside them can break the macro.
  =========================================================================*/
  %local nvars numvars stat;

  /* Normalize and validate the proxy type up front.  Without this, an
     unrecognized value produced no OUTPUT statement and the later steps
     silently read a missing or stale MEANDAT. */
  %let stat = %lowcase(&mtype);
  %if not (&stat = mean or &stat = median or &stat = mode) %then %do;
    %put ERROR: MEANS_SUB: mtype=&mtype is not valid. Use mean, median, or mode.;
    %return;
  %end;

  title Data with Missing Values = &datain;
  proc print data=&datain;
  run;

  title PRE - MISSING COUNTS: Dataset = &datain;
  proc means nmiss data=&datain;
  run;

  /* Collect the names of the numeric variables (TYPE = 1 in the PROC
     CONTENTS output) into a macro variable.  This replaces the earlier
     approach of writing a generated macro to a hard-coded file on C:\ and
     %INCLUDE-ing it, which needed a writable C:\ root and appended to the
     file on every call. */
  proc contents data=&datain out=contents(keep=name type) noprint;
  run;

  proc sql noprint;
    select name into :numvars separated by ' '
      from contents
      where type = 1;
  quit;
  /* SQLOBS holds the number of rows the SELECT returned = number of
     numeric variables. */
  %let nvars = &sqlobs;

  %if &nvars = 0 %then %do;
    %put ERROR: MEANS_SUB: no numeric variables found in &datain - nothing to impute.;
    title;
    %return;
  %end;

  /* Compute the requested statistic for every numeric variable.  The
     keyword (mean= / median= / mode=) is generated from the validated type;
     the proxies land in m1-m<n> in the same order as the VAR list. */
  proc univariate data=&datain noprint;
    var &numvars;
    output out=meandat &stat=m1-m&nvars;
  run;

  /* This should replace missings for either the development or the
     validation dataset.  The single MEANDAT row is read once and its values
     are retained across every row of dataout. */
  data miss_sub (drop=m1-m&nvars _ms_i);
    if _n_ = 1 then set meandat;
    set &dataout;
    array old(&nvars) &numvars;
    array means(&nvars) m1-m&nvars;
    do _ms_i = 1 to &nvars;
      /* missing() also catches special missing values (.A-.Z, ._), which
         PROC MEANS NMISS counts but a comparison with "." does not. */
      if missing(old(_ms_i)) then old(_ms_i) = means(_ms_i);
    end;
  run;

  /*=========================================================================
    Print out results: one row per proxy value.
  =========================================================================*/
  proc transpose data=meandat out=meandat2(drop=_NAME_);
  run;

  title 'Subsitution Proxy Summary';
  proc print data=meandat2;
  run;

  /*=========================================================================
    Writing back to output dataset.
  =========================================================================*/
  data &dataout;
    set miss_sub;
  run;

  title POST_MISSING COUNTS: Dataset =&dataout;
  proc print data=&dataout;
  run;

  proc means nmiss data=&dataout;
  run;

  /* Do not leave the macro's titles attached to later output. */
  title;
%mend means_sub;

/* In the call below the means are derived from the dataset called mydata
   and the substitution is applied to the same dataset, mydata.  For a
   scoring model you would also want to apply these same proxies to the
   validation dataset; in that case the validation dataset would be
   specified as "dataout" in the macro call.

   In the last parameter, tell the macro to use mean, median, or mode.
   (Block comments are used here because statement-style comments must not
   contain unmatched quotation marks.) */
%means_sub(mydata,mydata,mode);