/*These are study notes from SAS online training. They cover why to use logistic regression, how to clean the data */
/*(impute missing values, cluster levels with rare events, variable clustering, variable screening), how to build */
/*a logistic regression model, and how to measure the performance of the model */
/* Assign the MYDATA libref to the folder that holds the course data sets. */
libname mydata "D:\SkyDrive\sas_temp";

/* Print the CONTENTS (descriptor information) of every member of MYDATA.  */
/* PROC DATASETS is an interactive procedure: RUN only executes the        */
/* statements submitted so far, so QUIT is used to end the procedure       */
/* explicitly instead of leaving it running until the next step boundary.  */
proc datasets lib=mydata;
   contents data=_all_;
run;
quit;
*** Imputing missing values with PROC STDIZE ***;
/* Data preparation: copy the raw PVA table into WORK.PVA, leaving out control_number. */
data pva;
   set mydata.pva_raw_data;
   drop control_number;
run;
/* Profile the three inputs that will be imputed: NMISS reports how many   */
/* observations are missing for each variable, alongside the minimum,      */
/* maximum and median of the non-missing values.                           */
proc means data=pva nmiss min max median;
   var donor_age
       income_group
       wealth_rating;
run;
/* Build a missing-value indicator for each input that will be imputed:    */
/* mi_<var> is 1 when <var> is missing and 0 otherwise. The two arrays are */
/* positionally paired, so a_mi(k) is the flag for a_var(k).               */
data pva;
   set pva;
   array a_mi{*}  mi_donor_age mi_income_group mi_wealth_rating;
   array a_var{*} donor_age    income_group    wealth_rating;
   /* MISSING() is true for the ordinary missing value (.) and also for    */
   /* the special missing values (._ and .A-.Z), which "= ." would not     */
   /* catch.                                                               */
   do i = 1 to dim(a_mi);
      a_mi(i) = missing(a_var(i));
   end;
   /* The loop counter is only a work variable; keep it out of the output. */
   drop i;
run;
/* Split the data into thirds on recent_response_prop, low to high: the lowest third gets grp_resp = 0, */
/* the middle third 1 and the highest third 2. recent_avg_gift_amt is binned the same way into grp_amt, */
/* so grp_resp and grp_amt together define 9 groups. Counts are similar across the levels of grp_resp   */
/* alone or grp_amt alone, but they differ across the grp_resp*grp_amt cells.                           */
proc rank data=pva groups=3 out=pva;
   var   recent_response_prop recent_avg_gift_amt;
   ranks grp_resp             grp_amt;
run;
/* Count the observations in each rank group: first the two one-way tables, */
/* then the grp_resp by grp_amt cross-classification. MISSING treats        */
/* missing values as a level and LIST prints the crosstab in list format.   */
proc freq data=pva;
   tables grp_resp
          grp_amt
          grp_resp*grp_amt / missing list;
run;
/* Order the data by grp_resp and grp_amt; the PROC STDIZE step that */
/* follows uses these two variables in a BY statement.               */
proc sort data=pva out=pva;
   by grp_resp grp_amt;
run;
/* Impute within each grp_resp*grp_amt group: METHOD=MEDIAN with REPONLY    */
/* replaces only the missing values, using the median of the group's        */
/* non-missing values, and leaves the observed values unstandardized.       */
/* The output data set PVA1 therefore holds the imputed inputs together     */
/* with the mi_* missing-value indicators created earlier.                  */
proc stdize data=pva out=pva1 method=median reponly;
   by  grp_resp grp_amt;
   var donor_age
       income_group
       wealth_rating;
run;
/* Check the result of the imputation in each group. The step reads PVA1,   */
/* the PROC STDIZE output, rather than the pre-imputation PVA table:        */
/* NMISS should now be 0 for every variable, and MEDIAN shows the group     */
/* median that was used as the replacement value. The MISSING option keeps  */
/* observations whose grouping variable is missing as their own class       */
/* level, so the cells line up with the BY groups used for imputation.      */
proc means data=pva1 nmiss median;
   class grp_resp grp_amt / missing;
   var donor_age income_group wealth_rating;
run;
/*There are other ways to impute, such as cluster imputation using proc fastclus, */
/*or EM, MCMC and regression imputation in proc mi (see UCLA ATS). */
/*proc mi can also use logistic regression to impute categorical variables. */
/* http://www.ats.ucla.edu/stat/sas/seminars/missing_data/part1.htm */
/* http://www.ats.ucla.edu/stat/sas/seminars/missing_data/part2.htm */
/* No comments: */
/* Post a Comment (text left over from the blog page; not part of the SAS program) */