Tuesday, March 5, 2013

Compare the variables distribution of two data sets

*** The purpose here is to compare the variables distribution of two data sets    ;

*** For example, if we use January data to build model and to score Feburary data ;

*** then we need to make sure the distribution of predictors are the same         ;

 

%let changed=0;

 

%macro chisq(vars, change_level_vars, timeframe);

 

data _null_;

     array a_var(*) &vars;

     call symput('nvar', ktrim(kleft(put(dim(a_var),8.))));

run;

 

%do i=1 %to &nvar;

     %let var=%scan(&vars, &i);

    

     %if &var = &change_level_vars %then %do;

           proc freq data=forchisq;

                tables &var * &timeframe / out=comp_&var (drop=percent);

           run;

 

           data comp_&var;

                merge comp_&var(in=in1 where=(&timeframe=1) rename=(count=old)) comp_&var(in=in2 where=(&timeframe=2) rename=(count=new));

                by &var;

                drop timeframe;

                if in1 and in2 then delete;

           run;

 

           proc sql;

                select count(*) into: changed from comp_&var;

           quit;

 

           %if &changed > 0 %then %do;

                proc printto file=toemail;

                run;

                title "New or deleted levels for &var";

                proc print data=comp_&var width=min;

                run;

                proc printto;

                run;

           %end;

     %end;

 

     %else %do;

 

           proc sql;

                select count(distinct &var) into: m_new_&var from forchisq where &timeframe=2;

                select count(distinct &var) into: m_old_&var from forchisq where &timeframe=1;

           quit;

 

           %if &&m_new_&var ne &&m_old_&var %then %do;

                %let changed=1;

                proc printto file=toemail;

                run;

                title "&var has different values of levels:";

                proc freq data=forchisq;

                     tables &var * &timeframe / missing nocum nopercent;

                run;

                proc printto;

                run;

           %end;

 

           %else %do;

                proc freq data=forchisq(where=(&timeframe=1));

                     tables &var / missing out=old_dist;

                run;

 

                proc sql;

                     select percent into: percents separated by " " from old_dist;

                quit;

 

                     proc freq data=forchisq(where=(&timeframe=2));

                     *weight &wt;

                     table &var / missing chisq testp=(&percents);

                     output out=chisq_&var(rename=(_PCHI_=Chisq P_PCHI=p_value)) chisq pchi;

                run;

 

                proc print data=chisq_&var width=min;

                     title "Chisq values for &var";

                run;

 

                proc sql;

                     select round(p_value, 0.001) into: pvalue from chisq_&var;

                quit;

 

                %if &pvalue<0.01 %then %do;

                     %let changed=1;

                     proc printto file=toemail;

                     run;

 

                     proc freq data=forchisq;

                           title "&var distribution has changed.";

                           table &var * &timeframe / missing norow nocum nopct;

                     run;

 

                     proc printto;

                     run;

                %end;

           %end;

     %end;

%end;

 

%if &changed>0 %then %do;

     x cat toemail.lst | mail -s "Data distribution change report" &notify; %end;

 

%mend chisq;

 

 

***  test the macro by the Titanic Train data set from kaggle;

***  The first 448 obs were set as timefram=1, the rest is 2;

 

proc import datafile="/home/test/train.csv" out=forchisq(rename=(time=timeframe)) replace;

     getnames=yes;

run;

               

proc print data=forchisq width=min;

run;

 

%let vars=survived pclass sex sibsp parch embarked;

 

%let change_level_vars=cabin;

 

%let notify=hsong@nextag.com;

 

%chisq(&vars, &change_level_vars, timeframe);

 

The result is:

1)For survived, we cannot reject that the distribution of the two data sets are the same since p-value is .3299.

2) For pclass, the result is:

    

3) For sex, the result shows p-value<.01 so we think the distribution of this variable from two data sets is different

    

4) For sibsp, the result is:

    

5) For parch, we will check if there is new levels or delted levels, there is new new level = 6 from the result:

    

6) Fot embarked, the result is:

    

 

No comments:

Post a Comment