Thursday, March 28, 2013

group obs almost evenly and calculate cumulative stats in SAS and R

The input is 99 records, the requirement is to split it into 10 groups as evenly as possible(10 records each for first 9 groups, and 9 records in the last group). And then get the cumulative sum/mean in each group.

 

 

The output is like:

 

 

In SAS, a loop is required to do this because of cumulative sum.

 

data test;

     i=1;

     p=.99;

     output;

     do i=2 to 98;

           p=.8;

           output;

     end;

     i=99;

     p=.2;

     output;

run;

 

proc print data=test;

run;

 

* if use proc rank, it cannot group data evenly because of ties;

proc rank data=test out=t group=10;

     var p;

     ranks rank;

run;

 

proc print data=t;

run;

 

* so need to mannuly do it;

%let dsid=%sysfunc(open(test));          *open the file;

%let nobs=%sysfunc(attrn(&dsid,nobs));   *count the obs in file; %let ngroup=10; %let overall_pct=.5; %put &nobs;

 

* data n_per_group only has one obs;

data n_per_group;

     n_per_grp=int(&nobs/&ngroup.);  * get quotient;

     remainder=mod(&nobs,&ngroup.);  * get remainder;

     array ps {&ngroup} ps1-ps&ngroup;

     keep ps1-ps&ngroup;

     do i=1 to &ngroup;

           if remainder>0 then do;

                     ps{i}=n_per_grp+1;

                     remainder=remainder-1;

           end;

           else ps{i}=n_per_grp;

     end;

     output;

run;

 

proc print data=n_per_group;

run;

 

* read in  the only one obs, and keep it in PVM until the end by using if _n_=1 then do statement;

data out(drop=freq _count_ i p);

     if _n_=1 then do;

           set n_per_group;

           index=1;

     end;

    

     retain freq  _count_ 0 index ;

    

     array ps(&ngroup) ps1-ps&ngroup;

    

     set test end=last;

    

     * a liitle tricky: keep on adding p together unitl the # of added obs = n_per_group as expected;

     * if the # of added obs = n_per_group, calculate the stats we want, otherwise, keep on adding;

     if _count_=ps(index) then do;

           num_obs=ps(index);

           avg_pred_p=sum_p/num_obs;

           lift=avg_pred_p/&overall_pct;

           output;

           index+1;

           _count_=0;

           sum_p=0;

     end;

    

     sum_p+p;

     _count_+1;

    

     if last then do;

           num_obs=ps(index);

           avg_pred_p=sum_p/num_obs;

           lift=avg_pred_p/&overall_pct;

           output;

     end;

run;

 

proc print data=out;

run;

 

 

## It is very easy to do this in R

##  a simple way

rm(list=ls())

x=c(.9,rep(.8,97),.2)

ngrp=10

nobs=rep(length(x)%/%ngrp, ngrp)+c(rep(1,length(x)%%ngrp), rep(0,ngrp-length(x)%%ngrp))

levl=rep(1:ngrp, nobs)

df=data.frame(cbind(x,levl))

aggregate(x~levl, df, mean)


Tuesday, March 12, 2013

examples of sas proc sql connect to db with execute

*** sas proc sql connect to db with execute;

 

* in sas to run proc sql contains a query passedthrough sas to sas sql processor;

*SAS;

proc sql;

select employee_title as title, avg(employee_years),

   freq(employee_id)

      from sql.employee

      group by title

      order by title;

* Query passed through;

select * from connection to remote

   (select employee_title as title,

    avg(employee_years),

    freq(employee_id)

       from sql.employee

       group by title

       order by title);

 

* drop table, create table and insert an obs into table;

proc sql;

    execute(drop table ' My Invoice ') by db;

    execute(create table ' My Invoice '(

       ' Invoice Number ' LONG not null,

       ' Billed To '  VARCHAR(20),

       ' Amount '  CURRENCY,

       ' BILLED ON '  DATETIME)) by db;

    execute(insert into  ' My Invoice '

       values( 12345, 'John Doe', 123.45, #11/22/2003#)) by db;

quit;

 

* insert into table base from another table named one;

proc sql;

insert into base ( source, a, b, d   )

select            source, a, b, ' '

  from one;

quit;

 

* join a db table facebook_ads with another table sasprod.fb_ad_fromsas by ad_id;

proc sql;

   connect to oracle (user=sas password=sas path=db1);

   execute (update db1.facebook_ads m set

            (m.rank, m.ad_work_item_id)=

            (select a.rank, a.ad_work_item_id

             from sasprod.fb_ad_fromsas a

             where m.ad_wid=a.ad_id)

            where exists (select 1 from sasprod.fb_ad_fromsas a where m.ad_id=a.ad_id)

           )

   by oracle;

   disconnect from oracle;

quit;

 

* update mysql db;

* update the conversion_rate and updated_at in table projected_conversion_rates on mysql db named reporting_production by the values in sasdb.projected_merchant_conv_sas on equalling of payer_id, event_id and cutoff values;

proc sql;

        connect to mysql (user=xxx password=xxx database=reporting_production server="server_m1" port=8888);

        execute

        (update projected_conversion_rates pcr inner join sasdb.projected_merchant_conv_sas a on (pcr.payer_id=a.payer_id and pcr.event_id=a.event_id and pcr.cutoff_min=a.cutoff_min and pcr.cutoff_max=a.cutoff_max)

        set pcr.conversion_rate=a.conversion_rate, pcr.updated_at=a.updated_at)

        by mysql;

        disconnect from mysql;

quit;

 

 

 

libname mylib 'c:\sales';

 

* another example to join db table with client table;

proc sql;

   connect to remote

      (server=tso.shr1 dbms=db2

       dbmsarg=(ssid=db2p));

   select * from mylib.sales08,

      connection to remote

         (select qtr, division,

                 sales, pct

            from revenue.all08

            where region='Southeast')

      where sales08.div=division;

 

 

*** docs: http://support.sas.com/documentation/cdl/en/connref/61908/HTML/default/viewer.htm#srspt.htm;

 


Tuesday, March 5, 2013

Compare the variables distribution of two data sets

*** The purpose here is to compare the variables distribution of two data sets    ;

*** For example, if we use January data to build model and to score Feburary data ;

*** then we need to make sure the distribution of predictors are the same         ;

 

%let changed=0;

 

%macro chisq(vars, change_level_vars, timeframe);

 

data _null_;

     array a_var(*) &vars;

     call symput('nvar', ktrim(kleft(put(dim(a_var),8.))));

run;

 

%do i=1 %to &nvar;

     %let var=%scan(&vars, &i);

    

     %if &var = &change_level_vars %then %do;

           proc freq data=forchisq;

                tables &var * &timeframe / out=comp_&var (drop=percent);

           run;

 

           data comp_&var;

                merge comp_&var(in=in1 where=(&timeframe=1) rename=(count=old)) comp_&var(in=in2 where=(&timeframe=2) rename=(count=new));

                by &var;

                drop timeframe;

                if in1 and in2 then delete;

           run;

 

           proc sql;

                select count(*) into: changed from comp_&var;

           quit;

 

           %if &changed > 0 %then %do;

                proc printto file=toemail;

                run;

                title "New or deleted levels for &var";

                proc print data=comp_&var width=min;

                run;

                proc printto;

                run;

           %end;

     %end;

 

     %else %do;

 

           proc sql;

                select count(distinct &var) into: m_new_&var from forchisq where &timeframe=2;

                select count(distinct &var) into: m_old_&var from forchisq where &timeframe=1;

           quit;

 

           %if &&m_new_&var ne &&m_old_&var %then %do;

                %let changed=1;

                proc printto file=toemail;

                run;

                title "&var has different values of levels:";

                proc freq data=forchisq;

                     tables &var * &timeframe / missing nocum nopercent;

                run;

                proc printto;

                run;

           %end;

 

           %else %do;

                proc freq data=forchisq(where=(&timeframe=1));

                     tables &var / missing out=old_dist;

                run;

 

                proc sql;

                     select percent into: percents separated by " " from old_dist;

                quit;

 

                     proc freq data=forchisq(where=(&timeframe=2));

                     *weight &wt;

                     table &var / missing chisq testp=(&percents);

                     output out=chisq_&var(rename=(_PCHI_=Chisq P_PCHI=p_value)) chisq pchi;

                run;

 

                proc print data=chisq_&var width=min;

                     title "Chisq values for &var";

                run;

 

                proc sql;

                     select round(p_value, 0.001) into: pvalue from chisq_&var;

                quit;

 

                %if &pvalue<0.01 %then %do;

                     %let changed=1;

                     proc printto file=toemail;

                     run;

 

                     proc freq data=forchisq;

                           title "&var distribution has changed.";

                           table &var * &timeframe / missing norow nocum nopct;

                     run;

 

                     proc printto;

                     run;

                %end;

           %end;

     %end;

%end;

 

%if &changed>0 %then %do;

     x cat toemail.lst | mail -s "Data distribution change report" &notify; %end;

 

%mend chisq;

 

 

***  test the macro by the Titanic Train data set from kaggle;

***  The first 448 obs were set as timefram=1, the rest is 2;

 

proc import datafile="/home/test/train.csv" out=forchisq(rename=(time=timeframe)) replace;

     getnames=yes;

run;

               

proc print data=forchisq width=min;

run;

 

%let vars=survived pclass sex sibsp parch embarked;

 

%let change_level_vars=cabin;

 

%let notify=hsong@nextag.com;

 

%chisq(&vars, &change_level_vars, timeframe);

 

The result is:

1)For survived, we cannot reject that the distribution of the two data sets are the same since p-value is .3299.

2) For pclass, the result is:

    

3) For sex, the result shows p-value<.01 so we think the distribution of this variable from two data sets is different

    

4) For sibsp, the result is:

    

5) For parch, we will check if there is new levels or delted levels, there is new new level = 6 from the result:

    

6) Fot embarked, the result is: