## Thursday, March 28, 2013

### group obs almost evenly and calculate cumulative stats in SAS and R

The input is 99 records, the requirement is to split it into 10 groups as evenly as possible(10 records each for first 9 groups, and 9 records in the last group). And then get the cumulative sum/mean in each group.

The output is like:

In SAS, a loop is required to do this because of cumulative sum.

data test;

i=1;

p=.99;

output;

do i=2 to 98;

p=.8;

output;

end;

i=99;

p=.2;

output;

run;

proc print data=test;

run;

* if use proc rank, it cannot group data evenly because of ties;

proc rank data=test out=t group=10;

var p;

ranks rank;

run;

proc print data=t;

run;

* so need to mannuly do it;

%let dsid=%sysfunc(open(test));          *open the file;

%let nobs=%sysfunc(attrn(&dsid,nobs));   *count the obs in file; %let ngroup=10; %let overall_pct=.5; %put &nobs;

* data n_per_group only has one obs;

data n_per_group;

n_per_grp=int(&nobs/&ngroup.);  * get quotient;

remainder=mod(&nobs,&ngroup.);  * get remainder;

array ps {&ngroup} ps1-ps&ngroup;

keep ps1-ps&ngroup;

do i=1 to &ngroup;

if remainder>0 then do;

ps{i}=n_per_grp+1;

remainder=remainder-1;

end;

else ps{i}=n_per_grp;

end;

output;

run;

proc print data=n_per_group;

run;

* read in  the only one obs, and keep it in PVM until the end by using if _n_=1 then do statement;

data out(drop=freq _count_ i p);

if _n_=1 then do;

set n_per_group;

index=1;

end;

retain freq  _count_ 0 index ;

array ps(&ngroup) ps1-ps&ngroup;

set test end=last;

* a liitle tricky: keep on adding p together unitl the # of added obs = n_per_group as expected;

* if the # of added obs = n_per_group, calculate the stats we want, otherwise, keep on adding;

if _count_=ps(index) then do;

num_obs=ps(index);

avg_pred_p=sum_p/num_obs;

lift=avg_pred_p/&overall_pct;

output;

index+1;

_count_=0;

sum_p=0;

end;

sum_p+p;

_count_+1;

if last then do;

num_obs=ps(index);

avg_pred_p=sum_p/num_obs;

lift=avg_pred_p/&overall_pct;

output;

end;

run;

proc print data=out;

run;

## It is very easy to do this in R

##  a simple way

rm(list=ls())

x=c(.9,rep(.8,97),.2)

ngrp=10

nobs=rep(length(x)%/%ngrp, ngrp)+c(rep(1,length(x)%%ngrp), rep(0,ngrp-length(x)%%ngrp))

levl=rep(1:ngrp, nobs)

df=data.frame(cbind(x,levl))

aggregate(x~levl, df, mean)

## Tuesday, March 12, 2013

### examples of sas proc sql connect to db with execute

*** sas proc sql connect to db with execute;

* in sas to run proc sql contains a query passedthrough sas to sas sql processor;

*SAS;

proc sql;

select employee_title as title, avg(employee_years),

freq(employee_id)

from sql.employee

group by title

order by title;

* Query passed through;

select * from connection to remote

(select employee_title as title,

avg(employee_years),

freq(employee_id)

from sql.employee

group by title

order by title);

* drop table, create table and insert an obs into table;

proc sql;

execute(drop table ' My Invoice ') by db;

execute(create table ' My Invoice '(

' Invoice Number ' LONG not null,

' Billed To '  VARCHAR(20),

' Amount '  CURRENCY,

' BILLED ON '  DATETIME)) by db;

execute(insert into  ' My Invoice '

values( 12345, 'John Doe', 123.45, #11/22/2003#)) by db;

quit;

* insert into table base from another table named one;

proc sql;

insert into base ( source, a, b, d   )

select            source, a, b, ' '

from one;

quit;

proc sql;

connect to oracle (user=sas password=sas path=db1);

)

by oracle;

disconnect from oracle;

quit;

* update mysql db;

* update the conversion_rate and updated_at in table projected_conversion_rates on mysql db named reporting_production by the values in sasdb.projected_merchant_conv_sas on equalling of payer_id, event_id and cutoff values;

proc sql;

connect to mysql (user=xxx password=xxx database=reporting_production server="server_m1" port=8888);

execute

(update projected_conversion_rates pcr inner join sasdb.projected_merchant_conv_sas a on (pcr.payer_id=a.payer_id and pcr.event_id=a.event_id and pcr.cutoff_min=a.cutoff_min and pcr.cutoff_max=a.cutoff_max)

set pcr.conversion_rate=a.conversion_rate, pcr.updated_at=a.updated_at)

by mysql;

disconnect from mysql;

quit;

libname mylib 'c:\sales';

* another example to join db table with client table;

proc sql;

connect to remote

(server=tso.shr1 dbms=db2

dbmsarg=(ssid=db2p));

select * from mylib.sales08,

connection to remote

(select qtr, division,

sales, pct

from revenue.all08

where region='Southeast')

where sales08.div=division;

## Tuesday, March 5, 2013

### Compare the variables distribution of two data sets

*** The purpose here is to compare the variables distribution of two data sets    ;

*** For example, if we use January data to build model and to score Feburary data ;

*** then we need to make sure the distribution of predictors are the same         ;

%let changed=0;

%macro chisq(vars, change_level_vars, timeframe);

data _null_;

array a_var(*) &vars;

call symput('nvar', ktrim(kleft(put(dim(a_var),8.))));

run;

%do i=1 %to &nvar;

%let var=%scan(&vars, &i);

%if &var = &change_level_vars %then %do;

proc freq data=forchisq;

tables &var * &timeframe / out=comp_&var (drop=percent);

run;

data comp_&var;

merge comp_&var(in=in1 where=(&timeframe=1) rename=(count=old)) comp_&var(in=in2 where=(&timeframe=2) rename=(count=new));

by &var;

drop timeframe;

if in1 and in2 then delete;

run;

proc sql;

select count(*) into: changed from comp_&var;

quit;

%if &changed > 0 %then %do;

proc printto file=toemail;

run;

title "New or deleted levels for &var";

proc print data=comp_&var width=min;

run;

proc printto;

run;

%end;

%end;

%else %do;

proc sql;

select count(distinct &var) into: m_new_&var from forchisq where &timeframe=2;

select count(distinct &var) into: m_old_&var from forchisq where &timeframe=1;

quit;

%if &&m_new_&var ne &&m_old_&var %then %do;

%let changed=1;

proc printto file=toemail;

run;

title "&var has different values of levels:";

proc freq data=forchisq;

tables &var * &timeframe / missing nocum nopercent;

run;

proc printto;

run;

%end;

%else %do;

proc freq data=forchisq(where=(&timeframe=1));

tables &var / missing out=old_dist;

run;

proc sql;

select percent into: percents separated by " " from old_dist;

quit;

proc freq data=forchisq(where=(&timeframe=2));

*weight &wt;

table &var / missing chisq testp=(&percents);

output out=chisq_&var(rename=(_PCHI_=Chisq P_PCHI=p_value)) chisq pchi;

run;

proc print data=chisq_&var width=min;

title "Chisq values for &var";

run;

proc sql;

select round(p_value, 0.001) into: pvalue from chisq_&var;

quit;

%if &pvalue<0.01 %then %do;

%let changed=1;

proc printto file=toemail;

run;

proc freq data=forchisq;

title "&var distribution has changed.";

table &var * &timeframe / missing norow nocum nopct;

run;

proc printto;

run;

%end;

%end;

%end;

%end;

%if &changed>0 %then %do;

x cat toemail.lst | mail -s "Data distribution change report" &notify; %end;

%mend chisq;

***  test the macro by the Titanic Train data set from kaggle;

***  The first 448 obs were set as timefram=1, the rest is 2;

proc import datafile="/home/test/train.csv" out=forchisq(rename=(time=timeframe)) replace;

getnames=yes;

run;

proc print data=forchisq width=min;

run;

%let vars=survived pclass sex sibsp parch embarked;

%let change_level_vars=cabin;

%let notify=hsong@nextag.com;

%chisq(&vars, &change_level_vars, timeframe);

The result is:

1)For survived, we cannot reject that the distribution of the two data sets are the same since p-value is .3299.

2) For pclass, the result is:

3) For sex, the result shows p-value<.01 so we think the distribution of this variable from two data sets is different

4) For sibsp, the result is:

5) For parch, we will check if there is new levels or delted levels, there is new new level = 6 from the result:

6) Fot embarked, the result is: