Sunday, September 2, 2012

Study Notes2 Predictive Modeling with Logistic Regression: collapse levels of contingency table

/* In the contingency table if there are many levels for a variable, then at some levels */

/* of this variable there may be no events(or all events) happening. This is called quasi */

/* completion problem(all separation). If this happens, the MLE will not exist since the */

/* regression coefficient will be infinit if want to maximize the likelihood */

/* A good solution is to collapse(cluster) some levels for the given categorical variable */

/*Variable Cluster_code has many levels which cause 0 events at some levels of cluster_code*/

/*This is called Quasi-completion problem*/

proc means data=pva1 nway missing;

class cluster_code;

var target_b;

output out=level mean=prop;

run;

proc print data=level width=min;

run;

ods output clusterhistory=cluster;

/* method=ward will use Greenacre's method to collapse levels of contingency tables */

/* the levels are clusterd based on the redunction in the chisq test of association */

/* levels with similar marginal response rate will be merged */

/* levels with least chisq value decreased will be merged, this is from the */

/* Semipartial R-Square value of the proc cluster output */

proc cluster data=level method=ward outtree=fortree plots=(dendrogram(vertical height=rsq));

freq _freq_;

var prop;

id cluster_code;

run;



proc freq data=pva1;

tables cluster_code*target_b / chisq;

output out=chi(keep=_pchi_) chisq;

run;

/* _pchi_ = 112.009 */

/* # of levels final is the one that giving least log(p_value) */

data cutoff;

if _n_ = 1 then set chi;

set cluster;

chisquare=_pchi_*rsquared;

degfree=numberofclusters-1;

logpvalue=logsdf('CHISQ',chisquare,degfree);

run;

proc sgplot data=cutoff;

scatter y=logpvalue x=numberofclusters / markerattrs=(color=blue symbol=circlefilled);

xaxis label="Number of Clusters";

yaxis label="Log of p-value";

title "Plot of the Log of p-value by Number of Clusters";

run;


/* number of clusters = 5 having the minimum logpvalue */

proc sql;

select numberofclusters into :ncl

from cutoff

having logpvalue=min(logpvalue);

quit;

ods html close;

proc tree data=fortree nclusters=&ncl out=clus;

id cluster_code;

run;

ods html;

proc sort data=clus;

by clusname;

run;

data clus_fmt;

retain fmtname "clus_fmt";

set clus;

start=cluster_code;

end=cluster_code;

label=clusname;

run;

/* at last use proc format to format the variable levels into clusters */

/* The formated fmt_cluster_code can be used as a categorical variable */

proc format cntlin=clus_fmt;

select clus_fmt;

run;


data pva1;

set pva1;

fmt_cluster_code=put(cluster_code,clus_fmt.);

run;

proc freq data=pva1;

table fmt_cluster_code / missing list;

run;

Add Video

No comments:

Post a Comment