/* In the contingency table if there are many levels for a variable, then at some levels */
/* of this variable there may be no events(or all events) happening. This is called quasi */
/* completion problem(all separation). If this happens, the MLE will not exist since the */
/* regression coefficient will be infinit if want to maximize the likelihood */
/* A good solution is to collapse(cluster) some levels for the given categorical variable */
/*Variable Cluster_code has many levels which cause 0 events at some levels of cluster_code*/
/*This is called Quasi-completion problem*/
proc means data=pva1 nway missing;
class cluster_code;
var target_b;
output out=level mean=prop;
run;
proc print data=level width=min;
run;
ods output clusterhistory=cluster;
/* method=ward will use Greenacre's method to collapse levels of contingency tables */
/* the levels are clusterd based on the redunction in the chisq test of association */
/* levels with similar marginal response rate will be merged */
/* levels with least chisq value decreased will be merged, this is from the */
/* Semipartial R-Square value of the proc cluster output */
proc cluster data=level method=ward outtree=fortree plots=(dendrogram(vertical height=rsq));
freq _freq_;
var prop;
id cluster_code;
run;
proc freq data=pva1;
tables cluster_code*target_b / chisq;
output out=chi(keep=_pchi_) chisq;
run;
/* _pchi_ = 112.009 */
/* # of levels final is the one that giving least log(p_value) */
data cutoff;
if _n_ = 1 then set chi;
set cluster;
chisquare=_pchi_*rsquared;
degfree=numberofclusters-1;
logpvalue=logsdf('CHISQ',chisquare,degfree);
run;
proc sgplot data=cutoff;
scatter y=logpvalue x=numberofclusters / markerattrs=(color=blue symbol=circlefilled);
xaxis label="Number of Clusters";
yaxis label="Log of p-value";
title "Plot of the Log of p-value by Number of Clusters";
run;
/* number of clusters = 5 having the minimum logpvalue */
proc sql;
select numberofclusters into :ncl
from cutoff
having logpvalue=min(logpvalue);
quit;
ods html close;
proc tree data=fortree nclusters=&ncl out=clus;
id cluster_code;
run;
ods html;
proc sort data=clus;
by clusname;
run;
data clus_fmt;
retain fmtname "clus_fmt";
set clus;
start=cluster_code;
end=cluster_code;
label=clusname;
run;
/* at last use proc format to format the variable levels into clusters */
/* The formated fmt_cluster_code can be used as a categorical variable */
proc format cntlin=clus_fmt;
select clus_fmt;
run;
data pva1;
set pva1;
fmt_cluster_code=put(cluster_code,clus_fmt.);
run;
proc freq data=pva1;
table fmt_cluster_code / missing list;
run;
No comments:
Post a Comment