您的位置:首页 > 其它

Bagging算法在SAS中的实现

2015-10-13 21:14 399 查看
原文地址:Bagging算法在SAS中的实现 作者:文穗
%macro bagging(data = , y = , numx = , catx = , ntrees = 10);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PARAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING ks STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 20110613;

*** assign a library to the working folder ***;
libname _path '';

*** generate a series of random seeds ***;
/* one macro variable random1 ... randomN is created per tree, each holding
   an 8-digit seed drawn from the master seed above */
data _null_;
  do i = 1 to &ntrees;
    random = put(ranuni(&seed) * (10 ** 8), 8.);
    name   = compress("random"||put(i, 3.), ' ');
    call symput(name, random);
  end;
run;

*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
  delete TreeFiles tmp / memtype = catalog;
run;
quit;

/* bootstrap sample size = number of rows with a valid 0/1 response */
proc sql noprint;
  select count(*) into :nobs from &data where &y in (1, 0);
quit;

/* keep only the modeling columns and add a row key used later to
   aggregate the per-tree scores */
data _tmp1 (keep = &y &numx &catx _id_);
  set &data;
  _id_ + 1;
run;

%do i = 1 %to &ntrees;
  %put &&random&i;

  *** generate bootstrap samples for bagging ***;
  /* method = urs samples with replacement and NumberHits counts how often
     each row was drawn, which is used below as a frequency variable */
  proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;

  *** generate data mining datasets for sas e-miner ***;
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    var &numx;
    target &y;
    freq _hits;
  run;

  *** create a sas temporary catalog to contain sas output ***;
  filename out_tree catalog "_path.tmp.out_tree.source";

  *** create decision tree mimicking CART ***;
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion    = gini
    assess       = impurity
    maxbranch    = 2
    splitsize    = 100
    subtree      = assessment
    exhaustive   = 0
    nsurrs       = 0;
    code file    = out_tree;
    input &numx   / level = interval;
    input &catx   / level = nominal;
    target &y     / level = binary;
    freq _hits;
  run;

  *** create a permanent sas catalog to contain all tree outputs ***;
  filename in_tree catalog "_path.TreeFiles.tree&i..source";

  /* copy the generated scoring code, dropping its first three lines */
  /* NOTE(review): presumably those lines are a generated header - confirm */
  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;

  *** score the original data by each tree output file ***;
  data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
    set _tmp1;
    %include in_tree;
  run;

  *** calculate KS stat ***;
  /* printed output is redirected while npar1way runs and the KS row is
     captured through ods output instead */
  proc printto new print = lst_out;
  run;

  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _score&i;
    class &y.;
    var p_&y.1;
  run;

  proc printto;
  run;

  %if &i = 1 %then %do;
    /* first tree: start the stacked score table and the KS table */
    data _tmp2;
      set _score&i;
    run;

    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed    = &&random&i;
      ks      = round(nvalue2 * 100, 0.0001);
    run;
  %end;
  %else %do;
    /* later trees: append scores and the new KS row to the running tables */
    data _tmp2;
      set _tmp2 _score&i;
    run;

    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed    = &&random&i;
        ks      = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;
%end;

*** aggregate predictions from all trees in the bag ***;
/* the bagged prediction for each row is the mean of its per-tree scores */
proc summary data = _tmp2 nway;
  class _id_;
  output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
    mean(p_&y.1) =  mean(p_&y.0) =  mean(&y) = ;
run;

*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
  class &y;
  var p_&y.1;
run;

proc printto;
run;

/* the bagged classifier is recorded as tree_id 0 with the master seed */
data _ks;
  set _ks _kstmp (in = a keep = nvalue2);
  if a then do;
    tree_id = 0;
    seed    = &seed;
    ks      = round(nvalue2 * 100, 0.0001);
  end;
run;

proc sort data = _ks;
  by tree_id;
run;

proc sql noprint;
  select max(ks) into :max_ks from _ks where tree_id > 0;
  select min(ks) into :min_ks from _ks where tree_id > 0;
  select ks into :bag_ks from _ks where tree_id = 0;
quit;

*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
  var tree_id seed ks;
run;
title;

/* remove the temporary catalog while the TreeFiles catalog is kept */
proc datasets library = _path nolist;
  delete tmp / memtype = catalog;
run;
quit;
%mend bagging;

/* example call: numeric predictors in x1 and categorical predictors in x2 */
%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
          tot_rev_line rev_util bureau_score ltv tot_income;
%let x2 = purpose;

libname data 'D:\SAS_CODE\bagging';

%bagging(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 10);
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: