#===========================================================================#
# caTools - R library #
# Copyright (C) 2005 Jarek Tuszynski #
# Distributed under GNU General Public License version 3 #
#===========================================================================#
LogitBoost = function(xlearn, ylearn, nIter=ncol(xlearn))
# An implementation of the LogitBoost classification algorithm with
# decision stumps as weak learners.
#
# Input:
# xlearn - A dataset, whose n rows contain the training instances.
# ylearn - Class labels of dataset
# nIter - An integer, describing the number of iterations for
# which boosting should be run.
#
# Output:
# Stump - matrix of decision stumps used:
# column 1: contains the feature number of each stump
# column 2: contains the threshold
# column 3: contains bigger/smaller info: 1 means (col>thresh ? class_1 : class_2); -1 means (col>thresh ? class_2 : class_1)
# lablist - sorted vector of the class labels found in ylearn
#
# Written by Jarek Tuszynski - SAIC (jaroslaw.w.tuszynski@saic.com)
# This code was adapted from the logitboost.R function written by Marcel Dettling
# See "Boosting for Tumor Classification of Gene Expression Data", Dettling and Buhlmann (2002),
# available on the web page http://stat.ethz.ch/~dettling/boosting.html
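#
# Example usage (a minimal sketch written as comments; the iris data set and the
# parameter values below are illustrative assumptions, not part of this file):
#   data(iris)
#   Data  = iris[, -5]                        # numeric features
#   Label = iris[,  5]                        # class labels (3 classes)
#   model = LogitBoost(Data, Label, nIter=20) # fit 20 boosting iterations per 2-class sub-problem
#   # model$Stump holds the fitted stumps; model$lablist holds the class labels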
{
lablist = sort(unique(ylearn)) # different classes in label array
nClass = length(lablist) # number of different classes in label array
if (nClass>2) { # Multi class version uses 2-class code ...
Stump = NULL # ... recursively (one class vs. the rest)
for (jClass in 1:nClass) {
y = as.numeric(ylearn!=lablist[jClass]) # lablist[jClass]->0; rest->1
Stump = cbind(Stump, LogitBoost(xlearn, y, nIter)$Stump)
}
object = list(Stump=Stump, lablist=lablist) # create LogitBoost object
class(object) <- "LogitBoost"
return(object)
}
Mask = is.na(xlearn) # any NA in training data will ...
if (any(Mask)) xlearn[Mask] = Inf # ... be changed to +Inf
ylearn = as.numeric(ylearn!=lablist[1]) # change class labels to boolean
nLearn = nrow(xlearn) # Length of training data
nFeat = ncol(xlearn) # number of features to choose from
# Array Initialization
f = 0 # additive classifier score (sum of stump votes)
p = numeric(nLearn)+1/2 # range [0,1]
Stump = matrix(0, nIter,3) # will hold the results
colnames(Stump) = c("feature", "threshold", "sign")
Thresh = matrix(0, nLearn, nFeat) # sorted xlearn, one feature at a time
Index = matrix(as.integer(0), nLearn, nFeat) # order of samples in Thresh
Mask = matrix(as.logical(0), nLearn, nFeat) # mask of unique samples in Thresh
repts = as.logical(numeric(nFeat)) # for each feature: TRUE if it contains repeated values
# sort all columns and store them in Thresh; Index stores original positions
for (iFeat in 1:nFeat) {
x = sort(xlearn[,iFeat], index.return=TRUE)
Thresh[,iFeat] = x[[1]]
Index [,iFeat] = x[[2]]
Mask [,iFeat] = c((diff(Thresh[,iFeat])!=0), TRUE)
repts [ iFeat] = !all(Mask[,iFeat])
}
# Boosting Iterations
jFeat = 0
for (m in 1:nIter) {
# Computation of working response and weights
w = pmax(p*(1-p), 1e-24) # weight for each sample
z = (ylearn-p)/w
w = w/sum(w) # normalize to 1
# older version similar to rpart (more readable but slower) left as documentation
# MinLS = 1000 # initialize search for minimum Least-Square
# for (iFeat in 1:nFeat) { # for each feature do: for each splitting point calculate the least square regression of z on xlearn with weights w.
# Col = xlearn[,iFeat] # get one column of the training data
# thr = Thresh[,iFeat] # get the corresponding column of sorted thresholds
# for (j in 1:nLearn) { # for every possible threshold
# ff = 2*(Col<=thr[j])-1 # for every point in the column if (col<=Thresh) then f=1 else f=-1
# LS1[j] = sum(w*(z-ff)^2) # Least Square array for every possible threshold ( f = (col<=Thresh ? 1 : -1) )
# LS2[j] = sum(w*(z+ff)^2) # Least Square array for every possible threshold after swapping output classes ( f = (col<=Thresh ? -1 : 1) )
# }
# iLS1 = which.min(LS1)
# iLS2 = which.min(LS2)
# vLS1 = LS1[iLS1]
# vLS2 = LS2[iLS2]
# if (MinLS>vLS1) { stump=c(iFeat, thr[iLS1], 1); MinLS=vLS1; }
# if (MinLS>vLS2) { stump=c(iFeat, thr[iLS2], -1); MinLS=vLS2; }
# }
# Same as the code above but faster:
# for each feature do: for each splitting point calculate the least square regression of z on xlearn with weights w.
# This part of the code replaces the rpart-style stump search above with a direct computation.
# LS1 is the least square array LS1 = sum( w .* (z-f).^2 ) where f = (col<=sp ? 1 : -1); LS1 is calculated for all splitting points
# LS2 is the least square array LS2 = sum( w .* (z-f).^2 ) where f = (col<=sp ? -1 : 1); LS2 is calculated for all splitting points
# moving the splitting point past one sample col(idx(i)) flips that sample's f from -1 to +1, which changes the running least square:
# LS(i) = LS(i-1) - w(idx(i)) .* ( z(idx(i)) + 1 ).^2 + w(idx(i)) .* ( z(idx(i)) - 1 ).^2, which can be simplified to:
# LS(i) = LS(i-1) - 4 * w(idx(i)) .* z(idx(i))
# (a standalone check of this identity is sketched in the comments after this function)
ls1 = sum(w*(z+1)^2) # Least Square value for the left-most threshold ( f = -1 )
ls2 = sum(w*(z-1)^2) # Least Square value for the left-most threshold after swapping output classes ( f = 1 )
MinLS = max(ls1,ls2) # initialize search for minimum Least-Square
wz = 4 * w * z # precompute vector to be used later
for (iFeat in 1:nFeat) { # for each feature do: for each splitting point calculate the least square regression of z on xlearn with weights w.
if (iFeat==jFeat) next # Prevent the simplest cycle
Col = Thresh[,iFeat] # get one column of sorted data
LS = cumsum(wz[Index[,iFeat]]) # find offset to Least Square value for every possible threshold
if (repts[iFeat]) { # if there are repeated thresholds then keep only the last LS value of each run
mask = Mask[,iFeat] # use mask of unique samples
Col = Col[mask] # delete non-unique thresholds since they can cause errors
LS = LS [mask] # delete corresponding Least Square values
}
iLS1 = which.max(LS) # min of LS1=ls1-LS - Least Square value for every possible threshold ( f = (col<=Thresh ? 1 : -1) )
iLS2 = which.min(LS) # min of LS2=ls2+LS - Least Square value for every possible threshold after swapping output classes ( f = (col<=Thresh ? -1 : 1) )
vLS1 = ls1-LS[iLS1]
vLS2 = ls2+LS[iLS2]
if (MinLS>vLS1) { stump=c(iFeat, Col[iLS1], 1); MinLS=vLS1; }
if (MinLS>vLS2) { stump=c(iFeat, Col[iLS2], -1); MinLS=vLS2; }
}
# =================
# Fitting the tree
# =================
Stump[m,] = stump # if stump[3]>0 f(i) += (xlearn(i,stump[1])<=stump[2] ? 1 : -1)
jFeat = stump[1]
f = f + stump[3] * (2*( xlearn[,jFeat]<=stump[2] )-1) # if stump[3]<0 f(i) += (xlearn(i,stump[1])> stump[2] ? 1 : -1)
p = 1/(1+exp(-f)) # update the probabilities - range (0,1)
y = (f>0)
y[f==0] = 0.5
Conv = sum(abs(ylearn-y)) # training error count (kept for monitoring; not returned)
}
object = list(Stump=Stump, lablist=lablist) # create LogitBoost object
class(object) <- "LogitBoost"
return(object)
}
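
# A standalone check of the incremental least-squares identity used in the boosting
# loop above (a sketch with made-up data; all variable names here are illustrative):
#   set.seed(1)
#   x = rnorm(8); z = rnorm(8); w = runif(8); w = w/sum(w)
#   o     = order(x); xs = x[o]                                 # sorted thresholds
#   ls1   = sum(w*(z+1)^2)                                      # left-most threshold: all f = -1
#   fast  = ls1 - cumsum(4*w[o]*z[o])                           # incremental LS1 at every threshold xs[j]
#   brute = sapply(xs, function(t) sum(w*(z - (2*(x<=t)-1))^2)) # brute-force LS1 at every threshold
#   all.equal(fast, brute)                                      # TRUE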
predict.LogitBoost = function(object, xtest, type = c("class", "raw"), nIter=NA, ...)
# Classification of test samples using a LogitBoost model trained by the
# LogitBoost function above.
#
# Input:
# object - LogitBoost object returned by the LogitBoost function. Its Stump
# component is a matrix of decision stumps used:
# column 1: contains the feature number of each stump
# column 2: contains the threshold
# column 3: contains bigger/smaller info: 1 means (col>Thresh ? class_1 : class_2); -1 means (col>Thresh ? class_2 : class_1)
# xtest - A dataset, whose n rows contain the samples to be classified.
# type - "class" returns a vector of predicted class labels; "raw" returns the matrix of per-class probabilities.
# nIter - number of boosting iterations to use (defaults to all iterations stored in the object).
#
# Output:
# vector of predicted class labels (type="class", with NA for ties) or a matrix of class probabilities (type="raw")
#
# Written by Jarek Tuszynski - SAIC (jaroslaw.w.tuszynski@saic.com)
# This code was adapted from the logitboost.R function written by Marcel Dettling
# See "Boosting for Tumor Classification of Gene Expression Data", Dettling and Buhlmann (2002),
# available on the web page http://stat.ethz.ch/~dettling/boosting.html
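#
# Example usage (a minimal sketch written as comments, continuing the training example
# above; `model`, `Data` and `Label` are illustrative assumptions):
#   Lab  = predict(model, Data)               # predicted class labels (NA for ties)
#   Prob = predict(model, Data, type="raw")   # matrix of per-class probabilities
#   table(Label, Lab)                         # confusion matrix on the training data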
{
type = match.arg(type)
Stump = object$Stump
lablist = object$lablist
if (is.na(nIter)) nIter = nrow(Stump)
else nIter = min(nIter, nrow(Stump))
nTest = nrow(xtest)
nClass = ncol(Stump)/3
if (nClass==1) nClass=2
Prob = matrix(0,nTest, nClass)
colnames(Prob) = lablist
if (nClass>2) { # multi class problem
object$lablist = c(1,2) # generic labels
for(iClass in 1:nClass) {
object$Stump = Stump[,3*iClass + (-2:0)]
prob = predict.LogitBoost(object, xtest, type="raw", nIter=nIter)
Prob[,iClass] = prob[,1] # probability that "sample belongs to iClass" is true
}
} else { # two class problem
Mask = is.na(xtest) # any NA in test data will ...
if (any(Mask)) xtest[Mask] = Inf # ... be changed to +Inf
f = numeric(nTest)
for (iter in 1:nIter) {
iFeat = Stump[iter,1]
thresh = Stump[iter,2]
Sign = Stump[iter,3]
f = f + Sign*(2*(xtest[,iFeat]<=thresh)-1)
}
Ytest = (f>0)
Ytest[f==0] = 0.5
prob = 1/(1+exp(-f))
Prob[,1] = 1-prob # probability that "sample belongs to 1"
Prob[,2] = prob # probability that "sample belongs to 2"
}
if (type=="raw") RET=Prob # request to return raw probabilities
else { # otherwise assign labels
ord = t(apply(-Prob, 1, order))# find order of sorted Probs
RET = lablist[ord[,1]] # find label with highest Prob
Prob = -t(apply(-Prob,1,sort)) # sort Probs
RET[Prob[,1]==Prob[,2]] = NA # in case of ties return NA's
}
return (RET)
}