File: LogitBoost.R

Package: catools 1.12-2 (Debian main, suite: wheezy)

#===========================================================================#
# caTools - R library                                                       #
# Copyright (C) 2005 Jarek Tuszynski                                        #
# Distributed under GNU General Public License version 3                    #
#===========================================================================#

LogitBoost = function(xlearn, ylearn, nIter=ncol(xlearn))
#   An implementation of the LogitBoost classification algorithm with
#   decision stumps as weak learners. 
#   
# Input:
#   xlearn - A dataset, whose n rows contain the training instances.
#   ylearn - Class labels of the training dataset.
#   nIter - An integer, describing the number of iterations for
#     which boosting should be run. 
#     
# Output:
#   Stump - matrix of decision stumps used, one row per boosting iteration:
#            column 1: feature number used by each stump
#            column 2: threshold
#            column 3: bigger/smaller info: 1 means (col>thresh ? class_1 : class_2);  -1 means (col>thresh ? class_2 : class_1)
#
# Written by Jarek Tuszynski - SAIC (jaroslaw.w.tuszynski@saic.com)
# This code was adapted from the logitboost.R function written by Marcel Dettling.
# See "Boosting for Tumor Classification of Gene Expression Data", Dettling and Buhlmann (2002), 
#   available on the web page http://stat.ethz.ch/~dettling/boosting.html
{
  lablist = sort(unique(ylearn))     # different classes in label array
  nClass  = length(lablist)          # number of different classes in label array
  
  if (nClass>2) {                    # Multi-class version calls the 2-class code ...
    Stump = NULL                     # ... recursively, once per class (one-vs-rest)
    for (jClass in 1:nClass) {
      y = as.numeric(ylearn!=lablist[jClass]) # lablist[jClass]->0; rest->1
      Stump = cbind(Stump, LogitBoost(xlearn, y, nIter)$Stump)
    }
    object = list(Stump=Stump, lablist=lablist) # create LogitBoost object
    class(object) <- "LogitBoost"
    return(object)
  }
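  # Note on the multi-class branch above: the Stump matrices returned by the
  # one-vs-rest sub-classifiers are bound column-wise, so for K classes the
  # final Stump has 3*K columns and columns (3*j-2):(3*j) belong to lablist[j];
  # predict.LogitBoost() below relies on this layout (Stump[, 3*iClass + (-2:0)]).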
  
  Mask = is.na(xlearn)                # any NA in the training data will ... 
  if (any(Mask)) xlearn[Mask] = Inf   # ... be changed to +Inf
  ylearn = as.numeric(ylearn!=lablist[1]) # change class labels to 0/1: lablist[1]->0, lablist[2]->1
  nLearn = nrow(xlearn)               # number of training samples         
  nFeat  = ncol(xlearn)               # number of features to choose from         
 
  # Array Initialization
  f      = 0                          # additive classifier score (sum of stump outputs)          
  p      = numeric(nLearn)+1/2        # class probabilities, range (0,1); start at 1/2
  Stump  = matrix(0, nIter,3)         # will hold the results
  colnames(Stump) = c("feature", "threshold", "sign")
  Thresh =            matrix(0, nLearn, nFeat)  # sorted xlearn, one feature (column) at a time
  Index  = matrix(as.integer(0), nLearn, nFeat) # order of samples in Thresh
  Mask   = matrix(as.logical(0), nLearn, nFeat) # marks the last occurrence of each distinct value in Thresh 
  repts  = as.logical(numeric(nFeat))           # for each feature: TRUE if the column contains repeated values
 
  # sort all columns and store them in Thresh; Index stores original positions
  for (iFeat in 1:nFeat) {  
    x = sort(xlearn[,iFeat], index.return=TRUE)          # sorted values and their original positions
    Thresh[,iFeat] = x[[1]]
    Index [,iFeat] = x[[2]]
    Mask  [,iFeat] = c((diff(Thresh[,iFeat])!=0), TRUE)  # TRUE at the last occurrence of each distinct value
    repts [ iFeat] = !all(Mask[,iFeat])                  # TRUE if this column contains repeated values
  }
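  # Illustrative sketch (shown as comments, not executed): for a single column
  # equal to c(5, 3, 5, 1) the pre-computation above gives, assuming a stable sort:
  #   Thresh column: 1 3 5 5   (sorted values, the candidate thresholds)
  #   Index  column: 4 2 1 3   (original positions of the sorted values)
  #   Mask   column: T T F T   (TRUE at the last occurrence of each distinct value)
  #   repts        : TRUE      (the column contains a repeated value, 5)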

  # Boosting Iterations
  jFeat = 0
  for (m in 1:nIter) {
    # Computation of working response and weights
    w = pmax(p*(1-p), 1e-24) # weight for each sample (bounded away from zero)
    z = (ylearn-p)/w         # working response
    w = w/sum(w)             # normalize weights to sum to 1
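    # (These are the standard LogitBoost Newton-step quantities, w_i = p_i*(1-p_i)
    # and z_i = (y_i - p_i)/(p_i*(1-p_i)), from Friedman, Hastie & Tibshirani (2000),
    # "Additive Logistic Regression: A Statistical View of Boosting".)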

    # older version similar to rpart (more readable but slower) left as documentation
      # MinLS = 1000                  # initialize search for minimum Least-Square
      # for (iFeat in 1:nFeat) {      # for each feature: for each splitting point calculate the least-square regression of z on xlearn with weights w
        # Col = xlearn[,iFeat]        # one column of the data
        # thr = Thresh[,iFeat]        # the same column sorted, used as candidate thresholds
        # for (j in 1:nLearn) {       # for every possible threshold
          # ff = 2*(Col<=thr[j])-1    # for every point in the column: if (col<=thresh) then f=1 else f=-1 
          # LS1[j] = sum(w*(z-ff)^2)  # Least-Square value for every possible threshold  ( f = (col<=thresh ?  1 : -1) )
          # LS2[j] = sum(w*(z+ff)^2)  # Least-Square value for every possible threshold after swapping output classes ( f = (col<=thresh ? -1 : 1) )
        # }
        # iLS1 = which.min(LS1)
        # iLS2 = which.min(LS2)
        # vLS1 = LS1[iLS1]
        # vLS2 = LS2[iLS2]      
        # if (MinLS>vLS1) { stump=c(iFeat, thr[iLS1],  1); MinLS=vLS1; }
        # if (MinLS>vLS2) { stump=c(iFeat, thr[iLS2], -1); MinLS=vLS2; }
      # }
      
    # Same as the code above but faster:
    # For each feature and each splitting point, calculate the least-square regression of z on the stump output, with weights w.
    # This part of the code is my version of an rpart-like stump search.
    # LS1 is the least-square array LS1 = sum(w*(z-f)^2) where f = (col<=sp ?  1 : -1), calculated for all splitting points sp.
    # LS2 is the least-square array LS2 = sum(w*(z-f)^2) where f = (col<=sp ? -1 :  1), calculated for all splitting points sp.
    # Moving the splitting point past sorted sample col(idx(i)) flips the sign of f for that one sample, which changes the current least square by:
    # LS(i) = LS(i-1) - w(idx(i))*( z(idx(i)) - (-1) )^2 + w(idx(i))*( z(idx(i)) - 1 )^2, which simplifies to:
    # LS(i) = LS(i-1) - 4*w(idx(i))*z(idx(i))
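    # Worked example of the update above (illustrative, made-up numbers): for one
    # sample with w = 0.2 and z = 1.5, flipping its stump output from f = -1 to
    # f = +1 changes its term from 0.2*(1.5+1)^2 = 1.25 to 0.2*(1.5-1)^2 = 0.05,
    # a drop of 1.2 = 4*0.2*1.5, i.e. exactly 4*w*z.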
    ls1 = sum(w*(z+1)^2)     # Least-Square value for the left-most threshold ( f = -1 for all samples )
    ls2 = sum(w*(z-1)^2)     # Least-Square value for the left-most threshold after swapping output classes ( f = +1 for all samples )
    MinLS = max(ls1,ls2)     # initialize search for minimum Least-Square
    wz  = 4 * w * z          # precompute vector to be used later
    for (iFeat in 1:nFeat) { # for each feature: for each splitting point calculate the least-square regression of z on xlearn with weights w
      if (iFeat==jFeat) next # prevent the simplest cycle: do not reuse the feature picked in the previous iteration
      Col = Thresh[,iFeat]   # get one column of sorted data
      LS  = cumsum(wz[Index[,iFeat]])  # offsets to the Least-Square value for every possible threshold
      if (repts[iFeat]) {    # if there are repeated thresholds then keep only the last LS value of each
        mask = Mask[,iFeat]  # use mask of unique samples 
        Col = Col[mask]      # delete non-unique thresholds since they can cause errors
        LS  = LS [mask]      # delete corresponding Least-Square values         
      }
      iLS1 = which.max(LS)   # minimum of LS1 = ls1-LS - Least-Square value for every possible threshold  ( f = (col<=thresh ?  1 : -1) )
      iLS2 = which.min(LS)   # minimum of LS2 = ls2+LS - Least-Square value for every possible threshold after swapping output classes ( f = (col<=thresh ? -1 : 1) )
      vLS1 = ls1-LS[iLS1]
      vLS2 = ls2+LS[iLS2]
      if (MinLS>vLS1) { stump=c(iFeat, Col[iLS1],  1); MinLS=vLS1; }
      if (MinLS>vLS2) { stump=c(iFeat, Col[iLS2], -1); MinLS=vLS2; }
    }
 
    # =================
    # Fitting the stump
    # =================
    Stump[m,] = stump                                     # store the stump:  if stump[3]>0  f(i) += (xlearn(i,stump[1])<=stump[2] ? 1 : -1)
    jFeat = stump[1]                                      # remember the feature used in this iteration
    f = f + stump[3] * (2*( xlearn[,jFeat]<=stump[2] )-1) #                   if stump[3]<0  f(i) += (xlearn(i,stump[1])> stump[2] ? 1 : -1)
    p = 1/(1+exp(-f))    # update the probabilities - range (0,1)
    
    y = (f>0)                 # hard class decision for each training sample
    y[f==0] = 0.5             # treat ties as half an error
    Conv = sum(abs(ylearn-y)) # training error count (computed for monitoring only; not returned)
  }
  object = list(Stump=Stump, lablist=lablist)  # create LogitBoost object
  class(object) <- "LogitBoost"
  return(object)
}
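
# Example usage of LogitBoost() - an illustrative sketch, shown as comments so
# that nothing is executed when this file is sourced; it assumes the iris
# dataset that ships with R:
#   data(iris)
#   Data  = iris[, -5]                        # 4 numeric features
#   Label = iris[,  5]                        # 3 class labels -> one-vs-rest mode
#   model = LogitBoost(Data, Label, nIter=20)
#   model$Stump                               # 20 x 9 matrix: 3 columns per class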


predict.LogitBoost = function(object, xtest, type = c("class", "raw"), nIter=NA, ...)
#   Prediction function for the LogitBoost classifier trained by the
#   LogitBoost() function above.
#   
# Input:
#   object - LogitBoost object returned by LogitBoost(); its Stump component is
#            a matrix of decision stumps:
#            column 1: feature number used by each stump
#            column 2: threshold
#            column 3: bigger/smaller info: 1 means (col>Thresh ? class_1 : class_2);  -1 means (col>Thresh ? class_2 : class_1)
#   xtest  - A dataset, whose n rows contain the samples to be classified.
#   type   - "class" returns predicted class labels; "raw" returns the matrix of class probabilities.
#   nIter  - optional number of boosting iterations to use (defaults to all stored stumps).
#
# Output:
#   vector of predicted class labels (NA in case of ties) for type="class", or a
#   matrix of class probabilities with one column per class for type="raw".
#
# Written by Jarek Tuszynski - SAIC (jaroslaw.w.tuszynski@saic.com)
# This code was adapted from the logitboost.R function written by Marcel Dettling.
# See "Boosting for Tumor Classification of Gene Expression Data", Dettling and Buhlmann (2002), 
#   available on the web page http://stat.ethz.ch/~dettling/boosting.html
{
  type    = match.arg(type)
  Stump   = object$Stump
  lablist = object$lablist
  if (is.na(nIter)) nIter = nrow(Stump)
  else nIter  =  min(nIter, nrow(Stump))
  nTest   = nrow(xtest)
  nClass  = ncol(Stump)/3
  if (nClass==1) nClass=2
  Prob    = matrix(0,nTest, nClass)
  colnames(Prob) = lablist
  if (nClass>2) {                     # multi class problem
    object$lablist = c(1,2)           # generic labels 
    for(iClass in 1:nClass) {
      object$Stump = Stump[,3*iClass + (-2:0)]
      prob = predict.LogitBoost(object, xtest, type="raw", nIter=nIter)
      Prob[,iClass] = prob[,1] # probability that "sample belongs to iClass" is true 
    }
  } else {                            # two class problem
    Mask    = is.na(xtest)            # any NA in the test data will ... 
    if (any(Mask)) xtest[Mask] = Inf  # ... be changed to +Inf
    f = numeric(nTest)           
    for (iter in 1:nIter) {
      iFeat  = Stump[iter,1]
      thresh = Stump[iter,2]
      Sign   = Stump[iter,3]
      f = f + Sign*(2*(xtest[,iFeat]<=thresh)-1)
    }
    Ytest = (f>0)                # hard class decision (not used below)
    Ytest[f==0] = 0.5
    prob = 1/(1+exp(-f))         # probability of the second class
    Prob[,1] = 1-prob # probability that "sample belongs to 1" 
    Prob[,2] =   prob # probability that "sample belongs to 2" 
  }
  if (type=="raw") RET=Prob        # request to return raw probabilities
  else {                           # otherwise assign labels
    ord = t(apply(-Prob, 1, order))# find order of sorted Probs 
    RET = lablist[ord[,1]]         # find label with highest Prob
    Prob = -t(apply(-Prob,1,sort)) # sort Probs
    RET[Prob[,1]==Prob[,2]] = NA   # in case of ties return NA's
  }
  return (RET)
}
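
# Example usage of predict.LogitBoost() - an illustrative sketch continuing the
# commented LogitBoost() example above:
#   Lab  = predict(model, Data)               # predicted class labels (NA on ties)
#   Prob = predict(model, Data, type="raw")   # matrix of per-class probabilities
#   table(Label, Lab)                         # confusion matrix on the training data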