00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "GIS.h"
00025 #include <math.h>
00026 #include <iostream>
00027 #include <iomanip>
00028 #include <algorithm>
00029
00030
00031 #include <stdio.h>
00032
00033 using namespace std;
00034 using namespace Tanl::Text;
00035
00036 namespace Tanl {
00037 namespace Classifier {
00038
00039 const double GIS::NEAR_ZERO = 0.01;
00040 const double GIS::LLThreshold = 0.001;
00041
00059
00060
00061 GIS::GIS(int iterations, int cutoff, double alpha) :
00062 MaxEnt(iterations, cutoff), alpha(alpha)
00063 { }
00064
00065 GIS::GIS(EventStream& es, int iterations, int cutoff, double alpha) :
00066 MaxEnt(iterations, cutoff), alpha(alpha)
00067 {
00068 train(es);
00069 }
00070
00071 void GIS::train(EventStream& es)
00072 {
00073 read(es);
00074 train();
00075 }
00076
00077 void GIS::train()
00078 {
00079 counter.clear();
00080
00081
00082
00083
00084 int numEvents = buildIndex(events, predIndex, eventMap, outcomeLabels, 0,
00085 verbose);
00086
00087 numTokens = eventMap.size();
00088 numPreds = predLabels.size();
00089 numOutcomes = outcomeLabels.size();
00090
00091 if (numTokens == 0)
00092 throw EventStreamError("No events in stream");
00093
00094
00095
00096
00097
00098
00099
00100
00101 FeatureMap logObserved;
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111 correctionConstant = 0;
00112 FOR_EACH (EventMap, eventMap, it) {
00113 int evOccurred = it->second;
00114 ClassID oID = it->first.first;
00115 const vector<PID>& predicates = it->first.second;
00116 for (unsigned j = 0; j < predicates.size(); j++) {
00117 PID pID = predicates[j];
00118 Feature f(pID, oID);
00119 logObserved[f] += evOccurred;
00120
00121 lambda[f] = 0.0;
00122 }
00123 correctionConstant = MAX(correctionConstant, predicates.size());
00124 }
00125 double fSharp = correctionConstant;
00126
00127
00128
00129 Expected expected(numPreds);
00130 TO_EACH (FeatureMap, lambda, lit) {
00131 Feature const& f = lit->first;
00132 expected[f.first].push_back(make_pair(f.second, 0.0));
00133 }
00134
00135
00136 TO_EACH (FeatureMap, logObserved, oit) {
00137 oit->second = log(oit->second - alpha);
00138 }
00139
00140 # ifdef SLACK
00141
00142 int cfvalSum = 0;
00143 FOR_EACH (EventMap, eventMap, it) {
00144 int evOccurred = it->second;
00145 ClassID oID = it->first.first;
00146 const vector<PID>& predicates = it->first.second;
00147 cfvalSum += (correctionConstant - predicates.size()) * evOccurred;
00148 }
00149 cfObservedExpect = (cfvalSum == 0) ? log(NEAR_ZERO) : log(cfvalSum);
00150 # endif
00151
00152
00153
00154 double prob[numOutcomes];
00155
00156 double prevSumLogProb = 0.0;
00157 for (int i = 0; i < iterations; i++) {
00158 int correct;
00159 double sumLogProb = update(expected, prob, correct);
00160 if (verbose)
00161 cerr << setw(3) << i+1 << ": logProb = " << sumLogProb <<
00162 " correct = " << correct * 100. / numEvents << "%" << endl;
00163 if (i > 0 && sumLogProb - prevSumLogProb <= LLThreshold)
00164 break;
00165 prevSumLogProb = sumLogProb;
00166
00167
00168
00169
00170 for (PID pID = 0; pID < numPreds; pID++) {
00171 TO_EACH (Expected::value_type, expected[pID], eit) {
00172 Feature k(pID, eit->first);
00173 # ifdef SLACK
00174 lambda[k] += (logObserved[k] - log(eit->second));
00175 # else
00176 lambda[k] += (logObserved[k] - log(eit->second)) / fSharp;
00177 lambda[k] = MAX(0.0, lambda[k]);
00178 # endif
00179 eit->second = 0.0;
00180 }
00181 }
00182 }
00183
00184
00185 expected.clear();
00186 logObserved.clear();
00187 eventMap.clear();
00188 }
00189
00193 double GIS::update(Expected& expected, double prob[], int& correct)
00194 {
00195
00196
00197
00198 correct = 0;
00199 double loglikelihood = 0.0;
00200 double CFMOD = 0.0;
00201
00202
00203 FOR_EACH (EventMap, eventMap, it) {
00204 ClassID outcome = it->first.first;
00205 int evOccurred = it->second;
00206 const vector<PID>& xj = it->first.second;
00207 int oc = it->first.first;
00208
00209 ClassID best = estimate(xj, prob);
00210 if (best == outcome)
00211 correct += evOccurred;
00212
00213 for (unsigned i = 0; i < xj.size(); i++) {
00214 int pID = xj[i];
00215 TO_EACH (Expected::value_type, expected[pID], eit) {
00216 ClassID c = eit->first;
00217 double d = prob[c] * evOccurred;
00218 eit->second += d;
00219 }
00220 # ifdef SLACK
00221 for (ClassID oid = 0; oid < numOutcomes; oid++) {
00222 if (lambda.find(Feature(pID, oid)) == lambda.end())
00223 CFMOD += prob[oid] * evOccurred;
00224 }
00225 # endif
00226 }
00227 loglikelihood += log(prob[oc]) * evOccurred;
00228 # ifdef SLACK
00229 CFMOD += (correctionConstant - xj.size()) * evOccurred;
00230 # endif
00231 }
00232 # ifdef SLACK
00233 if (CFMOD > 0.0)
00234 correctionParam += cfObservedExpect - log(CFMOD);
00235 # endif
00236 return loglikelihood;
00237 }
00238
00239 }
00240 }