00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "MaxEnt.h"
00025
00026
00027 #include <math.h>
00028 #include <fcntl.h>
00029 #include <sys/stat.h>
00030 #ifdef _WIN32
00031 #include <windows.h>
00032 #include <float.h>
00033 #include "lib/strtok_r.h"
00034 #define isinf(x) (_isnan(x) ? 0 : (_fpclass(x) == _FPCLASS_NINF) ? -1 : (_fpclass(x) == _FPCLASS_PINF) ? 1 : 0)
00035 #else
00036 #include <sys/mman.h>
00037 #endif
00038
00039 #include <fstream>
00040 #include <iostream>
00041 #include <iomanip>
00042 #include <map>
00043 #include <assert.h>
00044 #include <limits.h>
00045 #include <algorithm>
00046
00047 using namespace std;
00048
00049 namespace Tanl {
00050 namespace Classifier {
00051
00052 MaxEnt::MaxEnt(char const* file)
00053 {
00054 ifstream ifs(file);
00055 load(ifs);
00056 }
00057
00058 MaxEnt::MaxEnt(istream& ifs)
00059 {
00060 load(ifs);
00061 }
00062
00063 MaxEnt::~MaxEnt()
00064 {
00065 FOR_EACH (vector<char const*>, outcomeLabels, it)
00066 free((void*)*it);
00067 }
00068
00069 void MaxEnt::read(EventStream& eventStream)
00070 {
00071
00072
00073 int evCount = events.size();
00074 while (eventStream.hasNext()) {
00075 if (verbose) {
00076 if (++evCount % 10000 == 0)
00077 cerr << '+' << flush;
00078 else if (evCount % 1000 == 0)
00079 cerr << '.' << flush;
00080 }
00081 Event* ev = eventStream.next();
00082 events.push_back(ev);
00083 readEvent(ev);
00084 }
00085 }
00086
00087 void MaxEnt::readEvent(Event* ev)
00088 {
00089 # ifdef DEBUG
00090 cerr << *ev << endl;
00091 # endif
00092 Features& ec = ev->features;
00093
00094 for (unsigned j = 0; j < ec.size(); j++) {
00095 string& pred = ec[j];
00096
00097 if (predIndex.find(pred.c_str()) == predIndex.end()) {
00098
00099 WordCounts::iterator wcit = counter.find(pred);
00100
00101 unsigned count;
00102 if (wcit == counter.end())
00103 count = counter[pred] = 1;
00104 else
00105 count = ++wcit->second;
00106 if (count >= cutoff) {
00107 predLabels.push_back(pred);
00108 predIndex[pred.c_str()] = pID++;
00109 counter.erase(pred);
00110 }
00111 }
00112 }
00113 }
00114
00115 #define VISIBLE
00116 #ifdef VISIBLE
00117 #define LOAD(var, type) is >> var
00118 #define SAVE(ofs, var) ofs << var << endl;
00119 #else
00120 #define LOAD(var, type) is.read(line, sizeof(type)); var = *(type*)line;
00121 #define SAVE(ofs, var) ofs.write((char*)&var, sizeof(var))
00122 #endif
00123
00124 typedef map<vector<ClassID>, vector<PID> > Groups;
00125
00180 void MaxEnt::save(char const* file)
00181 {
00182 ofstream ofs(file, ios::binary | ios::trunc);
00183
00184
00185 SAVE(ofs, "GIS");
00186
00187
00188 SAVE(ofs, correctionConstant);
00189
00190
00191 SAVE(ofs, correctionParam);
00192
00193
00194 SAVE(ofs, numOutcomes);
00195 for (ClassID i = 0; i < numOutcomes; i++)
00196 SAVE(ofs, outcomeLabels[i]);
00197
00198
00199 Groups groups;
00200 for (PID pid = 0; pid < numPreds; pid++) {
00201
00202 vector<ClassID> outcomes;
00203 for (ClassID oid = 0; oid < numOutcomes; oid++) {
00204 FeatureMap::const_iterator rit = lambda.find(make_pair(pid, oid));
00205 if (rit != lambda.end())
00206 outcomes.push_back(oid);
00207 }
00208 groups[outcomes].push_back(pid);
00209 }
00210
00211 SAVE(ofs, numPreds);
00212 FOR_EACH (Groups, groups, git) {
00213 FOR_EACH (vector<PID>, git->second, pids) {
00214 SAVE(ofs, predLabels[*pids]);
00215 }
00216 }
00217
00218 unsigned numGroups = groups.size();
00219 SAVE(ofs, numGroups);
00220 FOR_EACH (Groups, groups, git) {
00221
00222 unsigned gSize = git->second.size();
00223 SAVE(ofs, gSize);
00224
00225 unsigned numOc = git->first.size();
00226 SAVE(ofs, numOc);
00227 FOR_EACH (vector<ClassID>, git->first, oids) {
00228
00229 SAVE(ofs, *oids);
00230 FOR_EACH (vector<PID>, git->second, pids) {
00231 double param = lambda[make_pair(*pids, *oids)];
00232 SAVE(ofs, param);
00233 }
00234 }
00235 }
00236 }
00237
00238 #define LINE 16384
00239
00240 #define HEADER "#txt,maxent"
00241
00245 void MaxEnt::load(istream& is)
00246 {
00247 char line[LINE];
00248 is >> line;
00249 if (::strncmp(line, HEADER, sizeof(HEADER)-1) == 0) {
00250 correctionConstant = 1;
00251 correctionParam = 0.0;
00252 loadZhang(is);
00253 return;
00254 } else if (::strcmp(line, "GIS"))
00255 throw FileError("wrong file format");
00256
00257
00258 LOAD(correctionConstant, int);
00259
00260
00261 LOAD(correctionParam, double);
00262
00263
00264 LOAD(numOutcomes, int);
00265 outcomeLabels.resize(numOutcomes);
00266 for (ClassID i = 0; i < numOutcomes; i++) {
00267 is >> line;
00268 outcomeLabels[i] = strdup(line);
00269 }
00270
00271
00272 LOAD(numPreds, int);
00273 predLabels.resize(numPreds);
00274 for (PID i = 0; i < numPreds; i++) {
00275 is >> line;
00276 predIndex[line] = i;
00277 predLabels[i] = line;
00278 }
00279
00280 PID pid = 0;
00281
00282 int numGroups;
00283 LOAD(numGroups, int);
00284 while (numGroups--) {
00285
00286 int gSize;
00287 LOAD(gSize, int);
00288
00289 int gOutcomes;
00290 LOAD(gOutcomes, int);
00291 while (gOutcomes--) {
00292
00293 ClassID oid;
00294 LOAD(oid, ClassID);
00295 for (int i = 0; i < gSize; i++) {
00296
00297 double param;
00298 LOAD(param, double);
00299 lambda[make_pair((PID)(pid + i), oid)] = param;
00300 }
00301 }
00302 pid += gSize;
00303 }
00304 }
00305
00306 void MaxEnt::loadZhang(istream& f)
00307 {
00308 char line[LINE];
00309
00310
00311 do {
00312 f.getline(line, LINE);
00313 } while (line[0] == '\0' || line[0] == '#');
00314
00315
00316 numPreds = atoi(line);
00317 predLabels.resize(numPreds);
00318 for (size_t i = 0; i < numPreds; ++i) {
00319 f.getline(line, LINE);
00320 predIndex[line] = i;
00321 predLabels[i] = line;
00322 }
00323
00324
00325 f.getline(line, LINE);
00326 numOutcomes = atoi(line);
00327 outcomeLabels.resize(numOutcomes);
00328 for (size_t i = 0; i < numOutcomes; ++i) {
00329 f.getline(line, LINE);
00330 outcomeLabels[i] = strdup(line);
00331 }
00332
00333
00334 size_t fid = 0;
00335 vector<pair<ClassID, size_t> > pairs;
00336 for (unsigned i = 0; i < numPreds; ++i) {
00337 f.getline(line, LINE);
00338 char* next = line;
00339 char const* tok = strtok_r(0, " ", &next);
00340
00341 while ((tok = strtok_r(0, " ", &next))) {
00342 ClassID oid = atoi(tok);
00343 pairs.push_back(make_pair(i, oid));
00344 fid++;
00345 }
00346 }
00347
00348
00349 f.getline(line, LINE);
00350 int m_n_theta = atoi(line);
00351 assert(fid == m_n_theta);
00352
00353 for (int i = 0; i < m_n_theta && f.getline(line, LINE); i++)
00354 lambda[pairs[i]] = atof(line);
00355 }
00356
00363 double sumLogProb(double logprobs[], int n)
00364 {
00365 double max = 0;
00366 int i;
00367 for (i = 0; i < n; i++) {
00368 if (i==0 || logprobs[i] > max)
00369 max = logprobs[i];
00370 }
00371 if (isinf(max))
00372 return max;
00373 double p = 0;
00374 for (i = 0; i < n; i++) {
00375 p += exp(logprobs[i] - max);
00376 }
00377 return max + log(p);
00378 }
00379
00380 ClassID MaxEnt::estimate(const std::vector<PID>& predicates, double prob[])
00381 {
00382
00383
00384
00385
00386
00387
00388 double iprob = 1.0 / numOutcomes;
00389 double fSharp = correctionConstant;
00390 int numFeats[numOutcomes];
00391 for (ClassID oid = 0; oid < numOutcomes; oid++) {
00392 # ifdef SLACK
00393 prob[oid] = iprob;
00394 numFeats[oid] = 0;
00395 # else
00396 prob[oid] = 0;
00397 # endif
00398 for (unsigned i = 0; i < predicates.size(); i++) {
00399 PID pID = predicates[i];
00400 FeatureMap::const_iterator rit = lambda.find(make_pair(pID, oid));
00401 if (rit != lambda.end()) {
00402 # ifdef SLACK
00403 prob[oid] += rit->second / fSharp;
00404 numFeats[oid]++;
00405 # else
00406 prob[oid] += rit->second;
00407 # endif
00408 }
00409 }
00410 }
00411
00412 ClassID best = 0;
00413 double max_prob = 0.0;
00414 double sum = 0.0;
00415 for (ClassID oid = 0; oid < numOutcomes; oid++) {
00416 # ifdef SLACK
00417 prob[oid] = exp(prob[oid] +
00418 (1.0 - (double)numFeats[oid] / correctionConstant)
00419 * correctionParam);
00420 # else
00421 prob[oid] = exp(prob[oid]);
00422 # endif
00423 sum += prob[oid];
00424 if (prob[oid] >= max_prob) {
00425 max_prob = prob[oid];
00426 best = oid;
00427 }
00428 }
00429 for (ClassID oid = 0; oid < numOutcomes; oid++) {
00430 prob[oid] /= sum;
00431 }
00432 return best;
00433 }
00434
00435 ClassID MaxEnt::BestOutcome(double* ocs) const
00436 {
00437 ClassID best = 0;
00438 for (ClassID i = 1; i < numOutcomes; i++)
00439 if (ocs[i] > ocs[best]) best = i;
00440 return best;
00441 }
00442
00443 ostream& operator <<(ostream& s, MaxEnt const& m)
00444 {
00445 FOR_EACH (FeatureMap, m.lambda, fit) {
00446 Feature const& f = fit->first;
00447 PID pid = f.first;
00448 ClassID cid = f.second;
00449 s << "lambda(" << m.OutcomeName(cid) << ", " << m.PredicateName(pid) << ")="
00450 << setprecision(4) << fit->second << endl;
00451 }
00452 return s;
00453 }
00454
00455
00456 struct clearedContext {
00457 bool operator()(const vector<PID>& c) const {
00458 return c.empty();
00459 }
00460 };
00461
00462 struct clearedNumber {
00463 bool operator()(const unsigned& n) const {
00464 return n == UINT_MAX;
00465 }
00466 };
00467
00468
00469
00470 int MaxEnt::buildIndex(list<Event*>& events,
00471 Text::WordIndex& predIndex,
00472 EventMap& eventMap,
00473 vector<char const*>& outcomeLabels,
00474 int evCutoff, bool verbose)
00475 {
00476 vector<PID> pIDs;
00477 int evCount = 0;
00478 if (verbose)
00479 cerr << endl;
00480 while (!events.empty()) {
00481 if (verbose) {
00482 if (++evCount % 10000 == 0)
00483 cerr << '+' << flush;
00484 else if (evCount % 1000 == 0)
00485 cerr << '.' << flush;
00486 }
00487 Event* ev = events.front();
00488 events.pop_front();
00489
00490
00491 pIDs.clear();
00492 FOR_EACH (Features, ev->features, pit) {
00493 Text::WordIndex::const_iterator it = predIndex.find(pit->c_str());
00494 if (it != predIndex.end())
00495 pIDs.push_back(it->second);
00496 }
00497
00498 size_t k = pIDs.size();
00499 if (k == 0 && !ev->features.empty()) {
00500
00501 cerr << "Dropped event [";
00502 FOR_EACH (Features, ev->features, pit)
00503 cerr << *pit << " ";
00504 cerr << "]" << endl;
00505 } else {
00506
00507
00508 ClassName& className = ev->className;
00509 ClassID ocID = (ClassID)-1;
00510 FOR_EACH (vector<char const*>, outcomeLabels, cit)
00511 if (*cit == className) {
00512 ocID = cit - outcomeLabels.begin();
00513 break;
00514 }
00515 if (ocID == (ClassID)-1) {
00516 ocID = outcomeLabels.size();
00517 outcomeLabels.push_back(::strdup(className.c_str()));
00518 }
00519
00520
00521 sort(pIDs.begin(), pIDs.end());
00522 eventMap[make_pair(ocID, pIDs)]++;
00523 }
00524 delete ev;
00525 }
00526 if (verbose)
00527 cerr << endl;
00528 int numEvents = 0;
00529
00530 for (EventMap::iterator it = eventMap.begin(); it != eventMap.end(); ) {
00531 int count = it->second;
00532 if (count < evCutoff)
00533 eventMap.erase(it);
00534 else {
00535 numEvents += count;
00536 ++it;
00537 }
00538 }
00539 if (verbose) {
00540 cerr << "\t Number of events: " << evCount << endl;
00541 cerr << "\t Unique events: " << eventMap.size() << endl;
00542 cerr << "\t Number of Classes: " << outcomeLabels.size() << endl;
00543 cerr << "\tNumber of Predicates: " << predIndex.size() << endl;
00544 }
00545 return numEvents;
00546 }
00547
00548 }
00549 }