00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
#include "Parser.h"
#include "ReviseEventStream.h"
#include "Corpus.h"
#include "version.h"
#include "conf_Replacements.h"


#include "ixe/include/Timer.h"
#include "io/Format.h"


#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#ifdef _WIN32
# include <io.h>
# include <fcntl.h>
#else
# include <sys/resource.h>
#endif
00042
00043 using namespace std;
00044 using namespace IXE;
00045 using IXE::io::Format;
00046
00047 #define MAX_LINE_LEN 8196
00048
00049 namespace Parser {
00050
// Run-time configuration variables; they are serialized into the model
// header by writeHeader() and reloaded by readHeader().
conf<string> algorithm("Algorithm", "SVM");
conf<string> lang("Language", "en");
conf<string> fileVersion("Version", version);
conf<int> beam("Beam", 1, 1);

// Lemma-normalization replacement rules, applied by normalizeLemma()
// during preprocess().
conf<Replacements> normLemma("LemmaReplace");

// Cutoff on feature frequency; presumably consumed by the learners --
// usage not visible in this file, TODO confirm.
conf<int> Parser::featureCutoff("FeatureCutoff", 0);

// Forms/lemmas occurring fewer than this many times are replaced with
// "#UNKNOWN" in collectSentences().
conf<int> Parser::lexCutoff("LexCutoff", 0);

conf<bool> Parser::verbose("Verbose", false);

// A lemma counted as both time and location is kept only on the side
// whose count dominates by this ratio (see GlobalInfo::clearRareEntities).
float const GlobalInfo::freqRatio = 1.5;

// Debugging flag; set elsewhere -- usage not visible here, TODO confirm.
bool showTreelets = false;
00069
// Accessor for the global registry of parser factories, keyed by
// algorithm name.  Function-local static avoids the static
// initialization order fiasco for registrants.
// NOTE(review): the key type is char const*, whose default ordering
// compares pointer values, not string contents -- lookups through
// map::find only match the exact pointer used at registration.
std::map<char const*, ParserFactory*>& ParserMap::get()
{
    static std::map<char const*, ParserFactory*> parserFor;
    return parserFor;
}
00082
00083 ParserFactory* ParserMap::get(char const* type)
00084 {
00085 std::map<char const*, ParserFactory*>& parserMap = get();
00086 map<char const*, ParserFactory*>::const_iterator rit = parserMap.find(type);
00087 return (rit == parserMap.end()) ? 0 : rit->second;
00088 }
00089
00090 Parser::~Parser() { }
00091
00092 Parser* Parser::create(char const* modelFile)
00093 {
00094 if (modelFile) {
00095
00096 ifstream ifs(modelFile);
00097 if (!ifs) {
00098 cerr << "Missing model file: " << modelFile << endl;
00099 return 0;
00100 }
00101 ::Parser::Parser::readHeader(ifs);
00102 ifs.close();
00103 }
00104
00105 ParserFactory* factory = ParserMap::get(algorithm->c_str());
00106 if (!factory) {
00107 cerr << "No such algorithm: " << *algorithm << endl;
00108 return 0;
00109 }
00110 ::Parser::Parser* parser = factory(modelFile);
00111 if (parser == 0) {
00112 cerr << "Could not load: " << modelFile << endl;
00113 return 0;
00114 }
00115 return parser;
00116 }
00117
00118 Enumerator<Sentence*>* Parser::pipe(Enumerator<std::vector<Token*>*>& tve)
00119 {
00120 return new ParserPipe(*this, tve);
00121 }
00122
00123 Enumerator<Sentence*>* Parser::pipe(Enumerator<Sentence*>& tve)
00124 {
00125 return new ParserSentPipe(*this, tve);
00126 }
00127
00128 void Parser::parse(SentenceReader* reader, ostream& os)
00129 {
00130 int las = 0;
00131 int uas = 0;
00132 int tot = 0;
00133 int sent = 0;
00134 # ifdef _WIN32
00135 if (os == cout)
00136 _setmode(_fileno(stdout), _O_BINARY);
00137 # endif
00138
00139 while (reader->MoveNext()) {
00140 Sentence* sin = reader->Current();
00141 Sentence* sout = parse(sin);
00142 reader->corpus->print(os, *sout);
00143 os << endl;
00144 ++sent;
00145
00146 Sentence::const_iterator soutIt = sout->begin();
00147 FOR_EACH (Sentence, *sin, sinIt) {
00148 TreeToken* tin = *sinIt;
00149 TreeToken* tout = *soutIt++;
00150 tot++;
00151 if (tin->linkHead() == tout->linkHead()) {
00152 uas++;
00153 if (tin->linkLabel() == tout->linkLabel())
00154 las++;
00155 }
00156 }
00157 delete sin;
00158 delete sout;
00159 }
00160 showEval(tot, las, uas, sent);
00161 }
00162
00163 void Parser::writeHeader(ostream& os)
00164 {
00165 os << "<desr alg='" << *algorithm << "' version='" << version << "'>" << endl;
00166 Configuration::Map& vars = Configuration::variables();
00167 FOR_EACH (Configuration::Map, vars, vit)
00168 vit->second->serialize(os);
00169 os << "</desr>" << endl;
00170 }
00171
// Reset the global configuration and reload it from a model header
// stream (the counterpart of writeHeader).
// Always returns true; presumably Configuration::load reports errors
// itself -- TODO confirm.
bool Parser::readHeader(istream& is)
{
    Configuration::reset();
    Configuration::load(is);
    return true;
}
00178
00179 void Parser::showEval(int tokenCount, int las, int uas, int sentCount)
00180 {
00181 cerr << Format("LAS: %.2f %% (%d/%d)", (100. * las)/tokenCount, las, tokenCount) << endl;
00182 cerr << Format("UAS: %.2f %% (%d/%d)", (100. * uas)/tokenCount, uas, tokenCount) << endl;
00183 cerr << "Sentences: " << sentCount << endl;
00184 # ifdef ORACLE
00185 cerr << Format("Oracle: %.2f %% (%d/%d)", (100. * oracleCorrect)/oracleCount, oracleCorrect, oracleCount) << endl;
00186 # endif
00187 }
00188
00189 static Timer timer;
00190
// Render a duration in seconds as a compact human-readable string:
// calendar units first ("1week 2day"), then a clock part "h:mm:ss"
// omitting leading zero fields.  Returns "0 s" for zero (or, defensively,
// negative) input.
//
// Fixes over the previous version:
//  - clock fields after the first are zero-padded and no longer skipped
//    when zero, so 3661 -> "1:01:01" and 60 -> "1:00" instead of the
//    ambiguous "1:1" / "1" (61s and 3601s used to render identically);
//  - negative input no longer produces garbage.
std::string secs_to_h(int secs)
{
    if (secs <= 0)
        return "0 s";

    std::string s;
    char buf[32];

    // Calendar part: weeks and days, each skipped when zero.
    static const struct { char const* name; int size; } calendar[] = {
        { "week", 7 * 24 * 3600 },
        { "day",  24 * 3600 }
    };
    for (int i = 0; i < 2; i++) {
        int quot = secs / calendar[i].size;
        if (quot) {
            secs -= quot * calendar[i].size;
            snprintf(buf, sizeof(buf), "%d%s", quot, calendar[i].name);
            s += buf;
            if (secs)
                s += ' ';
        }
    }

    // Clock part: h:mm:ss; once a higher field is printed, lower ones
    // are always printed, zero-padded to two digits.
    int hours = secs / 3600;
    int minutes = secs % 3600 / 60;
    int seconds = secs % 60;
    bool started = false;
    if (hours) {
        snprintf(buf, sizeof(buf), "%d", hours);
        s += buf;
        started = true;
    }
    if (started || minutes) {
        if (started)
            s += ':';
        snprintf(buf, sizeof(buf), started ? "%02d" : "%d", minutes);
        s += buf;
        started = true;
    }
    if (started || seconds) {
        if (started)
            s += ':';
        snprintf(buf, sizeof(buf), started ? "%02d" : "%d", seconds);
        s += buf;
    }
    return s;
}
00219
00220
// Return a one-line summary of process resource usage: CPU time,
// wall-clock time since program start, CPU utilisation percentage and
// peak resident memory.
string Parser::procStat()
{
# ifdef _WIN32
    FILETIME starttime;
    FILETIME exittime;
    FILETIME kerneltime;
    FILETIME usertime;
    GetProcessTimes(GetCurrentProcess(),
                    &starttime, &exittime, &kerneltime, &usertime);
    // NOTE(review): type-punning FILETIME through long long* violates
    // strict aliasing; MSDN documents ULARGE_INTEGER (or memcpy) as the
    // safe conversion -- works in practice on MSVC, but worth fixing.
    long long ktime = *(long long*)&kerneltime;
    long long utime = *(long long*)&usertime;
    // FILETIME ticks are 100 ns -> divide by 1e7 for seconds.
    int procSec = (ktime + utime) / 10000000L;
    SIZE_T minRSS, maxRSS;
    GetProcessWorkingSetSize(GetCurrentProcess(), &minRSS, &maxRSS);
# else
    struct rusage rusage;
    getrusage(RUSAGE_SELF, &rusage);
    int procSec = rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec;
    // NOTE(review): ru_maxrss is kilobytes on Linux but bytes on macOS;
    // the MB division below assumes bytes, so the figure is off by
    // 1024x on Linux -- confirm intended platform.
    int maxRSS = rusage.ru_maxrss;
# endif
    timer.split();
    long elapsedSec = timer.seconds();
    // Small epsilon avoids division by zero immediately after startup.
    double usage = 100. * procSec / (elapsedSec + 0.00001);
    char result[200];
    string procTime = secs_to_h(procSec);
    string elapsed = secs_to_h(elapsedSec);
    snprintf(result, sizeof(result),
             "Process: %s run, %s real, %.2f%% CPU, %0.2f MB",
             procTime.c_str(), elapsed.c_str(),
             usage, maxRSS / (1024.*1024.));
    return result;
}
00254
00259 static void normalizeLemma(Token& tok)
00260 {
00261 string lemma(*tok.lemma());
00262 TO_EACH (Replacements, *normLemma, dit)
00263 if (dit->first.modify(lemma, dit->second))
00264 break;
00265 tok.lemma(lemma);
00266 }
00267
00268 void Parser::preprocess(Sentence* sentence)
00269 {
00270 TO_EACH (Sentence, *sentence, sit) {
00271 Token& tok = *(*sit)->token;
00272 normalizeLemma(tok);
00273 if (tok.links.empty())
00274 tok.links.resize(1);
00275 }
00276 }
00277
00278 deque<Sentence*> Parser::collectSentences(Enumerator<Sentence*>* sentenceReader)
00279 {
00280
00281 int formCutoff = Parser::lexCutoff;
00282 int lemmaCutoff = Parser::lexCutoff;
00283 WordCounts formCounts;
00284 WordCounts lemmaCounts;
00285 deque<Sentence*> sentences;
00286
00287 while (sentenceReader->MoveNext()) {
00288 Sentence* sentence = sentenceReader->Current();
00289 if (sentence->size()) {
00290 preprocess(sentence);
00291 FOR_EACH (Sentence, *sentence, sit) {
00292 TreeToken* tok = *sit;
00293 formCounts[tok->get("FORM")->c_str()]++;
00294 lemmaCounts[tok->get("LEMMA")->c_str()]++;
00295 }
00296 sentences.push_back(sentence);
00297 }
00298 }
00299
00300
00301 FOR_EACH (deque<Sentence*>, sentences, sits) {
00302 FOR_EACH (Sentence, **sits, sit) {
00303 TreeToken* tok = *sit;
00304 string const* form = tok->get("FORM");
00305 if (formCounts[form->c_str()] < formCutoff)
00306 tok->set("FORM", "#UNKNOWN");
00307 string const* lemma = tok->get("LEMMA");
00308 if (lemmaCounts[lemma->c_str()] < lemmaCutoff)
00309 tok->set("LEMMA", "#UNKNOWN");
00310 }
00311 }
00312
00313 formCounts.clear(); formCounts = WordCounts();
00314 lemmaCounts.clear(); lemmaCounts = WordCounts();
00315 return sentences;
00316 }
00317
00318
00319
// Accumulate time/location lemma statistics from a sentence.
// For postpositional languages the noun governing each adposition is
// classified; otherwise each dependent noun is classified directly and
// again through its governing preposition.
void GlobalInfo::extract(Sentence const& sentence)
{
    Language const* lang = sentence.language;

    FOR_EACH (Sentence, sentence, sit) {
        TreeToken* node = *sit;
        Token& tok = *node->token;
        if (lang->hasPostpositions) {
            int head = node->linkHead();
            // head is 1-based; 0 means attached to the root.
            if (head && tok.isPreposition(lang)) {
                // Inspect the noun governing this adposition.
                Token& parent = *sentence[head - 1]->token;
                string const* noun = parent.lemma();
                if (noun && !noun->empty()) {
                    if (parent.isNoun(lang)) {
                        if (parent.isTime(lang)) {
                            // Count as a time expression.
                            timeLemmas.add(*noun);
                        } else if (parent.isLocation(lang)) {
                            // Count as a location expression.
                            locLemmas.add(*noun);
                        }
                    }
                }
            }
        } else {
            int head = node->linkHead();
            if (head == 0)
                continue;              // skip root-attached tokens
            if (tok.isNoun(lang)) {
                string const* noun = tok.lemma();
                if (noun && !noun->empty()) {
                    if (tok.isTime(lang)) {
                        timeLemmas.add(*noun);
                    } else if (tok.isLocation(lang)) {
                        locLemmas.add(*noun);
                    }
                    // Also classify through the governing preposition;
                    // a noun may thus be counted twice here.
                    Token* par = sentence[head - 1]->token;
                    if (par->isPreposition(lang)) {
                        if (par->isTime(lang)) {
                            timeLemmas.add(*noun);
                        } else if (par->isLocation(lang)) {
                            locLemmas.add(*noun);
                        }
                    }
                }
            }
        }
    }
}
00376
// Disambiguate lemmas present in both tables: when one count dominates
// the other by freqRatio, drop the minority entry.  (Despite the name,
// rare-but-unambiguous entries are kept.)
// NOTE(review): assumes WordCounts::count(key) returns the stored
// frequency, not std::map::count's 0/1 membership -- confirm in the
// WordCounts declaration.
void GlobalInfo::clearRareEntities()
{
    for (WordCounts::iterator pit = timeLemmas.begin();
         pit != timeLemmas.end(); ) {
        WordCounts::iterator cur = pit++;   // advance before a possible erase
        int tc = cur->second;
        int lc = locLemmas.count(cur->first);
        if (tc >= freqRatio * lc)
            locLemmas.erase(cur->first);    // predominantly a time word
        else if (lc >= freqRatio * tc)
            timeLemmas.erase(cur);          // predominantly a location word
    }
}
00390
00391 void GlobalInfo::clear()
00392 {
00393 timeLemmas.clear();
00394 timeLemmas = WordCounts();
00395 locLemmas.clear();
00396 locLemmas = WordCounts();
00397 }
00398
00399 void GlobalInfo::save(ofstream& ofs)
00400 {
00401
00402 ofs << timeLemmas.size() << endl;
00403 FOR_EACH (WordCounts, timeLemmas, pit)
00404 ofs << pit->first << endl;
00405
00406 ofs << locLemmas.size() << endl;
00407 FOR_EACH (WordCounts, locLemmas, pit)
00408 ofs << pit->first << endl;
00409 }
00410
00411 void GlobalInfo::load(ifstream& ifs)
00412 {
00413 char line[MAX_LINE_LEN];
00414
00415 if (ifs.getline(line, MAX_LINE_LEN)) {
00416 int n = atoi(line);
00417 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00418 timeLemmas.add(line);
00419
00420 if (ifs.getline(line, MAX_LINE_LEN)) {
00421 n = atoi(line);
00422 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00423 locLemmas.add(line);
00424 }
00425 }
00426 }
00427
00428
00429
// Wrap a token-vector enumerator; the language is resolved from the
// "Language" configuration variable.
ParserPipe::ParserPipe(Parser& parser, Enumerator<std::vector<Token*>*>& tve) :
    parser(parser),
    tve(tve),
    language(Language::get(lang->c_str()))
{
}
00436
// Release the reference held on the shared parser, then self-destruct.
// NOTE(review): assumes the pipe was heap-allocated, as Parser::pipe does.
void ParserPipe::Dispose()
{
    parser.decRef();
    delete this;
}
00442
// Advance the underlying token-vector enumerator.
bool ParserPipe::MoveNext()
{
    return tve.MoveNext();
}
00447
00448 Sentence* ParserPipe::Current()
00449 {
00450 vector<Token*>* sent = tve.Current();
00451 Sentence* sentence = new Sentence(language);
00452 int id = 1;
00453 FOR_EACH (vector<Token*>, *sent, vit) {
00454 Token* tok = *vit;
00455 TreeToken* token = new TreeToken(id++, tok->form, tok->attributes, tok->links);
00456 sentence->push_back(token);
00457 delete tok;
00458 }
00459 delete sent;
00460 return parser.parse(sentence);
00461 }
00462
00463
00464
// Wrap a sentence enumerator so each sentence is parsed on demand.
ParserSentPipe::ParserSentPipe(Parser& parser, Enumerator<Sentence*>& tve) :
    parser(parser),
    tve(tve)
{ }
00469
// Release the reference held on the shared parser, then self-destruct.
// NOTE(review): assumes the pipe was heap-allocated, as Parser::pipe does.
void ParserSentPipe::Dispose()
{
    parser.decRef();
    delete this;
}
00475
// Advance the underlying sentence enumerator.
bool ParserSentPipe::MoveNext()
{
    return tve.MoveNext();
}
00480
00481 Sentence* ParserSentPipe::Current()
00482 {
00483 Sentence* sentence = tve.Current();
00484 return parser.parse(sentence);
00485 }
00486
00487 }