00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
#include "Parser.h"
#include "ReviseEventStream.h"
#include "Corpus.h"
#include "version.h"
#include "conf_Replacements.h"


#include "ixe/include/Timer.h"
#include "io/Format.h"


#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#ifdef _WIN32
# include <io.h>
# include <fcntl.h>
#else
# include <sys/resource.h>
#endif
00042
00043 using namespace std;
00044 using namespace IXE;
00045 using IXE::io::Format;
00046
00047 #define MAX_LINE_LEN 8196
00048
00049 namespace Parser {
00050
// Run-time configuration variables; they are serialized into the model
// header by writeHeader() and reloaded by readHeader().
conf<string> algorithm("Algorithm", "SVM");
conf<string> lang("Language", "en");
conf<string> fileVersion("Version", version);
conf<int> beam("Beam", 1, 1);

// Lemma-normalization replacement rules, applied by normalizeLemma()
// during preprocess().
conf<Replacements> normLemma("LemmaReplace");

// Cutoff on feature frequency; presumably consumed by the learners --
// usage not visible in this file, TODO confirm.
conf<int> Parser::featureCutoff("FeatureCutoff", 0);

// Forms/lemmas occurring fewer than this many times are replaced with
// "#UNKNOWN" in collectSentences().
conf<int> Parser::lexCutoff("LexCutoff", 0);

conf<bool> Parser::verbose("Verbose", false);

// A lemma counted as both time and location is kept only on the side
// whose count dominates by this ratio (see GlobalInfo::clearRareEntities).
float const GlobalInfo::freqRatio = 1.5;

// Debugging flag; set elsewhere -- usage not visible here, TODO confirm.
bool showTreelets = false;
00069
// Accessor for the global registry of parser factories, keyed by
// algorithm name.  Function-local static avoids the static
// initialization order fiasco for registrants.
// NOTE(review): the key type is char const*, whose default ordering
// compares pointer values, not string contents -- lookups through
// map::find only match the exact pointer used at registration.
std::map<char const*, ParserFactory*>& ParserMap::get()
{
    static std::map<char const*, ParserFactory*> parserFor;
    return parserFor;
}
00082
00083 ParserFactory* ParserMap::get(char const* type)
00084 {
00085 std::map<char const*, ParserFactory*>& parserMap = get();
00086 map<char const*, ParserFactory*>::const_iterator rit = parserMap.find(type);
00087 return (rit == parserMap.end()) ? 0 : rit->second;
00088 }
00089
00090 Parser::~Parser() { }
00091
00092 Parser* Parser::create(char const* modelFile)
00093 {
00094 if (modelFile) {
00095
00096 ifstream ifs(modelFile);
00097 if (!ifs) {
00098 cerr << "Missing model file: " << modelFile << endl;
00099 return 0;
00100 }
00101 ::Parser::Parser::readHeader(ifs);
00102 ifs.close();
00103 }
00104
00105 ParserFactory* factory = ParserMap::get(algorithm->c_str());
00106 if (!factory) {
00107 cerr << "No such algorithm: " << *algorithm << endl;
00108 return 0;
00109 }
00110 ::Parser::Parser* parser = factory(modelFile);
00111 if (parser == 0) {
00112 cerr << "Could not load: " << modelFile << endl;
00113 return 0;
00114 }
00115 return parser;
00116 }
00117
00118 Enumerator<Sentence*>* Parser::pipe(Enumerator<std::vector<Token*>*>& tve)
00119 {
00120 return new ParserPipe(*this, tve);
00121 }
00122
00123 Enumerator<Sentence*>* Parser::pipe(Enumerator<Sentence*>& tve)
00124 {
00125 return new ParserSentPipe(*this, tve);
00126 }
00127
00128 void Parser::parse(SentenceReader* reader, ostream& os)
00129 {
00130 int las = 0;
00131 int uas = 0;
00132 int tot = 0;
00133 int sent = 0;
00134 # ifdef _WIN32
00135 if (os == cout)
00136 _setmode(_fileno(stdout), _O_BINARY);
00137 # endif
00138
00139 while (reader->MoveNext()) {
00140 Sentence* sin = reader->Current();
00141 Sentence* sout = parse(sin);
00142 reader->corpus->print(os, *sout);
00143 os << endl;
00144 ++sent;
00145
00146 Sentence::const_iterator soutIt = sout->begin();
00147 FOR_EACH (Sentence, *sin, sinIt) {
00148 TreeToken* tin = *sinIt;
00149 TreeToken* tout = *soutIt++;
00150 tot++;
00151 if (tin->linkHead() == tout->linkHead()) {
00152 uas++;
00153 if (tin->linkLabel() == tout->linkLabel())
00154 las++;
00155 }
00156 }
00157 delete sin;
00158 delete sout;
00159 }
00160 showEval(tot, las, uas, sent);
00161 }
00162
00163 void Parser::writeHeader(ostream& os)
00164 {
00165 os << "<desr alg='" << *algorithm << "' version='" << version << "'>" << endl;
00166 Configuration::Map& vars = Configuration::variables();
00167 FOR_EACH (Configuration::Map, vars, vit)
00168 vit->second->serialize(os);
00169 os << "</desr>" << endl;
00170 }
00171
// Reset the global configuration and reload it from a model header
// stream (the counterpart of writeHeader).
// Always returns true; presumably Configuration::load reports errors
// itself -- TODO confirm.
bool Parser::readHeader(istream& is)
{
    Configuration::reset();
    Configuration::load(is);
    return true;
}
00178
00179 void Parser::showEval(int tokenCount, int las, int uas, int sentCount)
00180 {
00181 cerr << Format("LAS: %.2f %% (%d/%d)", (100. * las)/tokenCount, las, tokenCount) << endl;
00182 cerr << Format("UAS: %.2f %% (%d/%d)", (100. * uas)/tokenCount, uas, tokenCount) << endl;
00183 cerr << "Sentences: " << sentCount << endl;
00184 # ifdef ORACLE
00185 cerr << Format("Oracle: %.2f %% (%d/%d)", (100. * oracleCorrect)/oracleCount, oracleCorrect, oracleCount) << endl;
00186 # endif
00187 }
00188
00189 static Timer timer;
00190
// Render a duration in seconds as a compact human-readable string:
// calendar units first ("1week 2day"), then a clock part "h:mm:ss"
// omitting leading zero fields.  Returns "0 s" for zero (or, defensively,
// negative) input.
//
// Fixes over the previous version:
//  - clock fields after the first are zero-padded and no longer skipped
//    when zero, so 3661 -> "1:01:01" and 60 -> "1:00" instead of the
//    ambiguous "1:1" / "1" (61s and 3601s used to render identically);
//  - negative input no longer produces garbage.
std::string secs_to_h(int secs)
{
    if (secs <= 0)
        return "0 s";

    std::string s;
    char buf[32];

    // Calendar part: weeks and days, each skipped when zero.
    static const struct { char const* name; int size; } calendar[] = {
        { "week", 7 * 24 * 3600 },
        { "day",  24 * 3600 }
    };
    for (int i = 0; i < 2; i++) {
        int quot = secs / calendar[i].size;
        if (quot) {
            secs -= quot * calendar[i].size;
            snprintf(buf, sizeof(buf), "%d%s", quot, calendar[i].name);
            s += buf;
            if (secs)
                s += ' ';
        }
    }

    // Clock part: h:mm:ss; once a higher field is printed, lower ones
    // are always printed, zero-padded to two digits.
    int hours = secs / 3600;
    int minutes = secs % 3600 / 60;
    int seconds = secs % 60;
    bool started = false;
    if (hours) {
        snprintf(buf, sizeof(buf), "%d", hours);
        s += buf;
        started = true;
    }
    if (started || minutes) {
        if (started)
            s += ':';
        snprintf(buf, sizeof(buf), started ? "%02d" : "%d", minutes);
        s += buf;
        started = true;
    }
    if (started || seconds) {
        if (started)
            s += ':';
        snprintf(buf, sizeof(buf), started ? "%02d" : "%d", seconds);
        s += buf;
    }
    return s;
}
00219
00220
// Return a one-line summary of process resource usage: CPU time,
// wall-clock time since program start, CPU utilisation percentage and
// peak resident memory.
string Parser::procStat()
{
# ifdef _WIN32
    FILETIME starttime;
    FILETIME exittime;
    FILETIME kerneltime;
    FILETIME usertime;
    GetProcessTimes(GetCurrentProcess(),
                    &starttime, &exittime, &kerneltime, &usertime);
    // NOTE(review): type-punning FILETIME through long long* violates
    // strict aliasing; MSDN documents ULARGE_INTEGER (or memcpy) as the
    // safe conversion -- works in practice on MSVC, but worth fixing.
    long long ktime = *(long long*)&kerneltime;
    long long utime = *(long long*)&usertime;
    // FILETIME ticks are 100 ns -> divide by 1e7 for seconds.
    int procSec = (ktime + utime) / 10000000L;
    SIZE_T minRSS, maxRSS;
    GetProcessWorkingSetSize(GetCurrentProcess(), &minRSS, &maxRSS);
# else
    struct rusage rusage;
    getrusage(RUSAGE_SELF, &rusage);
    int procSec = rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec;
    // NOTE(review): ru_maxrss is kilobytes on Linux but bytes on macOS;
    // the MB division below assumes bytes, so the figure is off by
    // 1024x on Linux -- confirm intended platform.
    int maxRSS = rusage.ru_maxrss;
# endif
    timer.split();
    long elapsedSec = timer.seconds();
    // Small epsilon avoids division by zero immediately after startup.
    double usage = 100. * procSec / (elapsedSec + 0.00001);
    char result[200];
    string procTime = secs_to_h(procSec);
    string elapsed = secs_to_h(elapsedSec);
    snprintf(result, sizeof(result),
             "Process: %s run, %s real, %.2f%% CPU, %0.2f MB",
             procTime.c_str(), elapsed.c_str(),
             usage, maxRSS / (1024.*1024.));
    return result;
}
00254
00259 static void normalizeLemma(Token& tok)
00260 {
00261 string lemma(*tok.lemma());
00262 TO_EACH (Replacements, *normLemma, dit)
00263 if (dit->first.modify(lemma, dit->second))
00264 break;
00265 tok.lemma(lemma);
00266 }
00267
00268 void Parser::preprocess(Sentence* sentence)
00269 {
00270 TO_EACH (Sentence, *sentence, sit) {
00271 Token& tok = *(*sit)->token;
00272 normalizeLemma(tok);
00273 if (tok.links.empty())
00274 tok.links.resize(1);
00275 }
00276 }
00277
00278 deque<Sentence*> Parser::collectSentences(Enumerator<Sentence*>* sentenceReader)
00279 {
00280
00281 int formCutoff = Parser::lexCutoff;
00282 int lemmaCutoff = Parser::lexCutoff;
00283 WordCounts formCounts;
00284 WordCounts lemmaCounts;
00285 deque<Sentence*> sentences;
00286
00287 while (sentenceReader->MoveNext()) {
00288 Sentence* sentence = sentenceReader->Current();
00289 if (sentence->size()) {
00290 preprocess(sentence);
00291 FOR_EACH (Sentence, *sentence, sit) {
00292 TreeToken* tok = *sit;
00293 formCounts[tok->get("FORM")->c_str()]++;
00294 lemmaCounts[tok->get("LEMMA")->c_str()]++;
00295 }
00296 sentences.push_back(sentence);
00297 }
00298 }
00299
00300
00301 FOR_EACH (deque<Sentence*>, sentences, sits) {
00302 FOR_EACH (Sentence, **sits, sit) {
00303 TreeToken* tok = *sit;
00304 string const* form = tok->get("FORM");
00305 if (formCounts[form->c_str()] < formCutoff)
00306 tok->set("FORM", "#UNKNOWN");
00307 string const* lemma = tok->get("LEMMA");
00308 if (lemmaCounts[lemma->c_str()] < lemmaCutoff)
00309 tok->set("LEMMA", "#UNKNOWN");
00310 }
00311 }
00312
00313 formCounts.clear(); formCounts = WordCounts();
00314 lemmaCounts.clear(); lemmaCounts = WordCounts();
00315 return sentences;
00316 }
00317
00318
00319
// Accumulate time/location lemma statistics from a sentence.
// For postpositional languages the noun governing each adposition is
// classified; otherwise each dependent noun is classified directly and
// again through its governing preposition.
void GlobalInfo::extract(Sentence const& sentence)
{
    Language const* lang = sentence.language;

    FOR_EACH (Sentence, sentence, sit) {
        TreeToken* node = *sit;
        Token& tok = *node->token;
        if (lang->hasPostpositions) {
            int head = node->linkHead();
            // head is 1-based; 0 means attached to the root.
            if (head && tok.isPreposition(lang)) {
                // Inspect the noun governing this adposition.
                Token& parent = *sentence[head - 1]->token;
                string const* noun = parent.lemma();
                if (noun && !noun->empty()) {
                    if (parent.isNoun(lang)) {
                        if (parent.isTime(lang)) {
                            // Count as a time expression.
                            timeLemmas.add(*noun);
                        } else if (parent.isLocation(lang)) {
                            // Count as a location expression.
                            locLemmas.add(*noun);
                        }
                    }
                }
            }
        } else {
            int head = node->linkHead();
            if (head == 0)
                continue;              // skip root-attached tokens
            if (tok.isNoun(lang)) {
                string const* noun = tok.lemma();
                if (noun && !noun->empty()) {
                    if (tok.isTime(lang)) {
                        timeLemmas.add(*noun);
                    } else if (tok.isLocation(lang)) {
                        locLemmas.add(*noun);
                    }
                    // Also classify through the governing preposition;
                    // a noun may thus be counted twice here.
                    Token* par = sentence[head - 1]->token;
                    if (par->isPreposition(lang)) {
                        if (par->isTime(lang)) {
                            timeLemmas.add(*noun);
                        } else if (par->isLocation(lang)) {
                            locLemmas.add(*noun);
                        }
                    }
                }
            }
        }
    }
}
00376
// Disambiguate lemmas present in both tables: when one count dominates
// the other by freqRatio, drop the minority entry.  (Despite the name,
// rare-but-unambiguous entries are kept.)
// NOTE(review): assumes WordCounts::count(key) returns the stored
// frequency, not std::map::count's 0/1 membership -- confirm in the
// WordCounts declaration.
void GlobalInfo::clearRareEntities()
{
    for (WordCounts::iterator pit = timeLemmas.begin();
         pit != timeLemmas.end(); ) {
        WordCounts::iterator cur = pit++;   // advance before a possible erase
        int tc = cur->second;
        int lc = locLemmas.count(cur->first);
        if (tc >= freqRatio * lc)
            locLemmas.erase(cur->first);    // predominantly a time word
        else if (lc >= freqRatio * tc)
            timeLemmas.erase(cur);          // predominantly a location word
    }
}
00390
00391 void GlobalInfo::clear()
00392 {
00393 timeLemmas.clear();
00394 timeLemmas = WordCounts();
00395 locLemmas.clear();
00396 locLemmas = WordCounts();
00397 }
00398
00399 void GlobalInfo::save(ofstream& ofs)
00400 {
00401
00402 ofs << timeLemmas.size() << endl;
00403 FOR_EACH (WordCounts, timeLemmas, pit)
00404 ofs << pit->first << endl;
00405
00406 ofs << locLemmas.size() << endl;
00407 FOR_EACH (WordCounts, locLemmas, pit)
00408 ofs << pit->first << endl;
00409 }
00410
00411 void GlobalInfo::load(ifstream& ifs)
00412 {
00413 char line[MAX_LINE_LEN];
00414
00415 if (ifs.getline(line, MAX_LINE_LEN)) {
00416 int n = atoi(line);
00417 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00418 timeLemmas.add(line);
00419
00420 if (ifs.getline(line, MAX_LINE_LEN)) {
00421 n = atoi(line);
00422 while (n-- && ifs.getline(line, MAX_LINE_LEN))
00423 locLemmas.add(line);
00424 }
00425 }
00426 }
00427
00428
00429
// Wrap a token-vector enumerator; the language is resolved from the
// "Language" configuration variable.
ParserPipe::ParserPipe(Parser& parser, Enumerator<std::vector<Token*>*>& tve) :
    parser(parser),
    tve(tve),
    language(Language::get(lang->c_str()))
{
}
00436
// Release the reference held on the shared parser, then self-destruct.
// NOTE(review): assumes the pipe was heap-allocated, as Parser::pipe does.
void ParserPipe::Dispose()
{
    parser.decRef();
    delete this;
}
00442
// Advance the underlying token-vector enumerator.
bool ParserPipe::MoveNext()
{
    return tve.MoveNext();
}
00447
00448 Sentence* ParserPipe::Current()
00449 {
00450 vector<Token*>* sent = tve.Current();
00451 Sentence* sentence = new Sentence(language);
00452 int id = 1;
00453 FOR_EACH (vector<Token*>, *sent, vit) {
00454 Token* tok = *vit;
00455 TreeToken* token = new TreeToken(id++, tok->form, tok->attributes, tok->links);
00456 sentence->push_back(token);
00457 delete tok;
00458 }
00459 delete sent;
00460 return parser.parse(sentence);
00461 }
00462
00463
00464
// Wrap a sentence enumerator so each sentence is parsed on demand.
ParserSentPipe::ParserSentPipe(Parser& parser, Enumerator<Sentence*>& tve) :
    parser(parser),
    tve(tve)
{ }
00469
// Release the reference held on the shared parser, then self-destruct.
// NOTE(review): assumes the pipe was heap-allocated, as Parser::pipe does.
void ParserSentPipe::Dispose()
{
    parser.decRef();
    delete this;
}
00475
// Advance the underlying sentence enumerator.
bool ParserSentPipe::MoveNext()
{
    return tve.MoveNext();
}
00480
00481 Sentence* ParserSentPipe::Current()
00482 {
00483 Sentence* sentence = tve.Current();
00484 return parser.parse(sentence);
00485 }
00486
00487 }