tanl: tanl: parse/desr/src/Token.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/Token.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005 Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program. If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "platform.h"
00025 
00026 #include "Sentence.h"
00027 #include "Corpus.h"
00028 
00029 // standard
00030 #include <iostream>
00031 #include <iomanip>
00032 
00033 // library
00034 #include "text/strings.h"       // strStartsWith
00035 
00036 using namespace std;
00037 using namespace Parser;
00038 using namespace Tanl::Text;
00039 
00040 namespace Tanl {
00041 
00042 ostream& operator <<(ostream& os, Token const& tok)
00043 {
00044   os << "<token";
00045   FOR_EACH (Attributes, tok.attributes, it)
00046     os << " " << (*it).first << "=\"" << (*it).second << '\"';
00047   return os << " />";
00048 }
00049 
00050 void Token::print(ostream& os, int indent) const
00051 {
00052   os << setw(indent) << "<token";
00053   for (Attributes::const_iterator fit = attributes.begin();
00054        fit != attributes.end(); ++fit)
00055     os << ' ' << (*fit).first << "=\"" << (*fit).second << "\"";
00056   os << " />" << endl;
00057 }
00058 
00059 // ======================================================================
00060 // Tag
00061 
00062 static Text::RegExp::Pattern reTag("<(\\w+)\\b[^>]*>");
00063 static Text::RegExp::Pattern reAttribute("(\\w+)\\s*=\\s*\\\"([^\\\"]*)\\\"");
00064 
00065 Tag::Tag(const string& xmlTag)
00066 {
00067   char const* start = xmlTag.c_str();
00068   Text::RegExp::MatchGroups match(3);
00069   if (reTag.match(start, match)) {
00070     name = string(start + match[1].first, match[1].second - match[1].first);
00071     start += match[1].second;
00072     while (reAttribute.match(start, match)) {
00073       string key(start + match[1].first, match[1].second - match[1].first);
00074       string value(start + match[2].first, match[2].second - match[2].first);
00075       attributes[key] = value;
00076       start += match[0].second;
00077     }
00078   }
00079 }
00080 
00081 //======================================================================
00082 // Token
00083 
// Names of the token attributes looked up by the accessors in this file.
// Defining CONLL08 switches to the alternative set of names below.
//#define CONLL08                       // FIXME: for the time being

// The dependency-relation attribute has the same name in both configurations.
static char const* DeprelTag = "DEPREL";

#ifdef CONLL08
static char const* FormTag = "SPLIT_FORM";
static char const* LemmaTag = "SPLIT_LEMMA";
static char const* PosTag = "PPOSS";
static char const* CPosTag = "";
#else
static char const* FormTag = "FORM";
static char const* LemmaTag = "LEMMA";
static char const* PosTag = "POSTAG";
static char const* CPosTag = "CPOSTAG";
#endif
00098 
00099 Token::Token(string& form, Corpus& corpus, Context* context) :
00100   form(form),
00101   attributes(&corpus.index),
00102   links(1, TokenLink(-1)),
00103   context(context)
00104 {
00105   if (context)
00106     context->incRef();
00107 }
00108 
00109 bool Token::isNoun(Language const* lang)
00110 {
00111   string const* cpos = get(CPosTag);
00112   if (cpos && *cpos == lang->nounCPos)
00113     return true;
00114   string const* pos = get(PosTag);
00115   return pos && strStartsWith(pos->c_str(), lang->nounCPos);
00116 }
00117 
00118 bool Token::isPreposition(Language const* lang)
00119 {
00120   string const* cpos = get(CPosTag);
00121   if (cpos && *cpos == lang->prepCPos)
00122     return true;
00123   string const* pos = get(PosTag);
00124   return pos && strStartsWith(pos->c_str(), lang->prepCPos);
00125 }
00126 
00127 bool Token::isTime(Language const* lang)
00128 {
00129   // from gold deprel
00130   string const* label = get(DeprelTag);
00131   return label && strStartsWith(label->c_str(), lang->timeDep);
00132 }
00133 
00134 bool Token::isLocation(Language const* lang)
00135 {
00136   // from gold deprel
00137   string const* label = get(DeprelTag);
00138   return label && strStartsWith(label->c_str(),lang->locDep);
00139 }
00140 
00141 bool Token::isVerb(Language const* lang)
00142 {
00143   string const* cpos = get(CPosTag);
00144   if (cpos && *cpos == lang->verbCPos)
00145     return true;
00146   string const* pos = get(PosTag);
00147   return pos && strStartsWith(pos->c_str(), lang->verbCPos);
00148 }
00149 
00150 string const* Token::lemma() const
00151 {
00152   return get(LemmaTag);
00153 }
00154 
00155 void Token::lemma(string const& l)
00156 {
00157   return set(LemmaTag, l);
00158 }
00159 
00160 string const* Token::pos() const
00161 {
00162   return get(PosTag);
00163 }
00164 
00165 void Token::pos(string const& p)
00166 {
00167   return set(PosTag, p);
00168 }
00169 
00170 //======================================================================
00171 // TreeToken
00172 
00173 TreeToken* TreeToken::ancestorCPos(char const* pos, Sentence& tokens)
00174 {
00175   for (int nid = linkHead(); nid != 0; ) {
00176     TreeToken* tokn = (TreeToken*)tokens[nid-1];
00177     string const* attr = tokn->get(CPosTag);
00178     if (attr && *attr == pos)
00179       return tokn;
00180     nid = tokn->linkHead();
00181   }
00182   return 0;
00183 }
00184 
00185 TreeToken* TreeToken::follow(Parser::TokenPath const& tp, Sentence& sentence)
00186 {
00187   TreeToken* tok = this;
00188   FOR_EACH(vector<TokenPath::Direction>, tp.path, pit) {
00189     switch (*pit) {
00190     case TokenPath::leftChild:
00191       // move to the furthest left child (as in Nivre), if present
00192       tok = tok->left.empty() ? 0 : tok->left.front();
00193       break;
00194 
00195     case TokenPath::rightChild:
00196       // move to the furthest right child (as in Nivre), if present
00197       tok = tok->right.empty() ? 0 : tok->right.front();
00198       break;
00199 
00200     case TokenPath::parent:
00201       if (tok->linkHead() == 0) {
00202         tok = 0;
00203         break;
00204       }
00205       tok = sentence[tok->linkHead() - 1];
00206       break;
00207 
00208     case TokenPath::leftSibling: {
00209       if (tok->linkHead() == 0) {
00210         tok = 0;
00211         break;
00212       }
00213       TreeToken* parent = sentence[tok->linkHead() - 1];
00214       TreeToken* sibl = 0;
00215       // find token following tok in left children
00216       FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00217         if (*cit == tok) {
00218           if (++cit != parent->left.end()) {
00219             sibl = *cit;
00220             break;
00221           }
00222         }
00223       }
00224       if (sibl) {
00225         tok = sibl;
00226         break;
00227       }
00228       // find token preceding tok in right children
00229       FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00230         if (*cit == tok) {
00231           tok = 0;
00232           break;
00233         }
00234         sibl = *cit;
00235       }
00236       tok = (tok == 0) ? sibl : 0;
00237       break;
00238     }
00239 
00240     case TokenPath::rightSibling: {
00241       if (tok->linkHead() == 0) {
00242         tok = 0;
00243         break;
00244       }
00245       TreeToken* parent = sentence[tok->linkHead() - 1];
00246       TreeToken* sibl = 0;
00247       // find token following tok in right children
00248       FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00249         if (*cit == tok) {
00250           if (++cit != parent->left.end()) {
00251             sibl = *cit;
00252             break;
00253           }
00254         }
00255       }
00256       if (sibl) {
00257         tok = sibl;
00258         break;
00259       }
00260       // find token preceding tok in left children
00261       FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00262         if (*cit == tok) {
00263           tok = 0;
00264           break;
00265         }
00266         sibl = *cit;
00267       }
00268       tok = (tok == 0) ? sibl : 0;
00269       break;
00270     }
00271 
00272     case TokenPath::previous:
00273       tok =  (tok->id > 1) ? sentence[tok->id - 2] : 0;
00274       break;
00275 
00276     case TokenPath::next:
00277       tok = (tok->id < sentence.size()) ? sentence[tok->id] : 0;
00278       break;
00279     }
00280 
00281     if (!tok) break;
00282   }
00283   return tok;
00284 }
00285 
00286 void TreeToken::print(ostream& os, int indent) const
00287 {
00288   Attributes& attributes = token->attributes;
00289   os << setw(indent) << ' ' << "<tok id=\"" << id
00290      << "\" form=\"" << attributes.get("FORM")
00291      << "\" lemma=\"" << attributes.get("LEMMA")
00292      << "\" cpos=\"" << attributes.get("CPOS")
00293      << "\" pos=\"" << attributes.get("POS")
00294      << "\" morph=\"" << attributes.get("FEATS")
00295      << "\" head=\"" << linkHead() << "\" deprel=\"" << linkLabel() << "\">" << endl;
00296   indent += 2;
00297   FOR_EACH (vector<TreeToken*>, left, it)
00298     (*it)->print(os, indent);
00299   FOR_EACH (vector<TreeToken*>, right, it)
00300     (*it)->print(os, indent);
00301 }
00302 
00303 void TreeToken::printTab(ostream& os)
00304 {
00305   Attributes& attributes = token->attributes;
00306   bool first = true;
00307   FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00308     string const* tag = attributes.get(*it);
00309     if (first)
00310       first = false;
00311     else
00312       os << "\t";
00313     os << ((!tag || tag->empty()) ? "_" : *tag);
00314   }
00315 }
00316 
00317 void TreeToken::printTab(ostream& os, Corpus const& corpus)
00318 {
00319   Attributes& attributes = token->attributes;
00320   TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00321   bool first = true;
00322   FOR_EACH (TokenFields, corpus.tokenFields, it) {
00323     if (first)
00324       first = false;
00325     else
00326       os << "\t";
00327     switch (tfit->role) {
00328     case TokenField::head:
00329       os << linkHead(); break;
00330     case TokenField::deprel:
00331       os << (linkLabel().empty() ? tfit->default_ : linkLabel()); break;
00332     default:
00333       if (tfit->use == TokenField::ignore)
00334         os << tfit->default_;
00335       else {
00336         string const* tag = attributes.get(it->name);
00337         os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00338     }
00339     }
00340     ++tfit;
00341   }
00342 }
00343 
00344 void TreeToken::printConll08(ostream& os, Corpus const& corpus, vector<int>* preds)
00345 {
00346   Attributes& attributes = token->attributes;
00347   TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00348   TokenLinks::const_iterator tlit = token->links.begin();
00349   bool first = true;
00350   FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00351     string const* tag = attributes.get(*it);
00352     if (first)
00353       first = false;
00354     else
00355       os << "\t";
00356     if (!tfit->link.empty())
00357       os << tlit->head;
00358     else if (!tfit->label.empty()) {
00359       if (tlit != token->links.end()) {
00360         os << (tlit->label.empty() ? tfit->default_ : tlit->label);
00361         ++tlit;
00362       }
00363     } else {
00364       os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00365     }
00366     ++tfit;
00367   }
00368   // print links
00369 # ifdef COMPACT
00370   preds = 0;
00371 # endif
00372   if (preds) {
00373     unsigned predNo = 0;
00374     for (; tlit != token->links.end(); ++tlit) {
00375       int head = tlit->head;
00376       while (head != (*preds)[predNo++])
00377         os << "\t_";
00378       os << '\t' << tlit->label;
00379     }
00380     while (predNo++ < preds->size())
00381       os << "\t_";
00382   } else {
00383     for (; tlit != token->links.end(); ++tlit)
00384       os << '\t' << tlit->label << '[' << tlit->head << ']';
00385   }
00386 }
00387 
00388 void TreeToken::printLeaves(ostream& os)
00389 {
00390   for (vector<TreeToken*>::reverse_iterator it = left.rbegin(); it != left.rend(); ++it)
00391     (*it)->printLeaves(cout);
00392   cout << token->form << ' ';
00393   FOR_EACH (vector<TreeToken*>, right, it)
00394     (*it)->printLeaves(cout);
00395 }
00396 
00397 } // namespace Tanl