00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "platform.h"
00025
00026 #include "Sentence.h"
00027 #include "Corpus.h"
00028
00029
00030 #include <iostream>
00031 #include <iomanip>
00032
00033
00034 #include "text/strings.h"
00035
00036 using namespace std;
00037 using namespace Parser;
00038 using namespace Tanl::Text;
00039
00040 namespace Tanl {
00041
00042 ostream& operator <<(ostream& os, Token const& tok)
00043 {
00044 os << "<token";
00045 FOR_EACH (Attributes, tok.attributes, it)
00046 os << " " << (*it).first << "=\"" << (*it).second << '\"';
00047 return os << " />";
00048 }
00049
00050 void Token::print(ostream& os, int indent) const
00051 {
00052 os << setw(indent) << "<token";
00053 for (Attributes::const_iterator fit = attributes.begin();
00054 fit != attributes.end(); ++fit)
00055 os << ' ' << (*fit).first << "=\"" << (*fit).second << "\"";
00056 os << " />" << endl;
00057 }
00058
00059
00060
00061
00062 static Text::RegExp::Pattern reTag("<(\\w+)\\b[^>]*>");
00063 static Text::RegExp::Pattern reAttribute("(\\w+)\\s*=\\s*\\\"([^\\\"]*)\\\"");
00064
00065 Tag::Tag(const string& xmlTag)
00066 {
00067 char const* start = xmlTag.c_str();
00068 Text::RegExp::MatchGroups match(3);
00069 if (reTag.match(start, match)) {
00070 name = string(start + match[1].first, match[1].second - match[1].first);
00071 start += match[1].second;
00072 while (reAttribute.match(start, match)) {
00073 string key(start + match[1].first, match[1].second - match[1].first);
00074 string value(start + match[2].first, match[2].second - match[2].first);
00075 attributes[key] = value;
00076 start += match[0].second;
00077 }
00078 }
00079 }
00080
00081
00082
00083
00084
// Names of the token attributes looked up by the accessors in this file.
// The dependency-relation name is the same in both configurations.
static const char* DeprelTag = "DEPREL";
#ifdef CONLL08
static const char* FormTag = "SPLIT_FORM";
static const char* LemmaTag = "SPLIT_LEMMA";
static const char* PosTag = "PPOSS";
static const char* CPosTag = "";
#else
static const char* FormTag = "FORM";
static const char* LemmaTag = "LEMMA";
static const char* PosTag = "POSTAG";
static const char* CPosTag = "CPOSTAG";
#endif
00098
00099 Token::Token(string& form, Corpus& corpus, Context* context) :
00100 form(form),
00101 attributes(&corpus.index),
00102 links(1, TokenLink(-1)),
00103 context(context)
00104 {
00105 if (context)
00106 context->incRef();
00107 }
00108
00109 bool Token::isNoun(Language const* lang)
00110 {
00111 string const* cpos = get(CPosTag);
00112 if (cpos && *cpos == lang->nounCPos)
00113 return true;
00114 string const* pos = get(PosTag);
00115 return pos && strStartsWith(pos->c_str(), lang->nounCPos);
00116 }
00117
00118 bool Token::isPreposition(Language const* lang)
00119 {
00120 string const* cpos = get(CPosTag);
00121 if (cpos && *cpos == lang->prepCPos)
00122 return true;
00123 string const* pos = get(PosTag);
00124 return pos && strStartsWith(pos->c_str(), lang->prepCPos);
00125 }
00126
00127 bool Token::isTime(Language const* lang)
00128 {
00129
00130 string const* label = get(DeprelTag);
00131 return label && strStartsWith(label->c_str(), lang->timeDep);
00132 }
00133
00134 bool Token::isLocation(Language const* lang)
00135 {
00136
00137 string const* label = get(DeprelTag);
00138 return label && strStartsWith(label->c_str(),lang->locDep);
00139 }
00140
00141 bool Token::isVerb(Language const* lang)
00142 {
00143 string const* cpos = get(CPosTag);
00144 if (cpos && *cpos == lang->verbCPos)
00145 return true;
00146 string const* pos = get(PosTag);
00147 return pos && strStartsWith(pos->c_str(), lang->verbCPos);
00148 }
00149
00150 string const* Token::lemma() const
00151 {
00152 return get(LemmaTag);
00153 }
00154
00155 void Token::lemma(string const& l)
00156 {
00157 return set(LemmaTag, l);
00158 }
00159
00160 string const* Token::pos() const
00161 {
00162 return get(PosTag);
00163 }
00164
00165 void Token::pos(string const& p)
00166 {
00167 return set(PosTag, p);
00168 }
00169
00170
00171
00172
00173 TreeToken* TreeToken::ancestorCPos(char const* pos, Sentence& tokens)
00174 {
00175 for (int nid = linkHead(); nid != 0; ) {
00176 TreeToken* tokn = (TreeToken*)tokens[nid-1];
00177 string const* attr = tokn->get(CPosTag);
00178 if (attr && *attr == pos)
00179 return tokn;
00180 nid = tokn->linkHead();
00181 }
00182 return 0;
00183 }
00184
00185 TreeToken* TreeToken::follow(Parser::TokenPath const& tp, Sentence& sentence)
00186 {
00187 TreeToken* tok = this;
00188 FOR_EACH(vector<TokenPath::Direction>, tp.path, pit) {
00189 switch (*pit) {
00190 case TokenPath::leftChild:
00191
00192 tok = tok->left.empty() ? 0 : tok->left.front();
00193 break;
00194
00195 case TokenPath::rightChild:
00196
00197 tok = tok->right.empty() ? 0 : tok->right.front();
00198 break;
00199
00200 case TokenPath::parent:
00201 if (tok->linkHead() == 0) {
00202 tok = 0;
00203 break;
00204 }
00205 tok = sentence[tok->linkHead() - 1];
00206 break;
00207
00208 case TokenPath::leftSibling: {
00209 if (tok->linkHead() == 0) {
00210 tok = 0;
00211 break;
00212 }
00213 TreeToken* parent = sentence[tok->linkHead() - 1];
00214 TreeToken* sibl = 0;
00215
00216 FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00217 if (*cit == tok) {
00218 if (++cit != parent->left.end()) {
00219 sibl = *cit;
00220 break;
00221 }
00222 }
00223 }
00224 if (sibl) {
00225 tok = sibl;
00226 break;
00227 }
00228
00229 FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00230 if (*cit == tok) {
00231 tok = 0;
00232 break;
00233 }
00234 sibl = *cit;
00235 }
00236 tok = (tok == 0) ? sibl : 0;
00237 break;
00238 }
00239
00240 case TokenPath::rightSibling: {
00241 if (tok->linkHead() == 0) {
00242 tok = 0;
00243 break;
00244 }
00245 TreeToken* parent = sentence[tok->linkHead() - 1];
00246 TreeToken* sibl = 0;
00247
00248 FOR_EACH(vector<TreeToken*>, parent->right, cit) {
00249 if (*cit == tok) {
00250 if (++cit != parent->left.end()) {
00251 sibl = *cit;
00252 break;
00253 }
00254 }
00255 }
00256 if (sibl) {
00257 tok = sibl;
00258 break;
00259 }
00260
00261 FOR_EACH(vector<TreeToken*>, parent->left, cit) {
00262 if (*cit == tok) {
00263 tok = 0;
00264 break;
00265 }
00266 sibl = *cit;
00267 }
00268 tok = (tok == 0) ? sibl : 0;
00269 break;
00270 }
00271
00272 case TokenPath::previous:
00273 tok = (tok->id > 1) ? sentence[tok->id - 2] : 0;
00274 break;
00275
00276 case TokenPath::next:
00277 tok = (tok->id < sentence.size()) ? sentence[tok->id] : 0;
00278 break;
00279 }
00280
00281 if (!tok) break;
00282 }
00283 return tok;
00284 }
00285
00286 void TreeToken::print(ostream& os, int indent) const
00287 {
00288 Attributes& attributes = token->attributes;
00289 os << setw(indent) << ' ' << "<tok id=\"" << id
00290 << "\" form=\"" << attributes.get("FORM")
00291 << "\" lemma=\"" << attributes.get("LEMMA")
00292 << "\" cpos=\"" << attributes.get("CPOS")
00293 << "\" pos=\"" << attributes.get("POS")
00294 << "\" morph=\"" << attributes.get("FEATS")
00295 << "\" head=\"" << linkHead() << "\" deprel=\"" << linkLabel() << "\">" << endl;
00296 indent += 2;
00297 FOR_EACH (vector<TreeToken*>, left, it)
00298 (*it)->print(os, indent);
00299 FOR_EACH (vector<TreeToken*>, right, it)
00300 (*it)->print(os, indent);
00301 }
00302
00303 void TreeToken::printTab(ostream& os)
00304 {
00305 Attributes& attributes = token->attributes;
00306 bool first = true;
00307 FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00308 string const* tag = attributes.get(*it);
00309 if (first)
00310 first = false;
00311 else
00312 os << "\t";
00313 os << ((!tag || tag->empty()) ? "_" : *tag);
00314 }
00315 }
00316
00317 void TreeToken::printTab(ostream& os, Corpus const& corpus)
00318 {
00319 Attributes& attributes = token->attributes;
00320 TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00321 bool first = true;
00322 FOR_EACH (TokenFields, corpus.tokenFields, it) {
00323 if (first)
00324 first = false;
00325 else
00326 os << "\t";
00327 switch (tfit->role) {
00328 case TokenField::head:
00329 os << linkHead(); break;
00330 case TokenField::deprel:
00331 os << (linkLabel().empty() ? tfit->default_ : linkLabel()); break;
00332 default:
00333 if (tfit->use == TokenField::ignore)
00334 os << tfit->default_;
00335 else {
00336 string const* tag = attributes.get(it->name);
00337 os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00338 }
00339 }
00340 ++tfit;
00341 }
00342 }
00343
00344 void TreeToken::printConll08(ostream& os, Corpus const& corpus, vector<int>* preds)
00345 {
00346 Attributes& attributes = token->attributes;
00347 TokenFields::const_iterator tfit = corpus.tokenFields.begin();
00348 TokenLinks::const_iterator tlit = token->links.begin();
00349 bool first = true;
00350 FOR_EACH (vector<char const*>, attributes.attributeIndex->names, it) {
00351 string const* tag = attributes.get(*it);
00352 if (first)
00353 first = false;
00354 else
00355 os << "\t";
00356 if (!tfit->link.empty())
00357 os << tlit->head;
00358 else if (!tfit->label.empty()) {
00359 if (tlit != token->links.end()) {
00360 os << (tlit->label.empty() ? tfit->default_ : tlit->label);
00361 ++tlit;
00362 }
00363 } else {
00364 os << ((!tag || tag->empty()) ? tfit->default_ : *tag);
00365 }
00366 ++tfit;
00367 }
00368
00369 # ifdef COMPACT
00370 preds = 0;
00371 # endif
00372 if (preds) {
00373 unsigned predNo = 0;
00374 for (; tlit != token->links.end(); ++tlit) {
00375 int head = tlit->head;
00376 while (head != (*preds)[predNo++])
00377 os << "\t_";
00378 os << '\t' << tlit->label;
00379 }
00380 while (predNo++ < preds->size())
00381 os << "\t_";
00382 } else {
00383 for (; tlit != token->links.end(); ++tlit)
00384 os << '\t' << tlit->label << '[' << tlit->head << ']';
00385 }
00386 }
00387
00388 void TreeToken::printLeaves(ostream& os)
00389 {
00390 for (vector<TreeToken*>::reverse_iterator it = left.rbegin(); it != left.rend(); ++it)
00391 (*it)->printLeaves(cout);
00392 cout << token->form << ' ';
00393 FOR_EACH (vector<TreeToken*>, right, it)
00394 (*it)->printLeaves(cout);
00395 }
00396
00397 }