00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_Token_H
00025 #define DeSR_Token_H
00026
00027
00028 #include <string>
00029 #include <vector>
00030 #include <sstream>
00031
00032
00033 #include "text/RegExp.h"
00034 #include "Common/util.h"
00035
00036
00037 #include "Language.h"
00038 #include "RefCountable.h"
00039 #include "TokenPath.h"
00040 #include "Corpus.h"
00041
00042 namespace Tanl {
00043
/**
 * A labeled link from a token to another token, identified by the
 * index of its head.
 */
struct TokenLink
{
  /**
   * @param head   index of the head token; the default -1 is the value
   *               used throughout this file for "no head assigned".
   * @param label  label of the link. A null pointer is treated as the
   *               empty label (constructing a std::string from a null
   *               pointer would be undefined behaviour).
   */
  TokenLink(int head = -1, char const* label = "") :
    head(head),
    label(label ? label : "")
  { }

  /// Two links are equal when both head and label are equal.
  bool operator ==(const TokenLink& other) const {
    return head == other.head && label == other.label;
  }

  /// Negation of operator ==; const so that it is usable on const links.
  bool operator !=(const TokenLink& other) const { return !(*this == other); }

  int         head;   ///< index of the head token, -1 by default
  std::string label;  ///< label of the link
};

/// Sequence of links attached to a single token.
typedef std::vector<TokenLink> TokenLinks;
00066
/**
 * A markup tag: an element name together with a key/value attribute map.
 */
struct Tag
{
  /// Builds a tag from @a xmlTag. Defined out of line (not visible here);
  /// presumably it parses the name and attributes -- confirm there.
  Tag(const std::string& xmlTag);

  /// Read/write access to the attribute @a key. As with std::map, a
  /// missing key is inserted with an empty value.
  std::string& operator[](const std::string& key) {
    return attributes[key];
  }

  /// @return the opening form of the tag, <name k1="v1" k2="v2">, with
  /// the attributes listed in the iteration order of the map (sorted by
  /// key). Names and values are emitted verbatim, without escaping.
  const std::string open() const {
    typedef std::map<std::string, std::string>::const_iterator AttrIter;

    std::string text("<");
    text += name;
    for (AttrIter attr = attributes.begin(); attr != attributes.end(); ++attr) {
      text += ' ';
      text += attr->first;
      text += "=\"";
      text += attr->second;
      text += '"';
    }
    text += ">";
    return text;
  }

  /// @return the closing form of the tag, </name>.
  const std::string close() const {
    std::string text("</");
    text += name;
    text += ">";
    return text;
  }

  std::string                        name;        ///< element name
  std::map<std::string, std::string> attributes;  ///< attribute name -> value
};
00095
00101 struct Context : public RefCountable
00102 {
00103 Context(const std::string& xmlTag, Context* parent = 0) :
00104 tag(xmlTag),
00105 parent(parent)
00106 {
00107 if (parent)
00108 parent->incRef();
00109 }
00110
00111 ~Context() {
00112 if (parent)
00113 parent->decRef();
00114 }
00115
00116 void Dispose() {
00117 decRef();
00118 }
00119
00120 Tag tag;
00121 Context* parent;
00122
00123 };
00124
00130 struct Token
00131 {
00133 struct Morpho {
00134 std::string morph;
00135 char Case;
00136 char gender;
00137 char mode;
00138 char negative;
00139 char number;
00140 char person;
00141 char tense;
00142 char aspect;
00143 char transitive;
00144 char extra[20];
00145
00146 Morpho(const char* morph = "", size_t morphSize = 0) :
00147 morph(morph, morphSize),
00148 Case(0),
00149 gender(0),
00150 mode(0),
00151 negative(0),
00152 number(0),
00153 person(0),
00154 tense(0),
00155 transitive(0)
00156 {
00157 extra[0] = '\0';
00158 }
00159
00161 void set(MorphExtractor::Features& mf) {
00162 Case = mf.Case[0];
00163 gender = mf.gender[0];
00164 number = mf.number[0];
00165 person = mf.person[0];
00166 mode = mf.mode[0];
00167 negative = mf.negative[0];
00168 number = mf.number[0];
00169 person = mf.person[0];
00170 tense = mf.tense[0];
00171 transitive = mf.trans[0];
00172 int extraLen = sizeof(extra) - 1;
00173 strncpy(extra, mf.extra, extraLen);
00174 extra[extraLen] = '\0';
00175 }
00176 };
00177
00178 Token(Token const& tok) :
00179 form(tok.form),
00180 morpho(tok.morpho),
00181 attributes(tok.attributes),
00182 links(tok.links),
00183 context(tok.context)
00184 {
00185 if (context)
00186 context->incRef();
00187 }
00188
00189 Token(std::string& form, Attributes& attributes, Context* context = 0) :
00190 form(form),
00191 attributes(attributes),
00192 links(1, TokenLink(-1)),
00193 context(context)
00194 {
00195 if (context)
00196 context->incRef();
00197 }
00198
00199 Token(std::string& form, Attributes& attributes, TokenLinks& links, Context* context = 0) :
00200 form(form),
00201 attributes(attributes),
00202 links(links),
00203 context(context)
00204 {
00205 if (context)
00206 context->incRef();
00207 }
00208
00209 Token(Attributes& attributes, Context* context = 0) :
00210 attributes(attributes),
00211 links(1, TokenLink(-1)),
00212 context(context)
00213 {
00214 if (context)
00215 context->incRef();
00216 }
00217
00218 Token(std::string& form, AttributeIndex* attributeIndex, Context* context = 0) :
00219 form(form),
00220 attributes(attributeIndex),
00221 links(1, TokenLink(-1)),
00222 context(context)
00223 {
00224 if (context)
00225 context->incRef();
00226 }
00227
00228 Token(char const* form, AttributeIndex* attributeIndex, Context* context = 0) :
00229 form(form),
00230 attributes(attributeIndex),
00231 links(1, TokenLink(-1)),
00232 context(context)
00233 {
00234 if (context)
00235 context->incRef();
00236 }
00237
00238 Token(std::string& form, Corpus& corpus, Context* context = 0);
00239
00240 Token(AttributeIndex* attributeIndex, Context* context = 0) :
00241 attributes(attributeIndex),
00242 links(1, TokenLink(-1)),
00243 context(context)
00244 {
00245 if (context)
00246 context->incRef();
00247 }
00248
00249 ~Token() {
00250 if (context)
00251 context->decRef();
00252 }
00253
00254 std::string const& operator [](char const* key) {
00255 return *attributes.get(key);
00256 }
00257
00259 std::string const* get(char const* key) const {
00260 return attributes.get(key);
00261 }
00262
00264 void set(char const* key, char const* value) {
00265 attributes.insert(key, value);
00266 }
00267
00269 void set(char const* key, std::string const& value) {
00270 attributes.insert(key, value);
00271 }
00272
00274 void set(char const* key, int value) {
00275 attributes.insert(key, value);
00276 }
00277
00279 int attrIndex(char const* name) const {
00280 std::vector<char const*> const& names = attributes.attributeIndex->names;
00281 unsigned idx = 0;
00282 FOR_EACH (std::vector<char const*>, names, it) {
00283 if (!::strcmp(*it, name))
00284 break;
00285 idx++;
00286 }
00287 return (idx >= names.size()) ? -1 : (int)idx;
00288 }
00289
00290 bool operator ==(const Token& other) {
00291 return attributes == other.attributes &&
00292 links == other.links;
00293 }
00294 bool operator !=(const Token& other) { return !(*this == other); }
00295
00296 bool isNoun(Language const* lang);
00297 bool isPreposition(Language const* lang);
00298 bool isTime(Language const* lang);
00299 bool isLocation(Language const* lang);
00300 bool isVerb(Language const* lang);
00301 std::string const* lemma() const;
00302 void lemma(std::string const&);
00303 std::string const* pos() const;
00304 void pos(std::string const&);
00305
00307 void print(std::ostream& os, int indent = 0) const;
00308
00309 std::string form;
00310 Morpho morpho;
00311 Attributes attributes;
00312 TokenLinks links;
00313 Context* context;
00314 };
00315
00319 std::ostream& operator <<(std::ostream& os, Token const& tok);
00320
00326 struct TreeToken
00327 {
00328 TreeToken(std::string& form, AttributeIndex* ai) :
00329 token(new Token(form, ai))
00330 { }
00331
00332 TreeToken(int id, char const* form, AttributeIndex* attributeIndex = 0) :
00333 token(new Token(form, attributeIndex)),
00334 id(id)
00335 { }
00336
00337 TreeToken(int id, std::string& form, Attributes& attributes,
00338 TokenLinks& links) :
00339 token(new Token(form, attributes, links)),
00340 id(id)
00341 { }
00342
00343 TreeToken(int id, std::string& form, Attributes& attributes) :
00344 token(new Token(form, attributes)),
00345 id(id)
00346 { }
00347
00348 TreeToken(int id, Token* token) :
00349 token(token),
00350 id(id)
00351 { }
00352
00353 TreeToken(TreeToken const& tok) :
00354 token(new Token(*tok.token)),
00355 id(tok.id),
00356 left(tok.left),
00357 right(tok.right)
00358 { }
00359
00360 virtual ~TreeToken() { delete token; }
00361
00362 Token* token;
00363 unsigned id;
00364 std::vector<TreeToken*> left;
00365 std::vector<TreeToken*> right;
00366
00368 std::string const& form() const { return token->form; }
00369
00371 std::string const* get(char const* key) const {
00372 return token->get(key);
00373 }
00374
00376 std::string const* predicted(char const* key) const {
00377
00378 return strcmp(key, "DEPREL") ? token->get(key) : &linkLabel();
00379 }
00380
00382 void set(char const* key, char const* value) {
00383 token->set(key, value);
00384 }
00385
00387 void set(char const* key, std::string const& value) {
00388 token->set(key, value);
00389 }
00390
00391 int linkHead(int i = 0) const {
00392 return token->links[i].head;
00393 }
00394 int linkHead(int h, int i = 0) {
00395 return token->links[i].head = h;
00396 }
00397 std::string const& linkLabel(unsigned i = 0) const {
00398 return token->links[i].label;
00399 }
00400 std::string const& linkLabel(std::string& l, unsigned i = 0) {
00401 return token->links[i].label = l;
00402 }
00403 std::string const& linkLabel(char const* l, unsigned i = 0) {
00404 return token->links[i].label = l;
00405 }
00406
00407 bool operator ==(const TreeToken& other) { return *token == *other.token; }
00408 bool operator !=(const TreeToken& other) { return !(*this == other); }
00409
00411 int size() {
00412 int count = 0;
00413 FOR_EACH (std::vector<TreeToken*>, left, it)
00414 count += (*it)->size();
00415 FOR_EACH (std::vector<TreeToken*>, right, it)
00416 count += (*it)->size();
00417 return count + 1;
00418 }
00419
00425 TreeToken* follow(Parser::TokenPath const& tp, Sentence& sentence);
00426
00427 int leftmostDescendant() {
00428 int lmd = id;
00429 FOR_EACH (std::vector<TreeToken*>, left, it)
00430 lmd = MIN(lmd, (*it)->leftmostDescendant());
00431 return lmd;
00432 }
00433
00434 int rightmostDescendant() {
00435 int lmd = id;
00436 FOR_EACH (std::vector<TreeToken*>, right, it)
00437 lmd = MAX(lmd, (*it)->rightmostDescendant());
00438 return lmd;
00439 }
00440
00444 TreeToken* ancestorCPos(char const* pos, Sentence& sent);
00445
00449 TreeToken* descendantCPos(char const* pos) {
00450
00451 FOR_EACH (std::vector<TreeToken*>, left, it) {
00452 TreeToken* tokn = *it;
00453 std::string const* attr = tokn->get("CPOS");
00454 if (attr && *attr == pos)
00455 return tokn;
00456 tokn = tokn->descendantCPos(pos);
00457 if (tokn)
00458 return tokn;
00459 }
00460 FOR_EACH (std::vector<TreeToken*>, right, it) {
00461 TreeToken* tokn = *it;
00462 std::string const* attr = tokn->get("CPOS");
00463 if (attr && *attr == pos)
00464 return tokn;
00465 tokn = tokn->descendantCPos(pos);
00466 if (tokn)
00467 return tokn;
00468 }
00469 return 0;
00470 }
00471
00473
00474 void print(std::ostream& os, int indent = 0) const;
00475
00477 void printTab(std::ostream& os);
00478
00479 void printTab(std::ostream& os, Corpus const& corpus);
00480
00485 void printConll08(std::ostream& os, Corpus const& corpus,
00486 std::vector<int>* preds = 0);
00487
00491 void printLeaves(std::ostream& os);
00492 };
00493
00494 }
00495
00496 #endif // DeSR_Token_H