00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_Token_H
00025 #define Tanl_Token_H
00026
00027
00028 #include <string>
00029 #include <vector>
00030 #include <sstream>
00031 #include <map>
00032
00033 #include <iostream>
00034
00035
00036 #include "text/WordIndex.h"
00037 #include "text/RegExp.h"
00038
00039
00040 #include "include/RefCountable.h"
00041
00042 namespace Tanl {
00043
00044 typedef unsigned AttributeId;
00045
00051 struct AttributeIndex : public Text::WordIndex
00052 {
00053 std::vector<char const*> names;
00054
00059 AttributeId insert(const char* name) {
00060 AttributeId id = Text::WordIndex::insert(name);
00061 if (id >= names.size()) {
00062
00063 names.resize(id + 1);
00064 names[id] = ::strdup(name);
00065 }
00066 return id;
00067 }
00068
00069 AttributeIndex& operator =(AttributeIndex const& ai) {
00070 Text::WordIndex::operator=(ai);
00071 for (unsigned i = 0; i < names.size(); i++)
00072 free((void*)names[i]);
00073 names.resize(ai.size());
00074 for (unsigned i = 0; i < ai.names.size(); i++)
00075 names[i] = strdup(ai.names[i]);
00076 return *this;
00077 }
00078
00079 ~AttributeIndex() {
00080 for (unsigned i = 0; i < names.size(); i++)
00081 free((void*)names[i]);
00082 }
00083 };
00084
00085 std::ostream& operator <<(std::ostream& os, AttributeIndex const& ai);
00086
00095 struct Attributes
00096 {
00097 typedef std::string Attribute;
00098
00099 AttributeIndex* attributeIndex;
00100 std::vector<Attribute> values;
00101
00102 static AttributeIndex* emptyAttrIndex;
00103
00104 Attributes(AttributeIndex* attributeIndex) :
00105 attributeIndex(attributeIndex)
00106 {
00107 if (attributeIndex)
00108 values.resize(attributeIndex->size());
00109 else
00110 this->attributeIndex = emptyAttrIndex;
00111 }
00112
00113 Attributes(AttributeIndex* attributeIndex, std::vector<Attribute>& values) :
00114 attributeIndex(attributeIndex),
00115 values(values)
00116 { }
00117
00118 Attribute& operator [](int i) { return values[i]; }
00119
00120 bool operator ==(const Attributes& other) {
00121 return attributeIndex == other.attributeIndex &&
00122 values == other.values;
00123 }
00124
00125 bool operator !=(const Attributes& other) { return !(*this == other); }
00126
00127 struct const_iterator {
00128 const_iterator(Attributes const& attributes, int fit = 0) :
00129 attributes(attributes),
00130 fit(fit)
00131 { }
00132
00133 bool operator ==(const const_iterator& other) {
00134 return &attributes == &other.attributes && fit == other.fit;
00135 }
00136 bool operator !=(const const_iterator& other) { return !(*this == other); }
00137 const_iterator& operator ++() { fit++; return *this; }
00138 const_iterator operator ++(int) {
00139 const_iterator tmp = *this;
00140 ++*this;
00141 return tmp;
00142 }
00143
00144
00145 std::pair<char const*, std::string const*> operator*() {
00146 std::string const& val = attributes.values[fit];
00147 return std::make_pair(attributes.attributeIndex->names[fit], &val);
00148 }
00149
00150 Attributes const& attributes;
00151 int fit;
00152 };
00153
00154 const_iterator begin() const { return const_iterator(*this); }
00155 const_iterator end() const { return const_iterator(*this, values.size()); }
00156
00160 std::string const* get(std::string const& name) const {
00161 AttributeId id = attributeIndex->index(name.c_str());
00162 return (id == Text::WordIndex::None) ? 0 : &values[id];
00163 }
00164
00168 std::string const* get(char const* name) const {
00169 AttributeId id = attributeIndex->index(name);
00170 return (id == Text::WordIndex::None) ? 0 : &values[id];
00171 }
00172
00176 AttributeId index(char const* key) const {
00177 return attributeIndex->index(key);
00178 }
00179
00183 void insert(char const* key, char const* value) {
00184 AttributeId id = attributeIndex->insert(key);
00185 if (id >= values.size())
00186 values.resize(id + 1);
00187 values[id] = value;
00188 }
00189
00193 void insert(char const* key, std::string const& value) {
00194 AttributeId id = attributeIndex->insert(key);
00195 if (id >= values.size())
00196 values.resize(id + 1);
00197 values[id] = value;
00198 }
00199
00203 void insert(char const* key, int value) {
00204 AttributeId id = attributeIndex->insert(key);
00205 if (id >= values.size())
00206 values.resize(id + 1);
00207 std::ostringstream oss;
00208 oss << value;
00209 values[id] = oss.str();
00210 }
00211 };
00212
/**
 * A labeled link from a token to its head.
 */
struct TokenLink
{
  /**
   * @param head  value stored in @c head; defaults to -1.
   * @param label link label; a null pointer is treated as the empty label,
   *              since building a std::string from null is undefined.
   */
  TokenLink(int head = -1, char const* label = "") :
    head(head),
    label(label ? label : "")
  { }

  /** Links are equal when both head and label match. */
  bool operator ==(const TokenLink& other) const {
    return head == other.head && label == other.label;
  }
  /** Const like operator==, so that const links can be compared too. */
  bool operator !=(const TokenLink& other) const { return !(*this == other); }

  int		head;		///< head of the link; -1 by default
  std::string	label;		///< label of the link
};

/** The list of links of a token. */
typedef std::vector<TokenLink> TokenLinks;
00235
/**
 * An XML-style tag: an element name plus a set of attributes.
 */
struct Tag
{
  /**
   * Build a tag from its textual form.
   * The definition lives outside this header; presumably it parses
   * @c xmlTag into @c name and @c attributes -- confirm against it.
   */
  Tag(const std::string& xmlTag);

  /**
   * Access the value of attribute @c key. As with std::map::operator[],
   * an empty entry is created when @c key is not present yet.
   */
  std::string& operator[](const std::string& key) {
    return attributes[key];
  }

  /**
   * @return the opening tag text: the name followed by each attribute as
   *         key="value", in the map's key order. Values are emitted as
   *         stored, without any escaping.
   */
  const std::string open() const {
    typedef std::map<std::string, std::string>::const_iterator AttrIter;

    std::string text("<");
    text += name;
    for (AttrIter attr = attributes.begin(); attr != attributes.end(); ++attr) {
      text += ' ';
      text += attr->first;
      text += "=\"";
      text += attr->second;
      text += '"';
    }
    text += ">";
    return text;
  }

  /** @return the closing tag text for this element. */
  const std::string close() const {
    std::string text("</");
    text += name;
    text += ">";
    return text;
  }

  std::string name;					///< element name
  std::map<std::string, std::string> attributes;	///< attribute name -> value
};
00264
00270 struct Context : public RefCountable
00271 {
00272 Context(const std::string& xmlTag, Context* parent = 0) :
00273 tag(xmlTag),
00274 parent(parent)
00275 {
00276 if (parent)
00277 parent->incRef();
00278 }
00279
00280 ~Context() {
00281 if (parent)
00282 parent->decRef();
00283 }
00284
00285 void Dispose() {
00286 decRef();
00287 }
00288
00289 Tag tag;
00290 Context* parent;
00291
00292 };
00293
00294 class Corpus;
00295
00301 struct Token
00302 {
00304 struct Morpho {
00305 std::string morph;
00306 char Case;
00307 char gender;
00308 char mode;
00309 char negative;
00310 char number;
00311 char person;
00312 char tense;
00313 char aspect;
00314 char transitive;
00315 char extra[20];
00316
00317 Morpho(const char* morph = "", size_t morphSize = 0) :
00318 morph(morph, morphSize),
00319 Case(0),
00320 gender(0),
00321 mode(0),
00322 negative(0),
00323 number(0),
00324 person(0),
00325 tense(0),
00326 transitive(0)
00327 {
00328 extra[0] = '\0';
00329 }
00330 };
00331
00332 Token(std::string& form, Attributes& attributes, Context* context = 0) :
00333 form(form),
00334 attributes(attributes),
00335 links(1, TokenLink(-1)),
00336 context(context)
00337 {
00338 if (context)
00339 context->incRef();
00340 }
00341
00342 Token(std::string& form, Attributes& attributes, TokenLinks& links, Context* context = 0) :
00343 form(form),
00344 attributes(attributes),
00345 links(links),
00346 context(context)
00347 {
00348 if (context)
00349 context->incRef();
00350 }
00351
00352 Token(Attributes& attributes, Context* context = 0) :
00353 attributes(attributes),
00354 links(1, TokenLink(-1)),
00355 context(context)
00356 {
00357 if (context)
00358 context->incRef();
00359 }
00360
00361 Token(std::string& form, AttributeIndex* attributeIndex, Context* context = 0) :
00362 form(form),
00363 attributes(attributeIndex),
00364 links(1, TokenLink(-1)),
00365 context(context)
00366 {
00367 if (context)
00368 context->incRef();
00369 }
00370
00371 Token(char const* form, AttributeIndex* attributeIndex, Context* context = 0) :
00372 form(form),
00373 attributes(attributeIndex),
00374 links(1, TokenLink(-1)),
00375 context(context)
00376 {
00377 if (context)
00378 context->incRef();
00379 }
00380
00381 Token(AttributeIndex* attributeIndex, Context* context = 0) :
00382 attributes(attributeIndex),
00383 links(1, TokenLink(-1)),
00384 context(context)
00385 {
00386 if (context)
00387 context->incRef();
00388 }
00389
00390 ~Token() {
00391 if (context)
00392 context->decRef();
00393 }
00394
00395 std::string const& operator [](char const* key) {
00396 return *attributes.get(key);
00397 }
00398
00400 std::string const* get(char const* key) const {
00401 return attributes.get(key);
00402 }
00403
00405 void set(char const* key, char const* value) {
00406 attributes.insert(key, value);
00407 }
00408
00410 void set(char const* key, std::string const& value) {
00411 attributes.insert(key, value);
00412 }
00413
00415 void set(char const* key, int value) {
00416 attributes.insert(key, value);
00417 }
00418
00420 int attrIndex(char const* name) const {
00421 std::vector<char const*> const& names = attributes.attributeIndex->names;
00422 unsigned idx = 0;
00423 for (std::vector<char const*>::const_iterator it = names.begin();
00424 it != names.end(); ++it) {
00425 if (!::strcmp(*it, name))
00426 break;
00427 idx++;
00428 }
00429 return (idx >= names.size()) ? -1 : (int)idx;
00430 }
00431
00432 bool operator ==(const Token& other) {
00433 return attributes == other.attributes &&
00434 links == other.links;
00435 }
00436 bool operator !=(const Token& other) { return !(*this == other); }
00437
00439 void print(std::ostream& os, int indent = 0) const;
00440
00441 void printTab(std::ostream& os, Corpus const& corpus);
00442
00443 std::string form;
00444 Morpho morpho;
00445 Attributes attributes;
00446 TokenLinks links;
00447 Context* context;
00448 };
00449
00453 std::ostream& operator <<(std::ostream& os, Token const& tok);
00454
00455 typedef std::vector<Token*> Sentence;
00456
00457 }
00458
00459 #endif // Tanl_Token_H