00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "SentenceReader.h"
00025
00026
00027 #include <iostream>
00028 #include <iomanip>
00029
00030
00031 #include "text/Utf8Utils.h"
00032 #include "io/Format.h"
00033
00034
00035 #include "Corpus.h"
00036
00037 using namespace std;
00038 using namespace Tanl::Text;
00039 using namespace IXE::io;
00040
00041 namespace Tanl {
00042
00043
00044
00045
00046 SentenceReader::SentenceReader(istream* is, Corpus* corpus) :
00047 is(is),
00048 corpus(corpus)
00049 { }
00050
00051
00052 static RegExp::Pattern reTab("([^\t\n]+)");
00053
00054 bool SentenceReader::MoveNext()
00055 {
00056 string line;
00057 if (!getline(*is, line) || line.empty())
00058 return false;
00059
00060 sentence = new Sentence(&corpus->language);
00061 vector<char const*> const& names = corpus->index.names;
00062 vector<int> preds;
00063
00064 int id = 1;
00065 RegExp::MatchGroups match(2);
00066
00067 do {
00068 Attributes attributes(&corpus->index);
00069 string form;
00070 int head = 0;
00071 string deprel;
00072 int fields = corpus->tokenFields.size();
00073 TokenLinks links;
00074 unordered_map<string, int> linkMap;
00075 int argNo = 0;
00076 int i = 0;
00077 char const* cur = line.c_str();
00078 char const* end = cur + line.size();
00079 while (reTab.match(cur, end, match) > 0) {
00080 TokenField const& tf = corpus->tokenFields[i];
00081 char const* fieldStart = cur + match[1].first;
00082 int fieldLen = match[1].second - match[1].first;
00083 string value(fieldStart, fieldLen);
00084
00085 if (value == tf.default_)
00086 value = "";
00087
00088 if (tf.use != TokenField::ignore) {
00089 attributes[i] = value;
00090 if (!tf.link.empty()) {
00091
00092 int head = value.empty() ? -1 : atoi(value.c_str());
00093 if (linkMap.find(tf.link) == linkMap.end()) {
00094
00095 linkMap[tf.link] = links.size();
00096 links.push_back(TokenLink(head));
00097 } else
00098
00099 links[linkMap[tf.link]].head = head;
00100 } else if (!tf.label.empty()) {
00101 if (value.empty())
00102 --argNo;
00103 else {
00104
00105 if (linkMap.find(tf.label) == linkMap.end()) {
00106
00107 linkMap[tf.label] = links.size();
00108
00109 links.push_back(TokenLink(--argNo, value.c_str()));
00110 } else
00111
00112 links[linkMap[tf.label]].label = value;
00113 }
00114 } else {
00115 switch (tf.role) {
00116 case TokenField::form:
00117 form = value; break;
00118 case TokenField::predicate:
00119 if (!value.empty())
00120 preds.push_back(id);
00121 break;
00122 }
00123 }
00124 }
00125 i++;
00126 cur += match[0].second;
00127 if (i == fields || cur == end) {
00128
00129 break;
00130 }
00131 }
00132 TreeToken* token = new TreeToken(id++, form, attributes, links);
00133 sentence->push_back(token);
00134 } while (getline(*is, line) && !line.empty());
00135 if (preds.size()) {
00136
00137 FOR_EACH (Sentence, *sentence, sit) {
00138 TO_EACH (TokenLinks, (*sit)->token->links, tit) {
00139 if (tit->head < 0)
00140 tit->head = preds[-tit->head - 1];
00141 }
00142 }
00143 }
00144 return true;
00145 }
00146
00147 Sentence* SentenceReader::Current()
00148 {
00149 return sentence;
00150 }
00151
00152
00153
00154
00155
00156
00157 static RegExp::Pattern reCoNLL("(\\d+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t\n]+)(?:\t([^\t]+)\t([^\t]+)(?:\t([^\t]+)\t([^\t]+)\t?)?)?");
00158
00159 ConllXSentenceReader::ConllXSentenceReader(istream* is, Corpus* corpus) :
00160 SentenceReader(is, corpus),
00161 morphExtractor(*corpus->language.morphExtractor)
00162 { }
00163
00164 bool ConllXSentenceReader::MoveNext()
00165 {
00166 string line;
00167 if (!getline(*is, line) || line.empty())
00168 return false;
00169
00170 sentence = new Sentence(&corpus->language);
00171 MorphExtractor::Features mf;
00172 vector<char const*>& names = corpus->index.names;
00173
00174 int ln = 1;
00175 RegExp::MatchGroups match(11);
00176 do {
00177 int matches = reCoNLL.match(line, match);
00178 if (matches > 0) {
00179 char const* start = line.c_str();
00180 morphExtractor(start + match[6].first, start + match[6].second, mf);
00181 Attributes attributes(&corpus->index);
00182 int id = atoi(start + match[1].first);
00183 if (id != ln++)
00184 throw CorpusFormatError("Bad numbering: " + line);
00185 attributes.insert(names[0], string(start + match[1].first, match[1].second - match[1].first));
00186 string form(start + match[2].first, match[2].second - match[2].first);
00187 attributes.insert(names[1], form);
00188 int head = 0;
00189 string deprel;
00190 for (int i = 3; i < matches; i++) {
00191
00192 char const* tagStart = start + match[i].first;
00193 int tagLen = match[i].second - match[i].first;
00194 string value;
00195
00196 if (tagLen != 1 || tagStart[0] != '_')
00197 value = string(tagStart, tagLen);
00198 attributes.insert(names[i-1], value);
00199 if (i == 7)
00200 head = atoi(tagStart);
00201 else if (i == 8)
00202 deprel = value;
00203 }
00204
00205 if (id == head)
00206 head = 0;
00207 TokenLinks links(1, TokenLink(head, deprel.c_str()));
00208 TreeToken* token = new TreeToken(id, form, attributes, links);
00209
00210 token->token->morpho.set(mf);
00211 sentence->push_back(token);
00212 }
00213 } while (getline(*is, line) && !line.empty());
00214
00215 size_t len = sentence->size();
00216 FOR_EACH (Sentence, *sentence, sit) {
00217 int head = (*sit)->linkHead();
00218 if (head < 0 || head > len) {
00219 TreeToken* tok = *sit;
00220 Format msg("Wrong head at token: ID=%d FORM='%s' HEAD=%d ",
00221 tok->id, tok->token->form.c_str(), head);
00222 throw CorpusFormatError(msg);
00223 }
00224 }
00225 return true;
00226 }
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00250 DgaSentenceReader::DgaSentenceReader(istream* is, Corpus* corpus) :
00251 SentenceReader(is, corpus),
00252 reader(*is)
00253 { }
00254
00255 bool DgaSentenceReader::MoveNext()
00256 {
00257
00258 if (!reader.Read() || reader.NodeType != Tanl::XML::Element ||
00259 reader.Name != "s")
00260 return false;
00261
00262 sentence = new Sentence(&corpus->language);
00263 vector<char const*>& names = corpus->index.names;
00264 while (reader.Read()) {
00265 if (reader.NodeType == Tanl::XML::Whitespace)
00266 continue;
00267 if (reader.NodeType == Tanl::XML::EndElement)
00268 break;
00269 if (reader.NodeType == Tanl::XML::Element &&
00270 reader.Name == "tok") {
00271 Attributes attributes(&corpus->index);
00272 int id;
00273 string form;
00274 int head = 0;
00275 string deprel;
00276 while (reader.MoveToNextAttribute()) {
00277 if (reader.Name == "id")
00278 id = atoi(reader.Value.c_str());
00279 }
00280
00281 while (reader.Read()) {
00282 if (reader.NodeType == Tanl::XML::Whitespace)
00283 continue;
00284 if (reader.NodeType == Tanl::XML::EndElement)
00285 break;
00286 if (reader.NodeType != Tanl::XML::Element)
00287 goto fail;
00288 string& name = reader.Name;
00289 if (name == "dep") {
00290
00291 while (reader.MoveToNextAttribute()) {
00292 string& name = reader.Name;
00293 if (name == "head")
00294 head = atoi(reader.Value.c_str());
00295 else if (name == "type")
00296 deprel = reader.Value;
00297 }
00298 } else if (name == "orth") {
00299 if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
00300 goto fail;
00301 form = reader.Value;
00302 if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
00303 reader.Name != "orth")
00304 goto fail;
00305 } else {
00306 string tag = reader.Name;
00307 if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
00308 goto fail;
00309 attributes.insert(tag.c_str(), reader.Value);
00310 if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
00311 reader.Name != tag)
00312 goto fail;
00313 }
00314 }
00315
00316 if (reader.Name != "tok")
00317 goto fail;
00318 TokenLinks links(1, TokenLink(head, deprel.c_str()));
00319 TreeToken* tok = new TreeToken(id, form, attributes, links);
00320 sentence->push_back(tok);
00321 } else
00322 goto fail;
00323 }
00324
00325 if (reader.Name == "s")
00326 return true;
00327 fail:
00328 delete sentence;
00329 sentence = 0;
00330 return false;
00331 }
00332
00333
00334
00335
00339 TokenSentenceReader::TokenSentenceReader(istream* is, Corpus* corpus) :
00340 SentenceReader(is, corpus)
00341 {
00342 # ifdef STEMMER
00343 if (corpus && corpus->language)
00344 stemmer = sb_stemmer_new(corpus->language, 0);
00345 # endif
00346 }
00347
00348 RegExp::Pattern TokenSentenceReader::reTok("\\s*([^\\s]*?)");
00349
00350 bool TokenSentenceReader::MoveNext()
00351 {
00352 string line;
00353 if (!getline(*is, line) || line.empty())
00354 return false;
00355
00356 sentence = new Sentence(&corpus->language);
00357 int id = 1;
00358 RegExp::MatchGroups matches(2);
00359 do {
00360 char const* cur = line.c_str();
00361 char const* endSent = cur + line.size();;
00362 while (reTok.match(cur, endSent, matches) > 0) {
00363 const char* tokStart = cur + matches[1].first;
00364 int tokSize = matches[1].second - matches[1].first;
00365 cur += matches[0].second;
00366 string form(tokStart, tokSize);
00367 TreeToken* tok = new TreeToken(id++, form.c_str(), &corpus->index);
00368 sentence->push_back(tok);
00369 }
00370 } while (getline(*is, line) && line.size());
00371 return true;
00372 }
00373
00374
00375
00376 bool TaggedSentenceReader::MoveNext()
00377 {
00378 if (!reader->MoveNext())
00379 return false;
00380 sentence = reader->Current();
00381 if (tagger) {
00382
00383 Parser::Tagged tagSentence;
00384 FOR_EACH (Sentence, *sentence, sit) {
00385 TreeToken* token = *sit;
00386 tagSentence.words.push_back(token->token->form.c_str());
00387 string const* pos = token->get("POS");
00388 tagSentence.tags.push_back(pos ? pos->c_str() : 0);
00389 }
00390 if (tagger->tag(tagSentence)) {
00391 for (unsigned i = 0; i < sentence->size(); i++) {
00392 (*sentence)[i]->set("POS", tagSentence.tags[i]);
00393 TreeToken* node = (*sentence)[i];
00394 string const* lemma = node->get("LEMMA");
00395 if (lemma && *lemma == "") {
00396 char const* lemma = tagSentence.lemmas[i];
00397 if (strcmp(lemma, "<unknown>"))
00398 node->set("LEMMA", lemma);
00399 else
00400 node->set("LEMMA", node->token->form);
00401 }
00402 }
00403 }
00404 }
00405 return true;
00406 }
00407
00408 }