tanl: tanl: parse/desr/src/SentenceReader.cpp Source File

00001 /*
00002 **  DeSR
00003 **  src/SentenceReader.cpp
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of DeSR.
00009 **
00010 **  DeSR is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  DeSR is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #include "SentenceReader.h"
00025 
00026 // standard
00027 #include <iostream>
00028 #include <iomanip>
00029 
00030 // IXE library
00031 #include "text/Utf8Utils.h"
00032 #include "io/Format.h"
00033 
00034 // local
00035 #include "Corpus.h"
00036 
00037 using namespace std;
00038 using namespace Tanl::Text;
00039 using namespace IXE::io;
00040 
00041 namespace Tanl {
00042 
00043 //======================================================================
00044 // SentenceReader
00045 
00046 SentenceReader::SentenceReader(istream* is, Corpus* corpus) :
00047   is(is),
00048   corpus(corpus)
00049 { }
00050 
00051 // Generic tab format reader
00052 static RegExp::Pattern reTab("([^\t\n]+)");
00053 
00054 bool SentenceReader::MoveNext()
00055 {
00056   string line;
00057   if (!getline(*is, line) || line.empty())
00058     return false;
00059 
00060   sentence = new Sentence(&corpus->language);
00061   vector<char const*> const& names = corpus->index.names;
00062   vector<int> preds;            // ids of predicates in sentence
00063 
00064   int id = 1;
00065   RegExp::MatchGroups match(2);
00066 
00067   do {
00068     Attributes attributes(&corpus->index);
00069     string form;
00070     int head = 0;
00071     string deprel;
00072     int fields = corpus->tokenFields.size();
00073     TokenLinks links;
00074     unordered_map<string, int> linkMap;
00075     int argNo = 0;
00076     int i = 0;
00077     char const* cur = line.c_str();
00078     char const* end = cur + line.size();
00079     while (reTab.match(cur, end, match) > 0) {
00080       TokenField const& tf = corpus->tokenFields[i];
00081       char const* fieldStart = cur + match[1].first;
00082       int fieldLen = match[1].second - match[1].first;
00083       string value(fieldStart, fieldLen);
00084       // clear empty fields
00085       if (value == tf.default_)
00086         value = "";
00087       // discard IGNORE fields
00088       if (tf.use != TokenField::ignore) {
00089         attributes[i] = value;
00090         if (!tf.link.empty()) {
00091           // Got target of link: create even when value is missing
00092           int head = value.empty() ? -1 : atoi(value.c_str());
00093           if (linkMap.find(tf.link) == linkMap.end()) {
00094             // Create link and assign position in LinkMap 
00095             linkMap[tf.link] = links.size();
00096             links.push_back(TokenLink(head));
00097           } else
00098             // fill head in previously created link
00099             links[linkMap[tf.link]].head = head;
00100         } else if (!tf.label.empty()) {
00101           if (value.empty())
00102             --argNo;
00103           else {
00104             // Got label of link
00105             if (linkMap.find(tf.label) == linkMap.end()) {
00106               // Create link with dummy target and assign position in LinkMap 
00107               linkMap[tf.label] = links.size();
00108               // use negative value to distinguish from real target
00109               links.push_back(TokenLink(--argNo, value.c_str()));
00110             } else
00111               // fill label in previously created link
00112               links[linkMap[tf.label]].label = value;
00113           }
00114         } else {
00115           switch (tf.role) {
00116           case TokenField::form:
00117             form = value; break;
00118           case TokenField::predicate:
00119             if (!value.empty())
00120               preds.push_back(id);
00121             break;
00122           }
00123         }
00124       }
00125       i++;
00126       cur += match[0].second;
00127       if (i == fields || cur == end) {
00128         // skip extra fields
00129         break;
00130       }
00131     }
00132     TreeToken* token = new TreeToken(id++, form, attributes, links);
00133     sentence->push_back(token);
00134   } while (getline(*is, line) && !line.empty());
00135   if (preds.size()) {
00136     // fix ARG links (CoNLL 2008 format)
00137     FOR_EACH (Sentence, *sentence, sit) {
00138       TO_EACH (TokenLinks, (*sit)->token->links, tit) {
00139         if (tit->head < 0)
00140           tit->head = preds[-tit->head - 1];
00141       }
00142     }
00143   }
00144   return true;
00145 }
00146 
00147 Sentence* SentenceReader::Current()
00148 {
00149   return sentence;
00150 }
00151 
00152 //======================================================================
00153 // ConllXSentenceReader
00154 
00155 // pattern for analyzing token line in CoNLL format:
00156 // czech 2007 has extra tab at end of lines.
00157 static RegExp::Pattern reCoNLL("(\\d+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t\n]+)(?:\t([^\t]+)\t([^\t]+)(?:\t([^\t]+)\t([^\t]+)\t?)?)?");
00158 
00159 ConllXSentenceReader::ConllXSentenceReader(istream* is, Corpus* corpus) :
00160   SentenceReader(is, corpus),
00161   morphExtractor(*corpus->language.morphExtractor)
00162 {  }
00163 
00164 bool ConllXSentenceReader::MoveNext()
00165 {
00166   string line;
00167   if (!getline(*is, line) || line.empty())
00168     return false;
00169 
00170   sentence = new Sentence(&corpus->language);
00171   MorphExtractor::Features mf;
00172   vector<char const*>& names = corpus->index.names;
00173 
00174   int ln = 1;                   // count lines
00175   RegExp::MatchGroups match(11);
00176   do {
00177     int matches = reCoNLL.match(line, match);
00178     if (matches > 0) {
00179       char const* start = line.c_str();
00180       morphExtractor(start + match[6].first, start + match[6].second, mf);
00181       Attributes attributes(&corpus->index);
00182       int id = atoi(start + match[1].first);
00183       if (id != ln++)
00184         throw CorpusFormatError("Bad numbering: " + line);
00185       attributes.insert(names[0], string(start + match[1].first, match[1].second - match[1].first)); // ID
00186       string form(start + match[2].first, match[2].second - match[2].first);
00187       attributes.insert(names[1], form); // FORM
00188       int head = 0;
00189       string deprel;
00190       for (int i = 3; i < matches; i++) {
00191         // LEMMA, CPOS, POS, FEATS, HEAD, DEPREL, PHEAD, PDEPREL
00192         char const* tagStart = start + match[i].first;
00193         int tagLen = match[i].second - match[i].first;
00194         string value;
00195         // discard CoNLL empty fields
00196         if (tagLen != 1 || tagStart[0] != '_')
00197           value = string(tagStart, tagLen);
00198         attributes.insert(names[i-1], value);
00199         if (i == 7)             // HEAD
00200           head = atoi(tagStart);
00201         else if (i == 8)        // DEPREL
00202           deprel = value;
00203       }
00204       // sanity check: avoid circularities
00205       if (id == head)
00206         head = 0;
00207       TokenLinks links(1, TokenLink(head, deprel.c_str()));
00208       TreeToken* token = new TreeToken(id, form, attributes, links);
00209       // add morpho features
00210       token->token->morpho.set(mf);
00211       sentence->push_back(token);
00212     }
00213   } while (getline(*is, line) && !line.empty());
00214   // sanity check
00215   size_t len = sentence->size();
00216   FOR_EACH (Sentence, *sentence, sit) {
00217     int head = (*sit)->linkHead();
00218     if (head < 0 || head > len) {
00219       TreeToken* tok = *sit;
00220       Format msg("Wrong head at token: ID=%d FORM='%s' HEAD=%d ",
00221                  tok->id, tok->token->form.c_str(), head);
00222       throw CorpusFormatError(msg);
00223     }
00224   }
00225   return true;
00226 }
00227 
00228 //======================================================================
00229 // DgaSentenceReader
00230 
00231 // The stream consists of sentences:
00232 //
00233 // <s>token+</s>
00234 //
00235 // Tokens have the following format:
00236 //
00237 // <tok id="...">
00238 //  <orth>...</orth>
00239 //  <lemma>...</lemma>
00240 //  <pos>...</pos>
00241 //  <gen>...</gen>
00242 //  <num>...</num>
00243 //  <per>...</per>
00244 //  <dep head="..." type="..." />
00245 // </tok>
00246 
00250 DgaSentenceReader::DgaSentenceReader(istream* is, Corpus* corpus) :
00251   SentenceReader(is, corpus),
00252   reader(*is)
00253 { }
00254 
00255 bool DgaSentenceReader::MoveNext()
00256 {
00257   // check for open tag <s>
00258   if (!reader.Read() || reader.NodeType != Tanl::XML::Element ||
00259       reader.Name != "s")
00260     return false;
00261 
00262   sentence = new Sentence(&corpus->language);
00263   vector<char const*>& names = corpus->index.names;
00264   while (reader.Read()) {
00265     if (reader.NodeType == Tanl::XML::Whitespace)
00266       continue;
00267     if (reader.NodeType == Tanl::XML::EndElement)
00268       break;
00269     if (reader.NodeType == Tanl::XML::Element &&
00270          reader.Name == "tok") {
00271       Attributes attributes(&corpus->index);
00272       int id;
00273       string form;
00274       int head = 0;
00275       string deprel;
00276       while (reader.MoveToNextAttribute()) {
00277         if (reader.Name == "id")
00278           id = atoi(reader.Value.c_str());
00279       }
00280       // read token contents
00281       while (reader.Read()) {
00282         if (reader.NodeType == Tanl::XML::Whitespace)
00283           continue;
00284         if (reader.NodeType == Tanl::XML::EndElement)
00285           break;
00286         if (reader.NodeType != Tanl::XML::Element)
00287           goto fail;
00288         string& name = reader.Name;
00289         if (name == "dep") {
00290           // read attributes
00291           while (reader.MoveToNextAttribute()) {
00292             string& name = reader.Name;
00293             if (name == "head")
00294               head = atoi(reader.Value.c_str());
00295             else if (name == "type")
00296               deprel = reader.Value;
00297           }
00298         } else if (name == "orth") {
00299           if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
00300             goto fail;
00301           form = reader.Value;
00302           if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
00303               reader.Name != "orth")
00304             goto fail;
00305         } else {
00306           string tag = reader.Name;
00307           if (!reader.Read() || reader.NodeType != Tanl::XML::Text)
00308             goto fail;
00309           attributes.insert(tag.c_str(), reader.Value);
00310           if (!reader.Read() || reader.NodeType != Tanl::XML::EndElement ||
00311               reader.Name != tag)
00312             goto fail;
00313         }
00314       }
00315       // ckeck closing tag
00316       if (reader.Name != "tok")
00317         goto fail;
00318       TokenLinks links(1, TokenLink(head, deprel.c_str()));
00319       TreeToken* tok = new TreeToken(id, form, attributes, links);
00320       sentence->push_back(tok);
00321     } else
00322       goto fail;
00323   }
00324   // check closing tag
00325   if (reader.Name == "s")
00326     return true;
00327  fail:
00328   delete sentence;
00329   sentence = 0;
00330   return false;
00331 }
00332 
00333 //======================================================================
00334 // TokenSentenceReader
00335 
00339 TokenSentenceReader::TokenSentenceReader(istream* is, Corpus* corpus) :
00340   SentenceReader(is, corpus)
00341 {
00342 # ifdef STEMMER
00343   if (corpus && corpus->language)
00344     stemmer = sb_stemmer_new(corpus->language, 0); // UTF-8 encoding
00345 # endif
00346 }
00347 
00348 RegExp::Pattern TokenSentenceReader::reTok("\\s*([^\\s]*?)");
00349 
00350 bool TokenSentenceReader::MoveNext()
00351 {
00352   string line;
00353   if (!getline(*is, line) || line.empty())
00354     return false;
00355 
00356   sentence = new Sentence(&corpus->language);
00357   int id = 1;
00358   RegExp::MatchGroups matches(2);
00359   do {
00360     char const* cur = line.c_str();
00361     char const* endSent = cur + line.size();;
00362     while (reTok.match(cur, endSent, matches) > 0) {
00363       const char* tokStart = cur + matches[1].first;
00364       int tokSize = matches[1].second - matches[1].first;
00365       cur += matches[0].second;
00366       string form(tokStart, tokSize);
00367       TreeToken* tok = new TreeToken(id++, form.c_str(), &corpus->index);
00368       sentence->push_back(tok);
00369     }
00370   } while (getline(*is, line) && line.size());
00371   return true;
00372 }
00373 
00374 //======================================================================
00375 
00376 bool TaggedSentenceReader::MoveNext()
00377 {
00378   if (!reader->MoveNext())
00379     return false;
00380   sentence = reader->Current();
00381   if (tagger) {
00382     // perform POS tagging
00383     Parser::Tagged tagSentence;
00384     FOR_EACH (Sentence, *sentence, sit) {
00385       TreeToken* token = *sit;
00386       tagSentence.words.push_back(token->token->form.c_str());
00387       string const* pos = token->get("POS");
00388       tagSentence.tags.push_back(pos ? pos->c_str() : 0);
00389     }
00390     if (tagger->tag(tagSentence)) {
00391       for (unsigned i = 0; i < sentence->size(); i++) {
00392         (*sentence)[i]->set("POS", tagSentence.tags[i]);
00393         TreeToken* node = (*sentence)[i];
00394         string const* lemma = node->get("LEMMA");
00395         if (lemma && *lemma == "") {
00396           char const* lemma = tagSentence.lemmas[i];
00397           if (strcmp(lemma, "<unknown>"))
00398             node->set("LEMMA", lemma);
00399           else
00400             node->set("LEMMA", node->token->form);
00401         }
00402       }
00403     }
00404   }
00405   return true;
00406 }
00407 
00408 } // namespace Tanl