tanl: tanl: morph/MorphSplitter/MorphSplitter.cpp Source File

00001 /*
00002 **  Tanl
00003 **  MorphSplitter/MorphSplitter.cpp
00004 ** ----------------------------------------------------------------------
00005 **  Authors: Antonio Fuschetto (fuschett@di.unipi.it), University of Pisa
00006 **           Giuseppe Attardi (attardi@di.unipi.it), University of Pisa
00007 ** ----------------------------------------------------------------------
00008 **  Copyright (c) 2009  Giuseppe Attardi (attardi@di.unipi.it).
00009 ** ----------------------------------------------------------------------
00010 **
00011 **  This file is part of Tanl.
00012 **
00013 **  Tanl is free software; you can redistribute it and/or modify it
00014 **  under the terms of the GNU General Public License, version 3,
00015 **  as published by the Free Software Foundation.
00016 **
00017 **  Tanl is distributed in the hope that it will be useful,
00018 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 **  GNU General Public License for more details.
00021 **
00022 **  You should have received a copy of the GNU General Public License
00023 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00024 **  ----------------------------------------------------------------------
00025 */
00026 
00027 // Local
00028 #include "MorphSplitter.h"
00029 
00030 // Standard
00031 #include <sstream>
00032 
00033 using namespace std;
00034 
00035 namespace Tanl {
00036 
00037 map<string, string> MorphSplitter::clitics;
00038 
00039 MorphSplitter::MorphSplitter()
00040 {
00041   if (!clitics.size()) {
00042     clitics["ce"]   = "PC1np";
00043     clitics["ci"]   = "PC1np";
00044     clitics["gli"]  = "PC3ms";
00045     clitics["glie"] = "PC3ms";
00046     clitics["la"]   = "PC3fs";
00047     clitics["le"]   = "PC3fn";
00048     clitics["li"]   = "PC3mp";
00049     clitics["lo"]   = "PC3ms";
00050     clitics["me"]   = "PC1ns";
00051     clitics["mi"]   = "PC1ns";
00052     clitics["se"]   = "PC3nn";
00053     clitics["si"]   = "PC3nn";
00054     clitics["te"]   = "PC2ns";
00055     clitics["ti"]   = "PC2ns";
00056     clitics["ve"]   = "PC2np";
00057     clitics["vi"]   = "PC2np";
00058     clitics["ne"]   = "PCnn";
00059   }
00060 }
00061 
00062 // ======================================================================
00063 
00064 Enumerator<vector<Token*>*>* MorphSplitter::pipe(istream& is)
00065 {
00066   return new MorphStream(is);
00067 }
00068 
00069 Enumerator<vector<Token*>*>* MorphSplitter::pipe(Enumerator<vector<Token*>*>& tve)
00070 {
00071   return new MorphPipe(tve);
00072 }
00073 
00074 // ======================================================================
00075 // Auxiliary functions.
00076 
// Returns the character of `morph` at `index`, or '_' when the morphology
// string is too short to carry that position.  Keeps a truncated tag from
// reading past the end of the string.
static char morphAt(const std::string& morph, std::string::size_type index)
{
  return index < morph.size() ? morph[index] : '_';
}

// Splits a combined tag such as "Vip3s" or "PC3fs" into its parts.
//
//   pos     input tag: upper-case letters form the POS tag, every other
//           character is a morphology code.
//   postag  receives the upper-case letters of `pos`, in order.
//   feats   receives the features as "name=value" pairs joined by '|';
//           it is empty when `pos` has no morphology characters.
//
// The position of each feature inside the morphology string depends on the
// tag class (verb / personal pronoun / anything else), as laid out below.
// A position missing from a too-short tag is reported as '_'.
static void splitPos(const std::string& pos, std::string& postag, std::string& feats)
{
  postag.clear();

  std::string morph;
  for (std::string::size_type i = 0; i < pos.size(); ++i) {
    // The <cctype> classifiers are only defined for unsigned char values,
    // so convert before calling them.
    if (isupper(static_cast<unsigned char>(pos[i])))
      postag += pos[i];
    else
      morph += pos[i];
  }

  std::ostringstream oss;
  if (!morph.empty()) {
    const char first = morph[0];
    const bool isVerb = !postag.empty() && postag[0] == 'V';

    if (isVerb) {
      if (first == 'p') { // participle
        if (morphAt(morph, 1) == 'p') // present participle
          oss << "num=" << morphAt(morph, 2) << "|mod=" << first;
        else // past participle
          oss << "num=" << morphAt(morph, 3) << "|mod=" << first
              << "|gen=" << morphAt(morph, 2);
      } else if (first == 'm' || first == 'd') { // imperative or conditional
        oss << "num=" << morphAt(morph, 2) << "|per=" << morphAt(morph, 1)
            << "|mod=" << first << "|ten=p";
      } else if (first == 'f' || first == 'g') { // infinitive or gerund
        oss << "mod=" << first;
      } else { // other (indicative or conjunctive)
        oss << "num=" << morphAt(morph, 3) << "|per=" << morphAt(morph, 2)
            << "|mod=" << first << "|ten=" << morphAt(morph, 1);
      }
    } else if ((postag == "PC" || postag == "PE")
               && isdigit(static_cast<unsigned char>(first))) { // personal pronoun
      oss << "num=" << morphAt(morph, 2) << "|per=" << first
          << "|gen=" << morphAt(morph, 1);
    } else { // other
      oss << "num=" << morphAt(morph, 1) << "|gen=" << first;
    }
  }
  feats = oss.str();
}
00109 
// Splits `text` at any character found in `delimiters`.
//
//   text        string to split.
//   tokens      cleared first, then filled with the non-empty pieces in
//               order of appearance.
//   delimiters  NUL-terminated set of separator characters.
//
// Runs of consecutive delimiters, and delimiters at either end of `text`,
// produce no empty tokens.
static void tokenizeText(const std::string& text, std::vector<std::string>& tokens,
                         const char* delimiters)
{
  // FIXME (original author's note): rewrite using strtok_r().
  tokens.clear();

  std::string::size_type start = text.find_first_not_of(delimiters);
  while (start != std::string::npos) {
    const std::string::size_type stop = text.find_first_of(delimiters, start);
    if (stop == std::string::npos) {
      // Last piece: it runs to the end of the text.
      tokens.push_back(text.substr(start));
      break;
    }
    tokens.push_back(text.substr(start, stop - start));
    start = text.find_first_not_of(delimiters, stop);
  }
}
00128 
00129 vector<Token*>* MorphSplitter::split(vector<Token*>* inSentence)
00130 {
00131   vector<Token*>* outSentence = new vector<Token*>;
00132   unsigned tokenNo = 1;
00133   FOR_EACH (vector<Token*>, *inSentence, it) {
00134     Token* token = *it;
00135 
00136     string form = *token->get("FORM");
00137     const string& pos = *token->get("POS"); // input tag
00138     string lemma = *token->get("LEMMA");
00139 
00140     string postag, feats;
00141 
00142     // Watch for unknown lemma.
00143     // e.g.: de-condizionarlo   Vfc   <unknown>
00144     bool unknown = false;
00145     if (lemma == "<unknown>" || (form.find('-') && form == lemma)) {
00146       unknown = true;
00147       lemma = form;
00148     }
00149 
00150     // Check presence of clitics.
00151     if (pos[pos.size() - 1] != 'c' || unknown) {
00152       splitPos(pos, postag, feats);
00153 
00154       token->set("ID", tokenNo++);
00155       token->set("FORM", form);
00156       token->set("CPOSTAG", postag.substr(0, 1));
00157       token->set("POSTAG", postag);
00158       token->set("FEATS", feats);
00159 
00160       outSentence->push_back(token);
00161       continue;
00162     }
00163 
00164     // Watch for multiple lemmas.
00165     // e.g.: rifugiatisi   Vpsmpc   rifugiare-si|rifugiarsi-si
00166     vector<string> tokens;
00167     vector<string> lemmas;
00168     tokenizeText(lemma, lemmas, "|");
00169     tokenizeText(lemmas[0], tokens, "-");
00170 
00171     // False clitic, assume it is a proper name.
00172     // e.g.: Zevi   Vm2pc   <unknown>
00173     if (tokens.size() == 1) {
00174       token->set("ID", tokenNo++);
00175       token->set("FORM", form);
00176       token->set("LEMMA", lemma);
00177       token->set("CPOSTAG", "S");
00178       token->set("POSTAG", "SP");
00179       token->set("FEATS", "_");
00180       outSentence->push_back(token);
00181       continue;
00182     }
00183 
00184     // Output verbal root.
00185     // e.g.: leggervela   Vfc   leggere-ve-la
00186     for (int i = 1; i < tokens.size(); ++i)
00187       form = form.substr(0, form.size() - tokens[i].size());
00188     splitPos(pos.substr(0, pos.size() - 1), postag, feats);
00189 
00190     token->set("ID", tokenNo++);
00191     token->set("FORM", form + "-");
00192     token->set("LEMMA", tokens[0]);
00193     token->set("CPOSTAG", postag.substr(0, 1));
00194     token->set("POSTAG", postag);
00195     token->set("FEATS", feats);
00196 
00197     outSentence->push_back(token);
00198 
00199     // Output all clitics.
00200     for (int i = 1; i < tokens.size(); ++i) {
00201       form = i < tokens.size() - 1 ? tokens[i] + "-" : tokens[i];
00202 
00203       string& pos = MorphSplitter::clitics[tokens[i]];
00204       splitPos(tokens[i], postag, feats);
00205 
00206       AttributeIndex* ai = new AttributeIndex;
00207       Token* otherToken = new Token(form, ai);
00208       otherToken->set("ID", tokenNo++);
00209       otherToken->set("FORM", form);
00210       otherToken->set("LEMMA", pos);
00211       // FIXME: only for Tanl has single char corase POS
00212       otherToken->set("CPOSTAG", postag.substr(0, 1));
00213       otherToken->set("POSTAG", postag);
00214       otherToken->set("FEATS", feats);
00215 
00216       outSentence->push_back(otherToken);
00217     }
00218   }
00219   return outSentence;
00220 }
00221 
00222 // ======================================================================
00223 
00224 MorphStream::MorphStream(std::istream& is) :
00225   is(is)
00226 { }
00227 
00228 bool MorphStream::MoveNext()
00229 {
00230   sentence.clear();
00231 
00232   string line;
00233   vector<string> attributes;
00234 
00235   while (!is.eof()) {
00236     getline(is, line);
00237 
00238     tokenizeText(line, attributes, "\t ");
00239     if (!attributes.size()) {
00240       if (sentence.size())
00241         return true;
00242     } else {
00243       AttributeIndex* ai = new AttributeIndex;
00244       ai->insert("FORM");
00245 
00246       Token* token = new Token(attributes[0], ai);
00247       token->set("FORM", attributes[0]);
00248       token->set("POS", attributes[1]);
00249       token->set("LEMMA", attributes[2]);
00250 
00251       sentence.push_back(token);
00252     }
00253   }
00254   return false;
00255 }
00256 
00257 vector<Token*>* MorphStream::Current()
00258 {
00259   return MorphSplitter::split(&sentence);
00260 }
00261 
00262 // ======================================================================
00263 
00264 MorphPipe::MorphPipe(Enumerator<vector<Token*>*>& tve) :
00265   tve(tve)
00266 { }
00267 
00268 bool MorphPipe::MoveNext()
00269 {
00270   return tve.MoveNext();
00271 }
00272 
00273 vector<Token*>* MorphPipe::Current()
00274 {
00275   vector<Token*>* inSentence = tve.Current();
00276   vector<Token*>* sentence = MorphSplitter::split(inSentence);
00277   delete inSentence;
00278   return sentence;
00279 }
00280 
00281 } // namespace Tanl
00282 
00283 using namespace Tanl;
00284 
00285 int main(int argc, char* argv[])
00286 {
00287   if (argc > 1) {
00288     cerr << "Usage: MorphSplitter < inputFile > outputFile" << endl
00289          << "   splits morphed pos tags and clitics." << endl
00290          << "   inputFile       file in Tanl POS format, one token per line:" << endl
00291          << "                   form\\tlemma\\tpos+morph" << endl
00292          << "   outputFile      file in Tanl POS format, one token per line:" << endl
00293          << "                   form\\tlemma\\tcpos\\tpos\\tfeats" << endl;
00294     return -1;
00295   }
00296 
00297   Enumerator<vector<Token*>*>* pipe = MorphSplitter().pipe();
00298   while (pipe->MoveNext()) {
00299     int id = 1;
00300     vector<Token*>* sentence = pipe->Current();
00301     for (vector<Token*>::const_iterator it = sentence->begin(); it != sentence->end(); ++it) {
00302       Token* token = *it;
00303       cout << id++ << "\t" << *token->get("FORM") << "\t"
00304            << *token->get("LEMMA") << "\t" << *token->get("CPOSTAG") << "\t"
00305            << *token->get("POSTAG") << "\t" << *token->get("FEATS") << endl;
00306 
00307       delete token;
00308     }
00309     cout << endl;
00310     delete sentence;
00311   }
00312   delete pipe;
00313   return 0;
00314 }