00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
#include "MorphSplitter.h"

// standard
#include <cctype>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
00032
00033 using namespace std;
00034
00035 namespace Tanl {
00036
00037 map<string, string> MorphSplitter::clitics;
00038
00039 MorphSplitter::MorphSplitter()
00040 {
00041 if (!clitics.size()) {
00042 clitics["ce"] = "PC1np";
00043 clitics["ci"] = "PC1np";
00044 clitics["gli"] = "PC3ms";
00045 clitics["glie"] = "PC3ms";
00046 clitics["la"] = "PC3fs";
00047 clitics["le"] = "PC3fn";
00048 clitics["li"] = "PC3mp";
00049 clitics["lo"] = "PC3ms";
00050 clitics["me"] = "PC1ns";
00051 clitics["mi"] = "PC1ns";
00052 clitics["se"] = "PC3nn";
00053 clitics["si"] = "PC3nn";
00054 clitics["te"] = "PC2ns";
00055 clitics["ti"] = "PC2ns";
00056 clitics["ve"] = "PC2np";
00057 clitics["vi"] = "PC2np";
00058 clitics["ne"] = "PCnn";
00059 }
00060 }
00061
00062
00063
00064 Enumerator<vector<Token*>*>* MorphSplitter::pipe(istream& is)
00065 {
00066 return new MorphStream(is);
00067 }
00068
00069 Enumerator<vector<Token*>*>* MorphSplitter::pipe(Enumerator<vector<Token*>*>& tve)
00070 {
00071 return new MorphPipe(tve);
00072 }
00073
00074
00075
00076
/**
 * Splits a combined tag into its tag part and a feature string.
 *
 * The upper-case characters of @p pos, in order, become @p postag; all the
 * remaining characters form a morphology code that is decoded positionally:
 *  - tag starting with 'V':
 *      - code "pp?"  -> num=<2>|mod=<0>
 *      - code "p???" -> num=<3>|mod=<0>|gen=<2>
 *      - code "m??" / "d??" -> num=<2>|per=<1>|mod=<0>|ten=p
 *      - code "f" / "g"     -> mod=<0>
 *      - anything else      -> num=<3>|per=<2>|mod=<0>|ten=<1>
 *  - tag "PC" or "PE" with a code starting with a digit:
 *      num=<2>|per=<0>|gen=<1>
 *  - otherwise: num=<1>|gen=<0>
 * where <k> is the k-th character of the morphology code.
 *
 * @param pos     combined tag, e.g. "Vip3s" or "PC3ms".
 * @param postag  receives the upper-case characters of @p pos.
 * @param feats   receives the feature string; empty when @p pos has no
 *                morphology code.
 */
static void splitPos(const std::string& pos, std::string& postag, std::string& feats)
{
  postag.clear();

  std::string morph;
  for (std::string::size_type i = 0; i < pos.size(); ++i) {
    // The <cctype> classifiers are undefined for negative values, which a
    // plain char holds for non-ASCII bytes: classify as unsigned char.
    if (isupper(static_cast<unsigned char>(pos[i])))
      postag += pos[i];
    else
      morph += pos[i];
  }

  std::ostringstream oss;
  if (!morph.empty()) {
    // The decoding below reads up to four positions. Pad short codes with
    // '\0' so that indexing never runs past the end of the string; a missing
    // position is emitted as '\0'.
    std::string code = morph;
    if (code.size() < 4)
      code.resize(4, '\0');

    const bool isVerb = !postag.empty() && postag[0] == 'V';
    if (isVerb) {
      if (code[0] == 'p') {
        if (code[1] == 'p')
          oss << "num=" << code[2] << "|mod=" << code[0];
        else
          oss << "num=" << code[3] << "|mod=" << code[0] << "|gen=" << code[2];
      } else if (code[0] == 'm' || code[0] == 'd')
        oss << "num=" << code[2] << "|per=" << code[1] << "|mod=" << code[0] << "|ten=p";
      else if (code[0] == 'f' || code[0] == 'g')
        oss << "mod=" << code[0];
      else
        oss << "num=" << code[3] << "|per=" << code[2] << "|mod=" << code[0] << "|ten=" << code[1];
    } else if ((postag == "PC" || postag == "PE")
               && isdigit(static_cast<unsigned char>(code[0])))
      oss << "num=" << code[2] << "|per=" << code[0] << "|gen=" << code[1];
    else
      oss << "num=" << code[1] << "|gen=" << code[0];
  }
  feats = oss.str();
}
00109
/**
 * Breaks a text into the pieces found between delimiter characters.
 *
 * Runs of delimiter characters count as a single separator and never yield
 * empty pieces; leading and trailing delimiters are ignored.
 *
 * @param text        the text to break up.
 * @param tokens      receives the pieces; any previous content is discarded.
 * @param delimiters  NUL-terminated set of separator characters.
 */
static void tokenizeText(const std::string& text, std::vector<std::string>& tokens,
                         const char* delimiters)
{
  tokens.clear();

  // Start of the first piece, if any.
  std::string::size_type start = text.find_first_not_of(delimiters);
  while (start != std::string::npos) {
    const std::string::size_type stop = text.find_first_of(delimiters, start);
    if (stop == std::string::npos) {
      // Last piece runs to the end of the text.
      tokens.push_back(text.substr(start));
      break;
    }
    tokens.push_back(text.substr(start, stop - start));
    start = text.find_first_not_of(delimiters, stop);
  }
}
00128
00129 vector<Token*>* MorphSplitter::split(vector<Token*>* inSentence)
00130 {
00131 vector<Token*>* outSentence = new vector<Token*>;
00132 unsigned tokenNo = 1;
00133 FOR_EACH (vector<Token*>, *inSentence, it) {
00134 Token* token = *it;
00135
00136 string form = *token->get("FORM");
00137 const string& pos = *token->get("POS");
00138 string lemma = *token->get("LEMMA");
00139
00140 string postag, feats;
00141
00142
00143
00144 bool unknown = false;
00145 if (lemma == "<unknown>" || (form.find('-') && form == lemma)) {
00146 unknown = true;
00147 lemma = form;
00148 }
00149
00150
00151 if (pos[pos.size() - 1] != 'c' || unknown) {
00152 splitPos(pos, postag, feats);
00153
00154 token->set("ID", tokenNo++);
00155 token->set("FORM", form);
00156 token->set("CPOSTAG", postag.substr(0, 1));
00157 token->set("POSTAG", postag);
00158 token->set("FEATS", feats);
00159
00160 outSentence->push_back(token);
00161 continue;
00162 }
00163
00164
00165
00166 vector<string> tokens;
00167 vector<string> lemmas;
00168 tokenizeText(lemma, lemmas, "|");
00169 tokenizeText(lemmas[0], tokens, "-");
00170
00171
00172
00173 if (tokens.size() == 1) {
00174 token->set("ID", tokenNo++);
00175 token->set("FORM", form);
00176 token->set("LEMMA", lemma);
00177 token->set("CPOSTAG", "S");
00178 token->set("POSTAG", "SP");
00179 token->set("FEATS", "_");
00180 outSentence->push_back(token);
00181 continue;
00182 }
00183
00184
00185
00186 for (int i = 1; i < tokens.size(); ++i)
00187 form = form.substr(0, form.size() - tokens[i].size());
00188 splitPos(pos.substr(0, pos.size() - 1), postag, feats);
00189
00190 token->set("ID", tokenNo++);
00191 token->set("FORM", form + "-");
00192 token->set("LEMMA", tokens[0]);
00193 token->set("CPOSTAG", postag.substr(0, 1));
00194 token->set("POSTAG", postag);
00195 token->set("FEATS", feats);
00196
00197 outSentence->push_back(token);
00198
00199
00200 for (int i = 1; i < tokens.size(); ++i) {
00201 form = i < tokens.size() - 1 ? tokens[i] + "-" : tokens[i];
00202
00203 string& pos = MorphSplitter::clitics[tokens[i]];
00204 splitPos(tokens[i], postag, feats);
00205
00206 AttributeIndex* ai = new AttributeIndex;
00207 Token* otherToken = new Token(form, ai);
00208 otherToken->set("ID", tokenNo++);
00209 otherToken->set("FORM", form);
00210 otherToken->set("LEMMA", pos);
00211
00212 otherToken->set("CPOSTAG", postag.substr(0, 1));
00213 otherToken->set("POSTAG", postag);
00214 otherToken->set("FEATS", feats);
00215
00216 outSentence->push_back(otherToken);
00217 }
00218 }
00219 return outSentence;
00220 }
00221
00222
00223
00224 MorphStream::MorphStream(std::istream& is) :
00225 is(is)
00226 { }
00227
00228 bool MorphStream::MoveNext()
00229 {
00230 sentence.clear();
00231
00232 string line;
00233 vector<string> attributes;
00234
00235 while (!is.eof()) {
00236 getline(is, line);
00237
00238 tokenizeText(line, attributes, "\t ");
00239 if (!attributes.size()) {
00240 if (sentence.size())
00241 return true;
00242 } else {
00243 AttributeIndex* ai = new AttributeIndex;
00244 ai->insert("FORM");
00245
00246 Token* token = new Token(attributes[0], ai);
00247 token->set("FORM", attributes[0]);
00248 token->set("POS", attributes[1]);
00249 token->set("LEMMA", attributes[2]);
00250
00251 sentence.push_back(token);
00252 }
00253 }
00254 return false;
00255 }
00256
00257 vector<Token*>* MorphStream::Current()
00258 {
00259 return MorphSplitter::split(&sentence);
00260 }
00261
00262
00263
00264 MorphPipe::MorphPipe(Enumerator<vector<Token*>*>& tve) :
00265 tve(tve)
00266 { }
00267
00268 bool MorphPipe::MoveNext()
00269 {
00270 return tve.MoveNext();
00271 }
00272
00273 vector<Token*>* MorphPipe::Current()
00274 {
00275 vector<Token*>* inSentence = tve.Current();
00276 vector<Token*>* sentence = MorphSplitter::split(inSentence);
00277 delete inSentence;
00278 return sentence;
00279 }
00280
00281 }
00282
00283 using namespace Tanl;
00284
00285 int main(int argc, char* argv[])
00286 {
00287 if (argc > 1) {
00288 cerr << "Usage: MorphSplitter < inputFile > outputFile" << endl
00289 << " splits morphed pos tags and clitics." << endl
00290 << " inputFile file in Tanl POS format, one token per line:" << endl
00291 << " form\\tlemma\\tpos+morph" << endl
00292 << " outputFile file in Tanl POS format, one token per line:" << endl
00293 << " form\\tlemma\\tcpos\\tpos\\tfeats" << endl;
00294 return -1;
00295 }
00296
00297 Enumerator<vector<Token*>*>* pipe = MorphSplitter().pipe();
00298 while (pipe->MoveNext()) {
00299 int id = 1;
00300 vector<Token*>* sentence = pipe->Current();
00301 for (vector<Token*>::const_iterator it = sentence->begin(); it != sentence->end(); ++it) {
00302 Token* token = *it;
00303 cout << id++ << "\t" << *token->get("FORM") << "\t"
00304 << *token->get("LEMMA") << "\t" << *token->get("CPOSTAG") << "\t"
00305 << *token->get("POSTAG") << "\t" << *token->get("FEATS") << endl;
00306
00307 delete token;
00308 }
00309 cout << endl;
00310 delete sentence;
00311 }
00312 delete pipe;
00313 return 0;
00314 }