// Tanl Linguistic Pipeline
00001 /* 00002 ** Hunpos 00003 ** src/PosTagger.h 00004 ** ---------------------------------------------------------------------- 00005 ** Copyright (c) 2003 Giuseppe Attardi (attardi@di.unipi.it). 00006 ** ---------------------------------------------------------------------- 00007 */ 00008 00009 #include "PosTagger.h" 00010 00011 #include "tree-tagger-api.h" 00012 #include<iostream> 00013 00014 using namespace std; 00015 00016 namespace Tanl { namespace POS { 00017 00018 PosTagger::PosTagger(char const* modelFile) 00019 { 00020 } 00021 00022 Tanl::Enumerator<std::vector<Tanl::Token*>* >* 00023 PosTagger::pipe(Tanl::Enumerator<Tanl::Token*>& ts) 00024 { 00025 return new PosTaggerPipe(*this, ts); 00026 } 00027 00028 bool PosTaggerPipe::MoveNext() 00029 { 00030 // collect sentence 00031 std::vector<Tanl::Token*> tokens; 00032 while (ts.MoveNext()) { 00033 Tanl::Token* tok = ts.Current(); 00034 if (tok->form == "\n") 00035 break; 00036 else 00037 tokens.push_back(tok); 00038 } 00039 if (tokens.empty()) 00040 return false; 00041 [[ to be defined ]] 00042 // extend Attributes with POS 00043 // We assume that tokens have the same attributes, i.e. they belong 00044 // to the same corpus. 00045 Tanl::AttributeIndex* ai = tokens[0]->attributes.attributeIndex; 00046 ai->insert("POS"); 00047 sentence = new std::vector<Tanl::Token*>(tokens.size()); 00048 for (unsigned i = 0; i < tokens.size(); ++i) { 00049 Tanl::Token* tok = new Tanl::Token(*tokens[i]); 00050 tok->set("POS", tagged.tags[i]); 00051 (*sentence)[i] = tok; 00052 } 00053 return true; 00054 } 00055 00056 std::vector<Tanl::Token*>* PosTaggerPipe::Current() 00057 { 00058 return sentence; 00059 } 00060 00061 } // namespace POS 00062 } // namespace Tanl