Tanl Linguistic Pipeline
00001 /* 00002 ** Tanl 00003 ** NER/NER.h 00004 ** ---------------------------------------------------------------------- 00005 ** Copyright (c) 2005 Giuseppe Attardi (attardi@di.unipi.it). 00006 ** ---------------------------------------------------------------------- 00007 ** 00008 ** This file is part of Tanl. 00009 ** 00010 ** Tanl is free software; you can redistribute it and/or modify it 00011 ** under the terms of the GNU General Public License, version 3, 00012 ** as published by the Free Software Foundation. 00013 ** 00014 ** Tanl is distributed in the hope that it will be useful, 00015 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 ** GNU General Public License for more details. 00018 ** 00019 ** You should have received a copy of the GNU General Public License 00020 ** along with this program. If not, see <http://www.gnu.org/licenses/>. 00021 ** ---------------------------------------------------------------------- 00022 */ 00023 00024 #ifndef Tanl_NER_NER_H 00025 #define Tanl_NER_NER_H 00026 00027 // IXE library 00028 #include "conf/conf_bool.h" 00029 #include "conf/conf_float.h" 00030 #include "conf/conf_int.h" 00031 #include "conf/conf_string.h" 00032 00033 // local 00034 #include "include/config.h" 00035 #include "NerEventStream.h" 00036 #include "include/IPipe.h" 00037 #include "RefCountable.h" 00038 #include "Corpus/Corpus.h" 00039 #include "classifier/MaxEnt.h" 00040 00041 struct _object; // PyObject 00042 00043 namespace Tanl { 00044 namespace NER { 00045 00050 00051 // Configuration Parameters 00053 00054 00058 class NER : public IPipe<std::vector<Token*>*, std::vector<Token*>*>, 00059 public virtual RefCountable 00060 { 00061 public: 00062 00064 IXE::conf<std::string> resourceDir; 00066 IXE::conf<std::string> language; 00068 IXE::conf<int> cutoff; 00070 IXE::conf<int> iter; 00072 IXE::conf<float> alpha; 00074 IXE::conf<bool> verbose; 00075 00080 
NER(char const* modelFile, char const* configFile = 0, 00081 char const* POStag = "POSTAG", char const* NEtag = "NETAG"); 00082 00083 ~NER(); 00084 00089 void train(SentenceReader* sentenceReader, char const* modelFile); 00090 00096 std::vector<Token*>* tag(std::vector<Token*>* sent, 00097 NerEventStream* eventStream = 0); 00098 00103 Enumerator<std::vector<Token*>*>* pipe(Enumerator<std::vector<Token*>*>& tve); 00104 00105 # ifdef HAVE_PYTHON 00106 00110 Enumerator<std::vector<Token*>*>* pipe(struct _object* pit); // PyObject 00111 # endif 00112 00113 friend class NerPipe; 00114 friend class NerPyPipe; 00115 00116 char const* POStag; 00117 char const* NEtag; 00118 00119 private: 00120 Classifier::MaxEnt* model; 00121 Resources resources; 00122 }; 00123 00127 struct NerPipe : public Enumerator<std::vector<Token*>*> 00128 { 00130 NerPipe(NER& ner, Enumerator<std::vector<Token*>*>& se); 00131 00135 void Dispose(); 00136 00139 bool MoveNext(); 00140 00142 std::vector<Token*>* Current(); 00143 00144 private: 00145 NER& ner; 00146 NerEventStream eventStream; 00147 Enumerator<std::vector<Token*>*>& se; 00148 vector<Token*>* sent; 00149 unordered_map<int, int> outcomeId; 00150 }; 00151 00152 } // namespace NER 00153 } // namespace Tanl 00154 00155 #endif // Tanl_NER_NER_H