// Tanl Linguistic Pipeline
00001 /* 00002 ** Tanl 00003 ** split/Tokenizer/Tokenizer.h 00004 ** ---------------------------------------------------------------------- 00005 ** Copyright (c) 2008 Giuseppe Attardi (attardi@di.unipi.it). 00006 ** ---------------------------------------------------------------------- 00007 ** 00008 ** This file is part of Tanl. 00009 ** 00010 ** Tanl is free software; you can redistribute it and/or modify it 00011 ** under the terms of the GNU General Public License, version 3, 00012 ** as published by the Free Software Foundation. 00013 ** 00014 ** Tanl is distributed in the hope that it will be useful, 00015 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 ** GNU General Public License for more details. 00018 ** 00019 ** You should have received a copy of the GNU General Public License 00020 ** along with this program. If not, see <http://www.gnu.org/licenses/>. 00021 ** ---------------------------------------------------------------------- 00022 */ 00023 00024 #ifndef Tanl_Tokenizer_H 00025 #define Tanl_Tokenizer_H 00026 00027 // Python 00028 #include <Python.h> 00029 00030 // standard 00031 00032 // local 00033 #define QUEX_OPTION_ASSERTS_DISABLED 00034 #define QUEX_OPTION_WARNING_ON_PLAIN_FILLER_DISABLED 00035 #include "quex_tokenizer" 00036 00037 // Tanl library 00038 #include "IPipe.h" 00039 #include "Corpus/Token.h" 00040 00041 namespace Tanl { 00042 00047 struct TokenizerPipe; 00048 struct TokenizerPipeEnum; 00049 struct TokenizerPipePython; 00050 00051 class Tokenizer : public IPipe<std::string*, Token*> 00052 { 00053 public: 00055 Enumerator<Token*>* pipe(std::istream& is = std::cin); 00056 00058 Enumerator<Token*>* pipe(Enumerator<std::string*>& se); 00059 00061 Enumerator<Token*>* pipe(PyObject* pit); 00062 }; 00063 00067 struct TokenizerPipe : public Enumerator<Token*> 00068 { 00069 virtual ~TokenizerPipe(); 00070 00072 TokenizerPipe(Tokenizer& tokenizer, 
std::istream& is); 00073 00075 TokenizerPipe(Tokenizer& tokenizer, Enumerator<std::string*>& se); 00076 00079 bool MoveNext(); 00080 00082 Token* Current(); 00083 00084 int len; 00085 00086 protected: 00087 TokenizerPipe(); // to allow inheritance 00088 00089 quex::Token token; 00090 quex::quex_tokenizer qlex; 00091 Context* currContext; 00092 }; 00093 00097 struct TokenizerPipeEnum : public TokenizerPipe 00098 { 00099 TokenizerPipeEnum(Tokenizer& tokenizer, Enumerator<std::string*>& senum); 00100 00101 bool MoveNext(); 00102 00103 private: 00104 Enumerator<std::string*>& senum; 00105 }; 00106 00110 struct TokenizerPipePython : public TokenizerPipe 00111 { 00112 TokenizerPipePython(Tokenizer& tokenizer, PyObject* pit); 00113 00114 ~TokenizerPipePython(); 00115 00116 bool MoveNext(); 00117 00118 private: 00119 PyObject* pit; 00120 }; 00121 00122 } // namespace Tanl 00123 00124 #endif // Tanl_Tokenizer_H