Tanl Linguistic Pipeline |
00001 /* 00002 ** DeSR 00003 ** src/SentenceReader.h 00004 ** ---------------------------------------------------------------------- 00005 ** Copyright (c) 2006 Giuseppe Attardi (attardi@di.unipi.it). 00006 ** ---------------------------------------------------------------------- 00007 ** 00008 ** This file is part of DeSR. 00009 ** 00010 ** DeSR is free software; you can redistribute it and/or modify it 00011 ** under the terms of the GNU General Public License, version 3, 00012 ** as published by the Free Software Foundation. 00013 ** 00014 ** DeSR is distributed in the hope that it will be useful, 00015 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00017 ** GNU General Public License for more details. 00018 ** 00019 ** You should have received a copy of the GNU General Public License 00020 ** along with this program. If not, see <http://www.gnu.org/licenses/>. 00021 ** ---------------------------------------------------------------------- 00022 */ 00023 00024 #ifndef DeSR_SentenceReader_H 00025 #define DeSR_SentenceReader_H 00026 00027 // library 00028 #include "text/RegExp.h" 00029 #include "text/XmlReader.h" 00030 00031 // standard 00032 #include <istream> 00033 #include <deque> 00034 00035 // local 00036 #include "PosTagger.h" 00037 #include "Sentence.h" 00038 #include "Enumerator.h" 00039 00040 namespace Tanl { 00041 00042 class Corpus; 00043 00047 class SentenceReader : public Enumerator<Sentence*> 00048 { 00049 public: 00050 00051 SentenceReader() { } 00052 00058 SentenceReader(std::istream* is, Corpus* corpus); 00059 00062 00064 virtual bool MoveNext(); 00065 00067 virtual Sentence* Current(); 00068 00070 virtual void Reset() {} 00071 00073 00074 virtual ~SentenceReader() {} 00075 00076 Corpus* corpus; 00077 00078 protected: 00079 Sentence* sentence; 00080 std::istream* is; 00081 }; 00082 00089 class ConllXSentenceReader : public SentenceReader 00090 { 00091 public: 00092 00098 ConllXSentenceReader(std::istream* is, Corpus* corpus); 00099 00101 bool MoveNext(); 00102 00103 MorphExtractor const& morphExtractor; 00104 }; 00105 00111 class DgaSentenceReader : public SentenceReader 00112 { 00113 public: 00114 00119 DgaSentenceReader(std::istream* is, Corpus* corpus); 00120 00122 bool MoveNext(); 00123 00124 private: 00125 Tanl::XML::XmlReader reader; 00126 }; 00127 00134 class TokenSentenceReader : public SentenceReader 00135 { 00136 public: 00137 00142 TokenSentenceReader(std::istream* is, Corpus* corpus = 0); 00143 00145 bool MoveNext(); 00146 00147 private: 00148 static Tanl::Text::RegExp::Pattern reTok; 00149 00150 # ifdef STEMMER 00151 sb_stemmer* stemmer; 00152 # endif 00153 }; 00154 00158 class TaggedSentenceReader : public SentenceReader 00159 { 00160 public: 00161 TaggedSentenceReader(SentenceReader* reader, Parser::PosTagger* tagger) : 00162 reader(reader), 00163 tagger(tagger) 00164 { } 00165 00167 bool MoveNext(); 00168 00169 SentenceReader* reader; 00170 Parser::PosTagger* tagger; 00171 00172 }; 00173 00179 class SentenceQueueReader : public Enumerator<Sentence*> 00180 { 00181 public: 00182 SentenceQueueReader(std::deque<Sentence*>& vs) : 00183 sentences(vs) 00184 { } 00185 00186 bool MoveNext() { 00187 if (sentences.empty()) 00188 return false; 00189 current = sentences.front(); 00190 sentences.pop_front(); 00191 return true; 00192 } 00193 00194 Sentence* Current() { return current; } 00195 00196 void reset() {} 00197 00198 private: 00199 Sentence* current; 00200 std::deque<Sentence*>& sentences; 00201 }; 00202 00203 } // namespace Tanl 00204 00205 #endif // DeSR_SentenceReader_H