00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_Parser_H
00025 #define DeSR_Parser_H
00026
00027
00028 #include "config.h"
00029 #include "IPipe.h"
00030 #include "RefCountable.h"
00031 #include "SentenceReader.h"
00032 #include "ReviseEventStream.h"
00033 #include "WordCounts.h"
00034
00035
00036 #include "conf/conf_bool.h"
00037 #include "conf/conf_int.h"
00038 #include "conf/conf_string.h"
00039
00040 struct _object;
00041
00046 namespace Parser {
00047
00048 extern IXE::conf<std::string> algorithm;
00049 extern IXE::conf<std::string> lang;
00050 extern IXE::conf<std::string> fileVersion;
00051 extern IXE::conf<int> beam;
00052 extern bool showTreelets;
00053
00058 struct GlobalInfo
00059 {
00060 WordCounts timeLemmas;
00061 WordCounts locLemmas;
00062
00067 static float const freqRatio;
00068
00072 virtual void extract(Sentence const& sentence);
00073
00077 void clearRareEntities();
00078
00082 void clear();
00083
00084 void save(std::ofstream& ofs);
00085
00086 void load(std::ifstream& ifs);
00087
00088 };
00089
00090 struct ParserPipe;
00091
00097 class Parser : public IPipe<Sentence*, Sentence*>,
00098 public virtual RefCountable
00099 {
00100 public:
00101
00102 Parser(WordIndex& predIndex) :
00103 predIndex(predIndex)
00104 # ifdef ORACLE
00105 , oracleCorrect(0),
00106 oracleCount(0)
00107 # endif
00108 { }
00109
00110 ~Parser();
00111
00115 static Parser* create(char const* modelFile = 0);
00116
00121 virtual void train(SentenceReader* sentenceReader,
00122 char const* modelFile) {}
00127 virtual Sentence* parse(Sentence* sentence) { return 0; }
00128
00133 virtual void parse(SentenceReader* sentenceReader,
00134 std::ostream& os = std::cout);
00135
00143 virtual void revise(SentenceReader* sentenceReader,
00144 char const* actionFile = 0) { }
00145
00149 std::deque<Sentence*> collectSentences(Enumerator<Sentence*>* sentenceReader);
00150
00152 virtual void showEval(int tokenCount, int las, int uas,
00153 int sentCount);
00154
00156 void writeHeader(std::ostream& os);
00157
00161 static bool readHeader(std::istream& is);
00162
00167 Enumerator<Sentence*>* pipe(Enumerator<std::vector<Token*>*>& tve);
00168
00169 # ifdef HAVE_PYTHON
00170
00174 Enumerator<Sentence*>* pipe(struct _object* pit);
00175
00176 # endif
00177
00182 Enumerator<Sentence*>* pipe(Enumerator<Sentence*>& tce);
00183
00187 virtual void preprocess(Sentence* sentence);
00188
00193 static std::string procStat();
00194
00195 WordIndex& predIndex;
00196 GlobalInfo info;
00197
00199 static IXE::conf<int> featureCutoff;
00200
00202 static IXE::conf<int> lexCutoff;
00203
00205 static IXE::conf<bool> verbose;
00206
00207 # ifdef ORACLE
00208 int oracleCorrect;
00209 int oracleCount;
00210 # endif
00211 };
00212
00216 struct ParserPipe : public Enumerator<Sentence*>
00217 {
00218 public:
00219 ParserPipe(Parser& parser, Enumerator<std::vector<Token*>*>& tve);
00220
00221 bool MoveNext();
00222
00223 Sentence* Current();
00224
00228 void Dispose();
00229
00230 private:
00231 Parser& parser;
00232 Enumerator<std::vector<Token*>*>& tve;
00233 Language const* language;
00234 };
00235
00239 struct ParserSentPipe : public Enumerator<Sentence*>
00240 {
00241 public:
00242 ParserSentPipe(Parser& parser, Enumerator<Sentence*>& tve);
00243
00244 bool MoveNext();
00245
00246 Sentence* Current();
00247
00251 void Dispose();
00252
00253 private:
00254 Parser& parser;
00255 Enumerator<Sentence*>& tve;
00256 };
00257
00259
00260 typedef Parser* ParserFactory(char const*);
00261
00265 struct ParserMap
00266 {
00267 public:
00268 ParserMap(char const* type, ParserFactory* pf)
00269 {
00270 get()[type] = pf;
00271 }
00272
00273 static ParserFactory* get(char const* type);
00274
00275 private:
00276 static std::map<char const*, ParserFactory*>& get();
00277 };
00278
// Register `factory` as the builder for parsers of kind `type` by defining a
// file-local ParserMap object whose constructor records the association.
// The object is named registeredParser_<type>; identifiers containing a double
// underscore are reserved to the implementation, so none is generated here.
// Use at namespace scope where ParserMap is visible, followed by a semicolon.
#define REGISTER_PARSER(type, factory) static ParserMap registeredParser_ ## type(#type, factory)
00280
00281 }
00282
00283 #endif // DeSR_Parser_H