00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_SplitSentenceReader_H
00025 #define DeSR_SplitSentenceReader_H
00026
00027
00028 #include "Corpus.h"
00029
00030 #undef yyFlexLexer
00031 #define yyFlexLexer ptbFlexLexer
00032 #include "FlexLexer.h"
00033
00034 #include "Tokenizer.h"
00035
00036 #include "SentenceReader.h"
00037
00038
00039 #include "text/less.h"
00040
00041
#include <cstring>
#include <set>
00043
00044 namespace Tanl {
00045
00052 class SplitSentenceReader : public SentenceReader
00053 {
00054 public:
00055
/**
 * Collection of the tokens that terminate a sentence, together with a
 * second collection of "follower" tokens.
 *
 * Both sets store the pointers handed to the constructor; the strings are
 * not copied, so they must stay alive for as long as this object is used.
 */
struct BoundarySet {
  /// Null-terminated array used when no boundary tokens are supplied.
  static char const* defaultBoundaryTokens[];
  /// Null-terminated array used when no follower tokens are supplied.
  static char const* defaultBoundaryFollowers[];

  /**
   * Fills the two sets from null-terminated arrays of C strings.
   *
   * @param boundaryTokens     null-terminated array of boundary tokens;
   *                           a null array is treated as empty.
   * @param boundaryFollowers  null-terminated array of follower tokens;
   *                           a null array is treated as empty.
   * @param regionBegin        accepted for interface compatibility; not
   *                           recorded by this constructor.
   * @param regionEnd          accepted for interface compatibility; not
   *                           recorded by this constructor.
   */
  BoundarySet(char const* boundaryTokens[] = defaultBoundaryTokens,
              char const* boundaryFollowers[] = defaultBoundaryFollowers,
              char const* regionBegin = 0,
              char const* regionEnd = 0) {
    (void)regionBegin;
    (void)regionEnd;
    if (boundaryTokens) {
      for (char const** scan = boundaryTokens; *scan; ++scan)
        boundaries.insert(*scan);
    }
    if (boundaryFollowers) {
      for (char const** scan = boundaryFollowers; *scan; ++scan)
        followers.insert(*scan);
    }
  }

  /**
   * Tells whether @a tok is one of the boundary tokens.
   *
   * The comparison is made on the characters of the strings rather than
   * through the set's own key ordering, so the answer does not depend on
   * whether @a tok happens to share its address with the stored string.
   * The set is expected to hold only a handful of entries, so a linear
   * scan is adequate.
   *
   * @param tok  token to test; a null pointer never matches.
   * @return true if a boundary token with the same text is present.
   */
  bool ends(char const* tok) const {
    if (!tok)
      return false;
    for (std::set<char const*>::const_iterator it = boundaries.begin();
         it != boundaries.end(); ++it) {
      if (std::strcmp(*it, tok) == 0)
        return true;
    }
    return false;
  }

  std::set<char const*> boundaries;  ///< tokens that end a sentence
  std::set<char const*> followers;   ///< tokens from boundaryFollowers
};
00080 static BoundarySet const defaultBS;
00081
00087 SplitSentenceReader(std::istream* is,
00088 Corpus* corpus = 0,
00089 BoundarySet const* bs = &defaultBS);
00090
00092 bool MoveNext();
00093
00095 void reset() { tokenizer.Reset(); insideRegion = false; }
00096
00097 private:
00098 Parser::Tokenizer tokenizer;
00099 BoundarySet const* boundarySet;
00100 bool insideRegion;
00101
00102 # ifdef STEMMER
00103 sb_stemmer* stemmer;
00104 # endif
00105 };
00106
00107 }
00108
00109 #endif // DeSR_SplitSentenceReader_H