00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef DeSR_State_H
00025 #define DeSR_State_H
00026
00027
00028
00029
00030 #include "text/RegExp.h"
00031
00032
00033 #include "Classifier.h"
00034
00035
00036 #include <vector>
00037 #include <limits>
00038
00039
00040 #include "Parser.h"
00041 #include "Iterator.h"
00042
00043 namespace Parser {
00044
00045 extern IXE::conf<bool> RightToLeft;
00046
00047 typedef char const* Action;
00048
00049 typedef std::vector<std::string> Features;
00050
00054 class SentenceInfo
00055 {
00056 public:
00057
00058 SentenceInfo(Sentence& sentence, GlobalInfo* info);
00059
00060 GlobalInfo* globalInfo;
00061
00063 std::vector<int> punctCount;
00064
00065 };
00066
00072 class State
00073 {
00074 public:
00080 State(Sentence const& sentence, GlobalInfo* globalInfo);
00081
00082 State(State const& state) :
00083 sentenceInfo(state.sentenceInfo),
00084 sentence(state.sentence.language),
00085 rootNode(state.rootNode),
00086 previous(&state),
00087 verbCount(state.verbCount),
00088 action(0),
00089 input(state.input),
00090 stack(state.stack),
00091 extracted(state.extracted)
00092 {
00093
00094 sentence.resize(state.sentence.size());
00095 std::copy(state.sentence.begin(), state.sentence.end(), sentence.begin());
00096 }
00097
00098 virtual ~State() {
00099 if (!previous || rootNode != previous->rootNode)
00100 delete rootNode;
00101 if (!previous) {
00102 delete sentenceInfo;
00103 }
00104 }
00105
00107 bool hasNext();
00108
00112 State* transition(Action action);
00113
00115 State* Shift();
00116
00118 State* Right(Action action);
00119
00121 State* Left(Action action);
00122
00124 State* right(Action action);
00125
00127 State* left(char const* action);
00128
00130 State* DepLink(Action action);
00131
00133 State* Extract();
00134
00136 State* Insert();
00137
00139 State* Pop();
00140
00141 Sentence* getSentence() {
00142 Sentence* sen = new Sentence(sentence);
00143 if (RightToLeft)
00144 sen->reverse();
00145 return sen;
00146 }
00147
00148 void showStatus();
00149
00150 Sentence sentence;
00151
00152 TreeToken* rootNode;
00153
00155 static Tanl::Text::RegExp::Pattern ispunct;
00156
00157 std::string splitFeature;
00158
00162 State const* previous;
00163
00167 Action action;
00168
00172 std::vector<TreeToken*> input;
00176 std::vector<TreeToken*> stack;
00180 std::vector<TreeToken*> extracted;
00181
00182 protected:
00183
00184
00185 virtual TreeToken* copy(TreeToken* x) { return x; }
00186
00187 SentenceInfo* sentenceInfo;
00188
00192 void predicates(Features& preds, Action action = 0);
00193
00195 int verbCount;
00196
00198 static Tanl::Text::RegExp::Pattern nonWordAscii;
00199
00200 private:
00201 void tokenFeatures(Features& preds);
00202 void prepChildEntities(Features& preds);
00203 };
00204
00210 class TrainState : public State, public Iterator<Tanl::Classifier::Event*>
00211 {
00212 public:
00213
00214 TrainState(Sentence const& sent, GlobalInfo* info);
00215
00216 bool hasNext() { return State::hasNext(); }
00217
00222 Tanl::Classifier::Event* next();
00223
00224 protected:
00228 Action nextAction();
00229
00230 Sentence annotated;
00231
00232 std::vector<int> dependents;
00233 };
00234
00241 class ParseState : public State, public Iterator<Tanl::Classifier::Context*>
00242 {
00243 public:
00244
00245 ParseState(Sentence& sent, GlobalInfo* globalInfo, WordIndex& predIndex);
00246
00247 ParseState(ParseState& prev) :
00248 State((State)prev),
00249 predIndex(prev.predIndex),
00250 lprob(0),
00251 refCount(0)
00252 {
00253 prev.refCount++;
00254 }
00255
00256 ~ParseState();
00257
00258 void prune();
00259
00260 int incRef() { return ++refCount; }
00261 int decRef() { return --refCount; }
00262
00263 bool hasNext();
00264 Tanl::Classifier::Context* next();
00265
00269 ParseState* transition(Action action);
00270
00271 TreeToken* copy(TreeToken* x) {
00272 TreeToken* tok = new TreeToken(*x);
00273 int id = x->id;
00274 if (id)
00275 return sentence[id - 1] = tok;
00276 else
00277 return rootNode = tok;
00278 }
00279
00280 double lprob;
00281
00282 protected:
00283
00284 WordIndex& predIndex;
00285 Tanl::Classifier::Context context;
00286
00287 int refCount;
00288 };
00289
00290 }
00291
00292 #endif // DeSR_State_H