00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "State.h"
00025 #include "Parser.h"
00026 #include "conf_feature.h"
00027 #include <assert.h>
00028
00029
00030
00031
00032 #include "text/WordSet.h"
00033
00034 using namespace std;
00035 using namespace IXE;
00036 using namespace Tanl::Text;
00037 using namespace Tanl::Classifier;
00038
00039 namespace Parser {
00040
00041
00042 conf_feature features("Features");
00043
00044
00045 conf<bool> arcEager("ArcEager", false);
00046
00047
00048 conf_feature SplitFeature("SplitFeature");
00049
00051 conf<bool> ClosestChildren("ClosestChildren", false);
00052
00054 conf<bool> PrepChildEntityType("PrepChildEntityType", false);
00055
00056 conf<bool> StackSize("StackSize", true);
00057 conf<bool> InputSize("InputSize", false);
00058 conf<bool> InPunct("InPunct", false);
00059 conf<bool> VerbCount("VerbCount", true);
00061 conf<bool> UseChildPunct("UseChildPunct", true);
00062 conf<int> PastActions("PastActions", 1);
00063 conf<bool> WordDistance("WordDistance", true);
00064 conf<bool> PunctCount("PunctCount", true);
00066 conf<bool> MorphoAgreement("MorphoAgreement", false);
00068 conf<bool> MorphoSplit("MorphoSplit", false);
00070 conf<bool> LexChildNonWord("LexChildNonWord", true);
00071
00073 conf<bool> CompositeActions("CompositeActions", true);
00074
00075
00076 conf<bool> SecondOrder("SecondOrder", false);
00077 conf<bool> FeatPairs("FeatPairs", false);
00078
00080 conf<bool> RightToLeft("RightToLeft", false);
00081
00082 conf<bool> ShowActions("ShowActions", false);
00083
00084
00085 conf<bool> unambiguous("UnambiguousFeatures", false);
00086
00087 WordSet actionTable;
00088
00089
00090 RegExp::Pattern State::ispunct("^\\p{P}+$",
00091 PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
00092 RegExp::Pattern State::nonWordAscii("^[^$0-9_-zA-Z]+$");
00093
00094 char const* mkAction(char const* a, string const& dep)
00095 {
00096 if (CompositeActions || !strcmp(a, "D")) {
00097 char action[128];
00098 sprintf(action, "%s%s", a, dep.c_str());
00099 return *actionTable.insert(action).first;
00100 } else
00101 return *actionTable.insert(a).first;
00102 }
00103
00104 char const* actionString(char const* a)
00105 {
00106 return *actionTable.insert(a).first;
00107 }
00108
00109
00110
00111
00112 SentenceInfo::SentenceInfo(Sentence& sentence, GlobalInfo* info) :
00113 globalInfo(info)
00114 {
00115 if (sentence.empty())
00116 return;
00117
00118 punctCount.push_back(State::ispunct.test(sentence[0]->token->form));
00119 for (unsigned i = 1; i < sentence.size(); ++i) {
00120 Token* token = sentence[i]->token;
00121 punctCount.push_back(punctCount[i-1] + State::ispunct.test(token->form));
00122 }
00123 }
00124
00125
00126
00127
00128 State::State(Sentence const& sent, GlobalInfo* info) :
00129 sentence(sent),
00130 rootNode(new TreeToken(0, "#NULL")),
00131 action(0),
00132 previous(0)
00133 {
00134 if (RightToLeft)
00135 sentence.reverse();
00136 sentenceInfo = new SentenceInfo(sentence, info);
00137
00138 input.resize(sentence.size());
00139 std::copy(sentence.rbegin(), sentence.rend(), input.begin());
00140
00141 stack.push_back(rootNode);
00142 }
00143
00144 bool State::hasNext()
00145 {
00146 return !input.empty();
00147 }
00148
00152 inline State* State::Shift()
00153 {
00154 TreeToken* next = input.back();
00155 stack.push_back(next);
00156 input.pop_back();
00157 action = "S";
00158 return this;
00159 }
00160
00161 inline State* State::Right(Action action)
00162 {
00163
00164 if (stack.size() == 1)
00165 return 0;
00166 TreeToken* top = copy(stack.back());
00167 stack.pop_back();
00168 TreeToken* next = copy(input.back());
00169 input.back() = next;
00170 next->left.push_back(top);
00171 top->linkHead(next->id, 0);
00172 if (CompositeActions)
00173 top->linkLabel(action+1);
00174 this->action = actionString(action);
00175 return this;
00176 }
00177
00178 inline State* State::Left(Action action)
00179 {
00180
00181 TreeToken* top = copy(stack.back());
00182 TreeToken* next = copy(input.back());
00183 top->right.push_back(next);
00184 if (arcEager) {
00185
00186 stack.push_back(next);
00187 input.pop_back();
00188 } else if (CompositeActions) {
00189 if (stack.size() > 1) {
00190 stack.pop_back();
00191 input.back() = top;
00192 } else {
00193
00194 stack.back() = top;
00195 input.pop_back();
00196 }
00197 } else {
00198 if (stack.size()) {
00199 stack.pop_back();
00200 input.back() = top;
00201 }
00202 }
00203 next->linkHead(top->id, 0);
00204 if (CompositeActions)
00205 next->linkLabel(action+1);
00206 this->action = actionString(action);
00207 return this;
00208 }
00209
00210 inline State* State::right(Action action)
00211 {
00212 unsigned n = action[1] - '0';
00213
00214 if (stack.size() <= n) {
00215
00216
00217 return 0;
00218 }
00219 TreeToken* nthTop = copy(stack[stack.size() - n]);
00220 stack.erase(stack.end() - n);
00221 TreeToken* next = copy(input.back());
00222 next->left.push_back(nthTop);
00223 nthTop->linkHead(next->id, 0);
00224 if (CompositeActions) {
00225 nthTop->linkLabel(action+2);
00226
00227 input.push_back(stack.back());
00228 stack.pop_back();
00229 }
00230 this->action = actionString(action);
00231 return this;
00232 }
00233
00234 inline State* State::left(char const* action)
00235 {
00236 unsigned n = action[1] - '0';
00237
00238
00239 if (stack.size() < n) {
00240
00241 return 0;
00242 }
00243 TreeToken* nthTop = copy(stack[stack.size() - n]);
00244 TreeToken* next = copy(input.back());
00245 nthTop->right.push_back(next);
00246 next->linkHead(nthTop->id, 0);
00247 if (CompositeActions)
00248 next->linkLabel(action+2);
00249
00250 input.back() = stack.back();
00251 stack.pop_back();
00252
00253 for (unsigned i = 0; i < n-2; i++) {
00254 input.push_back(stack.back());
00255 stack.pop_back();
00256 }
00257 if (stack.size() > 1) {
00258
00259 input.push_back(nthTop);
00260 stack.pop_back();
00261 } else {
00262
00263 stack.back() = nthTop;
00264 }
00265 this->action = actionString(action);
00266 return this;
00267 }
00268
00269 inline State* State::DepLink(Action action)
00270 {
00271 TreeToken* next = input.back();
00272 switch (previous ? previous->action[0] : this->action[0]) {
00273 case 'R':
00274 case 'r':
00275
00276 next->left.back()->linkLabel(action+1);
00277
00278
00279 if (this->action[0] == 'r') {
00280 input.push_back(stack.back());
00281 stack.pop_back();
00282 }
00283 this->action = actionString(action);
00284 return this;
00285
00286 case 'L':
00287 case 'l':
00288
00289 next->right.back()->linkLabel(action+1);
00290
00291 if (stack.empty()) {
00292 input.pop_back();
00293 stack.push_back(next);
00294 }
00295 this->action = actionString(action);
00296 return this;
00297 }
00298 return this;
00299 }
00300
00301 State* State::Extract()
00302 {
00303
00304 if (stack.size() < 3 ||
00305 input.size() < 1) {
00306
00307 return 0;
00308 }
00309 TreeToken* nthStack = stack[stack.size() - 2];
00310 extracted.push_back(nthStack);
00311 stack.erase(stack.end() - 2);
00312
00313 TreeToken* next = input.back();
00314 stack.push_back(next);
00315 input.pop_back();
00316 action = "E";
00317 return this;
00318 }
00319
00320 State* State::Insert()
00321 {
00322
00323 if (extracted.empty()) {
00324
00325 return 0;
00326 }
00327 input.push_back(extracted.back());
00328 extracted.pop_back();
00329 action = "I";
00330 return this;
00331 }
00332
00333 State* State::Pop()
00334 {
00335 if (stack.size() < 2)
00336 return 0;
00337 stack.pop_back();
00338 action = "P";
00339 return this;
00340 }
00341
00359 State* State::transition(Action action)
00360 {
00361 # ifdef DEBUG_1
00362 showStatus();
00363 cerr << "Action: " << action << endl;
00364 # endif
00365 switch (action[0]) {
00366 case 'S':
00367 if (input.empty())
00368 return this;
00369 return Shift();
00370 case 'R':
00371 if (stack.size() == 1) {
00372
00373
00374 return Shift();
00375 }
00376 return Right(action);
00377 case 'L':
00378 return Left(action);
00379 case 'r':
00380 return right(action);
00381 case 'l':
00382 return left(action);
00383 case 'D':
00384 return DepLink(action);
00385 case 'E':
00386 return Extract();
00387 case 'I':
00388 return Insert();
00389 case 'P':
00390 return Pop();
00391 }
00392 return 0;
00393 }
00394
00395
00396 void State::predicates(Features& preds, Action action)
00397 {
00398 preds.clear();
00399
00400 if (stack.empty()) {
00401 preds.push_back("(");
00402 if (CompositeActions)
00403 return;
00404 }
00405
00406 if (input.empty()) {
00407 preds.push_back(")");
00408 return;
00409 }
00410
00411
00412 tokenFeatures(preds);
00413
00414
00415 char feature[4096];
00416 if (extracted.size()) {
00417 Token* tok = extracted.back()->token;
00418 string const* lemma = tok->lemma();
00419 if (lemma && !lemma->empty()) {
00420 snprintf(feature, sizeof(feature), "EL%s", lemma->c_str());
00421 preds.push_back(feature);
00422 } else {
00423 snprintf(feature, sizeof(feature), "EW%s", tok->form.c_str());
00424 preds.push_back(feature);
00425 }
00426 string const* pos = tok->pos();
00427 if (pos && !pos->empty()) {
00428 snprintf(feature, sizeof(feature), "EP%s", pos->c_str());
00429 preds.push_back(feature);
00430 }
00431 }
00432
00433 Language const* lang = sentence.language;
00434
00435 if (MorphoAgreement && stack.size() > 1) {
00436 Token* top = stack.back()->token;
00437 Token* next = input.back()->token;
00438 if (!lang->morphoLeft(*top->pos()) &&
00439 !lang->morphoRight(*next->pos())) {
00440 if (top->morpho.number) {
00441 if (lang->numbAgree(top->morpho.number, next->morpho.number))
00442 preds.push_back("=N");
00443 }
00444 if (top->morpho.gender) {
00445 if (lang->gendAgree(top->morpho.gender, next->morpho.gender))
00446 preds.push_back("=G");
00447 }
00448
00449
00450 if (next->morpho.number && next->morpho.gender &&
00451 lang->numbAgree(top->morpho.number, next->morpho.number) &&
00452 lang->gendAgree(top->morpho.gender, next->morpho.gender)) {
00453 if (input.size() > 1) {
00454 Token* ahead = input[input.size()-2]->token;
00455 if (ahead->morpho.number && ahead->morpho.gender &&
00456 !lang->morphoRight(*ahead->pos()) &&
00457 (!lang->numbAgree(next->morpho.number, ahead->morpho.number) ||
00458 !lang->gendAgree(next->morpho.gender, ahead->morpho.gender)))
00459 preds.push_back("=NG!1");
00460
00461 if (input.size() > 2) {
00462 ahead = input[input.size()-3]->token;
00463 if (ahead->morpho.number && ahead->morpho.gender &&
00464 !lang->morphoRight(*ahead->pos()) &&
00465 (!lang->numbAgree(next->morpho.number, ahead->morpho.number) ||
00466 !lang->gendAgree(next->morpho.gender, ahead->morpho.gender)))
00467 preds.push_back("=NG!2");
00468 }
00469 }
00470 }
00471 }
00472 }
00473
00474
00475 if (StackSize && stack.size() > 2)
00476 preds.push_back("((");
00477 if (InputSize && input.size() > 1)
00478 preds.push_back("))");
00479 if (VerbCount) {
00480 int vc = 0;
00481 for (size_t i = 1; i < stack.size(); i++)
00482 if (stack[i]->token->isVerb(lang))
00483 vc++;
00484 if (vc) {
00485 snprintf(feature, sizeof(feature), "VC%d", vc);
00486 preds.push_back(feature);
00487 }
00488 }
00489
00490
00491 int id = input.back()->id;
00492 if (id > 1) {
00493
00494 if (InPunct && sentenceInfo->punctCount[id-2]%2)
00495 preds.push_back(".");
00496
00497 if (PunctCount && sentenceInfo->punctCount[id-2]) {
00498 snprintf(feature, sizeof(feature), ".%d", sentenceInfo->punctCount[id-2]);
00499 preds.push_back(feature);
00500 }
00501 }
00502 if (UseChildPunct) {
00503
00504
00505
00506 if (stack.size() > 1) {
00507 TreeToken* top = stack.back();
00508 FOR_EACH (vector<TreeToken*>, top->left, it) {
00509 if (ispunct.test((*it)->token->form)) {
00510 snprintf(feature, sizeof(feature), "1.<%s", (*it)->token->form.c_str());
00511 preds.push_back(feature);
00512 break;
00513 }
00514 }
00515 for (vector<TreeToken*>::reverse_iterator it = top->right.rbegin();
00516 it != top->right.rend(); it++) {
00517 if (ispunct.test((*it)->token->form)) {
00518 snprintf(feature, sizeof(feature), "1.>%s", (*it)->token->form.c_str());
00519 preds.push_back(feature);
00520 break;
00521 }
00522 }
00523 }
00524 if (input.size()) {
00525 TreeToken* next = input.back();
00526
00527 FOR_EACH (vector<TreeToken*>, next->left, it) {
00528 if (ispunct.test((*it)->token->form)) {
00529 snprintf(feature, sizeof(feature), ".<0%s", (*it)->token->form.c_str());
00530 preds.push_back(feature);
00531 break;
00532 }
00533 }
00534 for (vector<TreeToken*>::reverse_iterator it = next->right.rbegin();
00535 it != next->right.rend(); it++) {
00536 if (ispunct.test((*it)->token->form)) {
00537 snprintf(feature, sizeof(feature), ".>0%s", (*it)->token->form.c_str());
00538 preds.push_back(feature);
00539 break;
00540 }
00541 }
00542 }
00543 }
00544 bool oldVersion = *fileVersion == "1.1.2";
00545
00546 State const* s = this;
00547 for (int i = 0; i < PastActions && s; i++, s = s->previous) {
00548 if (s->action) {
00549 if (oldVersion)
00550 snprintf(feature, sizeof(feature), "A%d%s", i, s->action);
00551 else
00552 snprintf(feature, sizeof(feature), "a%d%s", i, s->action);
00553 preds.push_back(feature);
00554 }
00555 }
00556
00557 if (WordDistance && stack.size()) {
00558 int d = abs((int)input.back()->id - (int)stack.back()->id) - 1;
00559 snprintf(feature, sizeof(feature), "%d", min(d, 4));
00560 preds.push_back(feature);
00561 }
00562
00563
00564
00565 if (PrepChildEntityType)
00566 prepChildEntities(preds);
00567
00568 if (SecondOrder) {
00569
00570 size_t predNo = preds.size();
00571 for (unsigned i = 0; i < predNo; i++) {
00572 for (unsigned j = i+1; j < predNo; j++) {
00573
00574 string combo = (preds[i].compare(preds[j]) < 0) ?
00575 preds[i] + '#' + preds[j] : preds[j] + '#' + preds[i];
00576 preds.push_back(combo.c_str());
00577 }
00578 }
00579 }
00580
00581 if (!CompositeActions && this->action) {
00582
00583 switch (this->action[0]) {
00584 case 'R':
00585 case 'r': {
00586 TreeToken* next = input.back();
00587 string const* npos = next->token->pos();
00588 string const* nlpos = next->left.back()->token->pos();
00589 if (npos && nlpos) {
00590 snprintf(feature, sizeof(feature), "d%s%s", nlpos->c_str(), npos->c_str());
00591 preds.push_back(feature);
00592 }
00593 break;
00594 }
00595 case 'L':
00596 case 'l': {
00597 TreeToken* next = input.back();
00598 string const* npos = next->token->pos();
00599 string const* nrpos = next->right.back()->token->pos();
00600 if (npos && nrpos) {
00601 snprintf(feature, sizeof(feature), "D%s%s", nrpos->c_str(), npos->c_str());
00602 preds.push_back(feature);
00603 }
00604 break;
00605 }
00606 }
00607 }
00608 }
00609
00610
00611
00612 void State::tokenFeatures(Features& preds)
00613 {
00614 char feature[4096];
00615 Token* next = input.back()->token;
00616 set<TreeToken*> lexChildNonWordTokens;
00617
00618 int attrIndex = -1;
00619 bool oldVersion = *fileVersion == "1.1.2";
00620 FOR_EACH (FeatureSpecs, features.value, fit) {
00621 char const* attrName = fit->first;
00622 if (oldVersion)
00623 attrIndex = next->attrIndex(attrName);
00624 else
00625 attrIndex++;
00626 char featId = 'A' + attrIndex;
00627 FOR_EACH (set<TokenPath*>, fit->second, tit) {
00628
00629 TokenPath const& tp = **tit;
00630 TreeToken* tok;
00631 if (tp.root < 0) {
00632 if (-tp.root > (int)stack.size() - 1)
00633 continue;
00634 tok = stack[stack.size() + tp.root];
00635 } else {
00636 if (tp.root >= (int)input.size())
00637 continue;
00638 tok = input[input.size() - 1 - tp.root];
00639 }
00640 #ifdef PATHROOT_CHILDNONWORD
00641 if (LexChildNonWord &&
00642 lexChildNonWordTokens.find(tok) == lexChildNonWordTokens.end()) {
00643
00644 lexChildNonWordTokens.insert(tok);
00645 FOR_EACH (vector<TreeToken*>, tok->left, it) {
00646 if (nonWordAscii.test((*it)->token->form)) {
00647
00648 if (unambiguous)
00649 snprintf(feature, sizeof(feature), "/.%d", tp.root);
00650 else
00651 snprintf(feature, sizeof(feature), ".%d/", tp.root);
00652 preds.push_back(feature);
00653 break;
00654 }
00655 }
00656 for (vector<TreeToken*>::reverse_iterator it = tok->right.rbegin();
00657 it != tok->right.rend(); it++) {
00658 if (nonWordAscii.test((*it)->token->form)) {
00659
00660 if (unambiguous)
00661 snprintf(feature, sizeof(feature), "\\.%d", tp.root);
00662 else
00663 snprintf(feature, sizeof(feature), ".%d\\", tp.root);
00664 preds.push_back(feature);
00665 break;
00666 }
00667 }
00668 }
00669 #endif
00670 tok = tok->follow(tp, sentence);
00671 if (tok) {
00672 string const* item = tok->predicted(attrName);
00673 if (item && !item->empty()) {
00674
00675 if (unambiguous) {
00676
00677 if (tp.root < 0)
00678 snprintf(feature, sizeof(feature), "%s%d%c%s", tp.Code(), -tp.root, featId, item->c_str());
00679 else
00680 snprintf(feature, sizeof(feature), "%s%c%d%s", tp.Code(), featId, tp.root, item->c_str());
00681 } else {
00682 if (tp.root < 0)
00683 snprintf(feature, sizeof(feature), "%d%c%s%s", -tp.root, featId, tp.Code(), item->c_str());
00684 else
00685 snprintf(feature, sizeof(feature), "%c%d%s%s", featId, tp.root, tp.Code(), item->c_str());
00686 }
00687 preds.push_back(feature);
00688 }
00689 #ifndef PATHROOT_CHILDNONWORD
00690 if (LexChildNonWord &&
00691 lexChildNonWordTokens.find(tok) == lexChildNonWordTokens.end()) {
00692
00693 lexChildNonWordTokens.insert(tok);
00694 FOR_EACH (vector<TreeToken*>, tok->left, it) {
00695 if (nonWordAscii.test((*it)->token->form)) {
00696
00697 if (unambiguous)
00698 snprintf(feature, sizeof(feature), "/.%d", tp.root);
00699 else
00700 snprintf(feature, sizeof(feature), ".%d/", tp.root);
00701 preds.push_back(feature);
00702 break;
00703 }
00704 }
00705 for (vector<TreeToken*>::reverse_iterator it = tok->right.rbegin();
00706 it != tok->right.rend(); it++) {
00707 if (nonWordAscii.test((*it)->token->form)) {
00708
00709 if (unambiguous)
00710 snprintf(feature, sizeof(feature), "\\.%d", tp.root);
00711 else
00712 snprintf(feature, sizeof(feature), ".%d\\", tp.root);
00713 preds.push_back(feature);
00714 break;
00715 }
00716 }
00717 }
00718 #endif
00719 }
00720 }
00721 }
00722
00723
00724 FeatureSpecs splits = SplitFeature.value;
00725 FeatureSpecs::const_iterator split = splits.begin();
00726 if (split != splits.end()) {
00727 char const* attrName = split->first;
00728 FOR_EACH (set<TokenPath*>, split->second, tit) {
00729
00730 TokenPath const& tp = **tit;
00731 TreeToken* tok;
00732 if (tp.root < 0) {
00733 if (-tp.root > (int)stack.size() - 1)
00734 continue;
00735 tok = stack[stack.size() + tp.root];
00736 } else {
00737 if (tp.root >= (int)input.size())
00738 continue;
00739 tok = input[input.size() - 1 - tp.root];
00740 }
00741 tok = tok->follow(tp, sentence);
00742 if (tok) {
00743 string const* feat = tok->predicted(attrName);
00744 if (feat)
00745 splitFeature = *feat;
00746 else
00747 cerr << "Missing split feature" << endl;
00748 }
00749 }
00750 }
00751 }
00752
00753 void addComplementFeature(Token* tok, Language const* lang,
00754 GlobalInfo* info, Features& preds,
00755 char const* timePred, char const* locPred)
00756 {
00757 if (tok->isNoun(lang)) {
00758 string const* noun = tok->lemma();
00759 if (noun && !noun->empty()) {
00760 int tc = info->timeLemmas.count(*noun);
00761 int lc = info->locLemmas.count(*noun);
00762
00763 if (tc > info->freqRatio * lc)
00764 preds.push_back(timePred);
00765 if (lc > info->freqRatio * tc)
00766 preds.push_back(locPred);
00767 }
00768 }
00769 }
00770
00772 void State::prepChildEntities(Features& preds)
00773 {
00774
00775 Language const* lang = sentence.language;
00776 GlobalInfo* info = sentenceInfo->globalInfo;
00777 if (stack.size() > 1) {
00778 TreeToken* top = stack.back();
00779 addComplementFeature(top->token, lang, info, preds, "1TIME", "1LOC");
00780 }
00781
00782 TreeToken* next = input.back();
00783 addComplementFeature(next->token, lang, info, preds, "TIME0", "LOC0");
00784 }
00785
00786 void State::showStatus() {
00787 cerr << "Stack:" << endl;
00788 FOR_EACH (vector<TreeToken*>, stack, it)
00789 (*it)->print(cerr);
00790 cerr << "Next:" << endl;
00791 if (input.size())
00792 input.back()->print(cerr);
00793 }
00794
00795
00796
00797
00798 TrainState::TrainState(Sentence const& sent, GlobalInfo* info) :
00799 State(sent, info),
00800 annotated(sentence)
00801 {
00802
00803
00804
00805 size_t len = sentence.size();
00806 dependents.resize(len);
00807 FOR_EACH (Sentence, sentence, sit) {
00808 int head = (*sit)->linkHead();
00809 if (head)
00810 dependents[head-1]++;
00811 }
00812
00813 if (PrepChildEntityType)
00814 info->extract(sentence);
00815
00816
00817
00818 FOR_EACH (vector<TreeToken*>, input, sit) {
00819 (*sit)->linkHead(0);
00820 (*sit)->linkLabel("");
00821 }
00822 if (Parser::lexCutoff > 0)
00823 unambiguous = true;
00824 }
00825
00826 #define ORIG(tok) annotated[(tok)->id-1]
00827
00828
00829 #define NextToStackLink(n) \
00830 (stack.size() > (n) && \
00831 ORIG(stack[stack.size()-(n)])->linkHead() == next->id)
00832
00833
00834 #define StackToNextLink(n) \
00835 (stack.size() > (n) && \
00836 stack[stack.size()-(n)]->id == nextHead)
00837
00838 #define Resolved(tok) (!dependents[(tok)->id-1])
00839
00840 #define top2 stack[stack.size()-2]
00841 #define top3 stack[stack.size()-3]
00842 #define top4 stack[stack.size()-4]
00843
00849 Action TrainState::nextAction()
00850 {
00851 if (input.size() == 0)
00852 return 0;
00853 if (!CompositeActions && this->action && strchr("RrLl", this->action[0])) {
00854 TreeToken* next = input.back();
00855 switch (this->action[0]) {
00856 case 'R':
00857 case 'r':
00858 return mkAction("D", *next->left.back()->get("DEPREL"));
00859 case 'L':
00860 case 'l':
00861 return mkAction("D", *next->right.back()->get("DEPREL"));
00862 }
00863 return 0;
00864 }
00865 TreeToken* next = input.back();
00866 int nextHead = ORIG(next)->linkHead();
00867 string const& nextLabel = ORIG(next)->linkLabel();
00868 if (stack.empty())
00869 return "S";
00870 TreeToken* top = stack.back();
00871
00872 if (extracted.size() && nextHead == extracted.back()->id) {
00873
00874
00875 return "I";
00876 } else if (top->id && ORIG(top)->linkHead() == next->id) {
00877
00878
00879 if (input.size() > 1 &&
00880 ORIG(input[input.size()-2])->linkHead() == top->id) {
00881
00882
00883 return "S";
00884 } else if (top->id && !Resolved(top)) {
00885
00886 return "S";
00887 } else {
00888
00889 dependents[next->id - 1]--;
00890 return mkAction("R", ORIG(top)->linkLabel());
00891 }
00892 } else if (arcEager && stack.size() > 1 && Resolved(top)) {
00893 return "P";
00894 } else if (nextHead == top->id && Resolved(next)) {
00895
00896
00897
00898
00899 if (stack.size() > 1) {
00900 dependents[top->id - 1]--;
00901 }
00902 return mkAction("L", nextLabel);
00903 } else if (NextToStackLink(2) && Resolved(top2)) {
00904
00905
00906
00907 dependents[next->id - 1]--;
00908 return mkAction("r2", ORIG(top2)->linkLabel());
00909 } else if (NextToStackLink(3) && Resolved(top3)) {
00910
00911
00912
00913 dependents[next->id - 1]--;
00914 return mkAction("r3", ORIG(top3)->linkLabel());
00915 } else if (input.size() == 1 &&
00916
00917 NextToStackLink(4) && Resolved(top4)) {
00918
00919
00920
00921 dependents[next->id - 1]--;
00922 return mkAction("r4", ORIG(top4)->linkLabel());
00923 } else if (nextHead == top->id && !Resolved(next)) {
00924
00925
00926 return "S";
00927 } else if (StackToNextLink(2) && Resolved(next)) {
00928
00929
00930
00931
00932
00933 if (stack.size() == 2) {
00934
00935
00936
00937
00938
00939 } else {
00940 dependents[top2->id - 1]--;
00941
00942 }
00943 return mkAction("l2", nextLabel);
00944 } else if (StackToNextLink(3) && Resolved(next)) {
00945
00946
00947
00948 if (stack.size() > 3)
00949 dependents[top3->id - 1]--;
00950 return mkAction("l3", nextLabel);
00951 } else if (StackToNextLink(4) && Resolved(next)) {
00952
00953
00954
00955 if (stack.size() > 4)
00956 dependents[top4->id - 1]--;
00957 return mkAction("l4", nextLabel);
00958 }
00959 return "S";
00960 }
00961
00962 Event* TrainState::next()
00963 {
00964 Action action = nextAction();
00965 Event* ev = new Event(action);
00966 predicates(ev->features, action);
00967 return ev;
00968 }
00969
00970
00971
00972
00973 ParseState::ParseState(Sentence& sent, GlobalInfo* globalInfo, WordIndex& predIndex) :
00974 State(sent, globalInfo),
00975 predIndex(predIndex),
00976 lprob(0),
00977 refCount(0)
00978 {
00979
00980 FOR_EACH (vector<TreeToken*>, sentence, sit) {
00981 (*sit)->linkHead(0);
00982 (*sit)->linkLabel("");
00983 }
00984 }
00985
00986 ParseState::~ParseState() {
00987 assert(refCount == 0);
00988 if (previous) {
00989 ((ParseState*)previous)->decRef();
00990 for (size_t i = 0; i < sentence.size(); i++) {
00991 if (sentence[i] == previous->sentence[i])
00992 sentence[i] = 0;
00993 }
00994 }
00995 }
00996
01000 void ParseState::prune()
01001 {
01002 if (refCount == 0) {
01003 ParseState* p = (ParseState*)previous;
01004 delete this;
01005 if (p)
01006 p->prune();
01007 }
01008 }
01009
01010 bool ParseState::hasNext()
01011 {
01012 bool res = State::hasNext();
01013 if (!res) {
01014
01015 if (stack.size() > 2) {
01016
01017 # ifdef DEBUG
01018 cerr << "Multiple roots: " << stack.size() << endl;
01019 # endif
01020 Language const* lang = sentence.language;
01021
01022 int root = 0;
01023 int rootSize = 0;
01024 FOR_EACH (vector<TreeToken*>, stack, sit) {
01025 TreeToken* node = *sit;
01026 if (node->linkHead() == 0) {
01027 int size = node->size();
01028
01029 string const* tokPos = node->token->pos();
01030 if (size > rootSize && tokPos && lang->rootPos(*tokPos)) {
01031 root = node->id;
01032 rootSize = size;
01033 }
01034 }
01035 }
01036 if (root) {
01037
01038 TreeToken* rootNode = sentence[root-1];
01039 if (rootNode->linkLabel().empty())
01040 rootNode->linkLabel(lang->rootLabel());
01041 TO_EACH (vector<TreeToken*>, stack, sit) {
01042 TreeToken* node = *sit;
01043 if (node->linkHead() == 0 && node->id != root) {
01044 node->linkHead(root);
01045 if (node->linkLabel().empty())
01046 node->linkLabel(lang->rootLabel());
01047 }
01048 }
01049 }
01050 }
01051 }
01052 return res;
01053 }
01054
01055 Tanl::Classifier::Context* ParseState::next()
01056 {
01057 Features preds;
01058 predicates(preds);
01059
01060 context.clear();
01061 FOR_EACH (Features, preds, it) {
01062 string const& pred = *it;
01063 if (predIndex.find(pred.c_str()) != predIndex.end()) {
01064 context.add(predIndex[pred.c_str()]);
01065 } else {
01066
01067 size_t pathLen = strspn(pred.c_str(), TokenPath::dirCode);
01068
01069 if (pathLen + 2 < pred.size()) {
01070 string uf = pred.substr(0, pathLen+2) + "#UNKNOWN";
01071 if (predIndex.find(uf.c_str()) != predIndex.end())
01072 context.add(predIndex[uf.c_str()]);
01073 }
01074 }
01075 }
01076 return &context;
01077 }
01078
01079 ParseState* ParseState::transition(Action action)
01080 {
01081
01082 if (extracted.size() && input.size() &&
01083 (action[0] == 'S' || action[0] == 'L') &&
01084 ispunct.test(input.back()->token->form))
01085 action = "I";
01086 ParseState* next = new ParseState(*this);
01087 if (next->State::transition(action))
01088 return next;
01089 delete next;
01090 return 0;
01091 }
01092
01093 }