#include "SstFeatureExtractor.h"

// Standard headers used directly below; some may already be provided by
// SstFeatureExtractor.h.
#include <cctype>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <sstream>

#include "conf/conf_bool.h"
#include "conf/conf_string.h"

using namespace Tanl::Classifier;
using namespace Tanl::Text;

namespace Tanl {
namespace SST {

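// Configurable feature templates: each value is a space-separated list of
// token offsets (relative to the current position) at which the given
// attribute is extracted.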
IXE::conf<string> FORM("FORM", "0");
IXE::conf<string> POSTAG("POSTAG", "-2 -1 0 1 2");
IXE::conf<string> CPOSTAG("CPOSTAG", "-1 0");
IXE::conf<string> SSTAG("SSTAG", "");

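// Parse the space-separated list of offsets into the tokens vector.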
FeatureSpec::FeatureSpec(char const* name, char const* list) :
  name(name)
{
  stringstream ss(list);
  int n;
  while (ss >> n)
    tokens.push_back(n);
}

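// Build the default set of feature templates from the configuration values.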
FeatureSpecs::FeatureSpecs()
{
  push_back(FeatureSpec("FORM", FORM));
  push_back(FeatureSpec("POSTAG", POSTAG));
  push_back(FeatureSpec("CPOSTAG", CPOSTAG));
  push_back(FeatureSpec("SSTAG", SSTAG));
}

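// Names of word-shape (orthographic) features.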
static string const AllAlpha = "W1";
static string const AllDigits = "W2";
static string const CapPeriod = "initCapPeriod";
static string const SingleCap = "oneCap";
static string const SingleS = "W4";
static string const ContainsApo = "W5";
static string const ContainsDot = "W6";
static string const ContainsComma = "W7";
static string const DigitSlash = "digitSlash";
static string const ContainsPercent = "percent";
static string const DigitsDot = "digitPeriod";
static string const ContainsDollar = "dollar";
static string const ContainsDigit = "containDigit";
static string const TwoDigits = "twoDigit";
static string const FourDigits = "fourDigit";
static string const AllQuoting = "Wf";

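// Capitalization features, indexed by the zone (HL, AU, DL, TXT) passed
// to analyze().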
char const* AllUpper[4] = {"allCaps_HL", "allCaps_AU", "allCaps_DL",
                           "allCaps_TXT"};
char const* Capitalized[4] = {"initCaps_HL", "initCaps_AU", "initCaps_DL",
                              "initCaps_TXT"};

char const* firstWordCap[4] = {"firstWordInitCaps_HL",
                               "firstWordInitCaps_AU",
                               "firstWordInitCaps_DL",
                               "firstWordInitCaps_TXT"};

char const* firstWordLow[4] = {"firstWordNotInitCaps_HL",
                               "firstWordNotInitCaps_AU",
                               "firstWordNotInitCaps_DL",
                               "firstWordNotInitCaps_TXT"};

char const* MixedCase[4] = {"mixedCaps_HL", "mixedCaps_AU",
                            "mixedCaps_DL", "mixedCaps_TXT"};

string const HyphenCapCap = "U-U";
string const HyphenLowCap = "L-U";
string const HyphenCapLow = "U-L";

char const* prevCap[4] = {"initCapsPrev_HL", "initCapsPrev_AU",
                          "initCapsPrev_DL", "initCapsPrev_TXT"};

char const* nextCap[4] = {"initCapsNext_HL", "initCapsNext_AU",
                          "initCapsNext_DL", "initCapsNext_TXT"};

string const seqCap = "I";
string const notSeqCap = "NI";
string const CapNext = "initCaps_Next";
string const LowNext = "nonInitCaps_Next";
string const CapNoNext = "initCaps__Next";
string const LowNoNext = "nonInitCaps__Next";
string const CapPrev = "initCaps_Prev";
string const LowPrev = "nonInitCaps_Prev";
string const CapNoPrev = "initCaps__Prev";
string const LowNoPrev = "nonInitCaps__Prev";

string const Cap = "initCaps_";
string const Low = "nonInitCaps_";

string const withinQuotes = "quote";
string const rare = "rare";

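// Features fired when the token was assigned the given supersense class
// earlier in the document (see designated() and the prevClass word sets).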
char const* prevTags[] = {
  "NCPrevOccur-adj.all", "NCPrevOccur-adj.pert", "NCPrevOccur-adj.ppl",
  "NCPrevOccur-adv.all", "NCPrevOccur-noun.Tops", "NCPrevOccur-noun.act",
  "NCPrevOccur-noun.animal", "NCPrevOccur-noun.artifact",
  "NCPrevOccur-noun.attribute", "NCPrevOccur-noun.body",
  "NCPrevOccur-noun.cognition", "NCPrevOccur-noun.communication",
  "NCPrevOccur-noun.event", "NCPrevOccur-noun.feeling",
  "NCPrevOccur-noun.food", "NCPrevOccur-noun.group",
  "NCPrevOccur-noun.location", "NCPrevOccur-noun.motive",
  "NCPrevOccur-noun.object", "NCPrevOccur-noun.other",
  "NCPrevOccur-noun.person", "NCPrevOccur-noun.phenomenon",
  "NCPrevOccur-noun.plant", "NCPrevOccur-noun.possession",
  "NCPrevOccur-noun.process", "NCPrevOccur-noun.quantity",
  "NCPrevOccur-noun.relation", "NCPrevOccur-noun.shape",
  "NCPrevOccur-noun.state", "NCPrevOccur-noun.substance",
  "NCPrevOccur-noun.time", "NCPrevOccur-verb.body",
  "NCPrevOccur-verb.change", "NCPrevOccur-verb.cognition",
  "NCPrevOccur-verb.communication", "NCPrevOccur-verb.competition",
  "NCPrevOccur-verb.consumption", "NCPrevOccur-verb.contact",
  "NCPrevOccur-verb.creation", "NCPrevOccur-verb.emotion",
  "NCPrevOccur-verb.motion", "NCPrevOccur-verb.perception",
  "NCPrevOccur-verb.possession", "NCPrevOccur-verb.social",
  "NCPrevOccur-verb.stative", "NCPrevOccur-verb.weather"
};

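// Parallel feature names used with the otherLast word sets.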
char const* otherLastTags[] = {
  "OtherNCS-adj.all", "OtherNCS-adj.pert", "OtherNCS-adj.ppl",
  "OtherNCS-adv.all", "OtherNCS-noun.Tops", "OtherNCS-noun.act",
  "OtherNCS-noun.animal", "OtherNCS-noun.artifact", "OtherNCS-noun.attribute",
  "OtherNCS-noun.body", "OtherNCS-noun.cognition",
  "OtherNCS-noun.communication", "OtherNCS-noun.event",
  "OtherNCS-noun.feeling", "OtherNCS-noun.food", "OtherNCS-noun.group",
  "OtherNCS-noun.location", "OtherNCS-noun.motive", "OtherNCS-noun.object",
  "OtherNCS-noun.other", "OtherNCS-noun.person", "OtherNCS-noun.phenomenon",
  "OtherNCS-noun.plant", "OtherNCS-noun.possession", "OtherNCS-noun.process",
  "OtherNCS-noun.quantity", "OtherNCS-noun.relation", "OtherNCS-noun.shape",
  "OtherNCS-noun.state", "OtherNCS-noun.substance", "OtherNCS-noun.time",
  "OtherNCS-verb.body", "OtherNCS-verb.change", "OtherNCS-verb.cognition",
  "OtherNCS-verb.communication", "OtherNCS-verb.competition",
  "OtherNCS-verb.consumption", "OtherNCS-verb.contact",
  "OtherNCS-verb.creation", "OtherNCS-verb.emotion", "OtherNCS-verb.motion",
  "OtherNCS-verb.perception", "OtherNCS-verb.possession",
  "OtherNCS-verb.social", "OtherNCS-verb.stative", "OtherNCS-verb.weather"
};

string const otherCapitalized = "otherInitCap";
string const otherNotCapitalized = "otherNotInitCap";

string const Acronym = "acronym";
string const AcronymBegin = "cI";
string const AcronymContinue = "cJ";
string const AcronymEnd = "cK";

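// Orthographic predicates on the raw token string.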
bool allUpper(char const* s)
{
  for (char const* scan = s; *scan; scan++)
    if (!isupper(*scan))
      return false;
  return true;
}

Text::RegExp::Pattern patMix("^[a-z]+[A-Z]([a-zA-Z])*$");

bool mixedCase(char const* s)
{
  return patMix.test(s);
}

bool noLetter(char const* s)
{
  for (char const* scan = s; *scan; scan++)
    if (isalpha(*scan))
      return false;
  return true;
}

bool containsDigit(char const* s)
{
  return ::strpbrk(s, "0123456789") != 0;
}

bool allDigits(char const* s)
{
  return ::strspn(s, "0123456789") == ::strlen(s);
}

bool digitSlash(char const* s)
{
  return containsDigit(s) && strchr(s, '/');
}

bool allQuotes(char const* s)
{
  return ::strspn(s, "\"") == ::strlen(s);
}

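// Supersense class labels (WordNet lexicographer file names).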
char const* Resources::classNames[] = {
  "adj.all", "adj.pert", "adj.ppl", "adv.all", "noun.Tops", "noun.act",
  "noun.animal", "noun.artifact", "noun.attribute", "noun.body",
  "noun.cognition", "noun.communication", "noun.event", "noun.feeling",
  "noun.food", "noun.group", "noun.location", "noun.motive", "noun.object",
  "noun.other", "noun.person", "noun.phenomenon", "noun.plant",
  "noun.possession", "noun.process", "noun.quantity", "noun.relation",
  "noun.shape", "noun.state", "noun.substance", "noun.time", "verb.body",
  "verb.change", "verb.cognition", "verb.communication", "verb.competition",
  "verb.consumption", "verb.contact", "verb.creation", "verb.emotion",
  "verb.motion", "verb.perception", "verb.possession", "verb.social",
  "verb.stative", "verb.weather"
};

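// Load the resources for the given locale from resourceDir.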
Resources::Resources(string& resourceDir, string& locale) :
  language(locale.c_str())
{
  load(resourceDir);
}

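// Assign a numeric id to each supersense class name.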
void Resources::load(string& resourceDir)
{
  for (int i = 0; i < sizeof(classNames)/sizeof(char*); i++)
    classId[classNames[i]] = i;
}

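// Read a word list: each line holds a class name followed by a word or
// phrase, which is added to the set for that class.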
template<class WordSet>
void Resources::load(WordSet* sets, char const* file)
{
  ifstream ifs(file);
  char line[MAX_LINE_LEN];

  while (ifs.getline(line, MAX_LINE_LEN)) {
    char* next = 0;
    char* className = strtok_r(line, " \t", &next);
    if (!className)
      continue;
    char* word = next + strspn(next, " \t");
    if (strlen(word) == 0)
      continue;

    int cid = classId[className];
    sets[cid].insert(word);
  }
}

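// Digit sequences are normalized to +NUM+ when word forms are used inside
// context features.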
Text::RegExp::Pattern number("\\d+");
string numRep("+NUM+");

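// The extractor keeps a reference to the shared resources and a token
// categorizer for the resource language.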
SstFeatureExtractor::SstFeatureExtractor(Resources& resources) :
  resources(resources),
  tokenCategorizer(resources.language),
  insideQuotes(false)
{ }

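// Extract the classifier features for the token at the given position of the
// current sentence (set by analyze()).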
void SstFeatureExtractor::extract(Classifier::Features& features, const int& position)
{
  Token* token = (*sentence)[position];
  char const* word = token->form.c_str();
  char const* prevWord = (position > 0) ? (*sentence)[position-1]->form.c_str() : 0;
  char const* nextWord = (position+1 < sentence->size()) ? (*sentence)[position+1]->form.c_str() : 0;

  size_t len = strlen(word);
  size_t sentLen = sentence->size();

  // First word of the sentence and capitalized.
  if (!prevWord && isupper(word[0]))
    features.add(firstWordCap[zone]);

  // Capitalized, but not the first word.
  if (prevWord && isupper(word[0]))
    features.add(Capitalized[zone]);

  // First word of the sentence, not capitalized.
  if (!prevWord && !isupper(word[0]))
    features.add(firstWordLow[zone]);

  if (allUpper(word)) {
    features.add(AllUpper[zone]);
    if (acronyms.contains(word))
      features.add(Acronym);
  } else if (mixedCase(word))
    features.add(MixedCase[zone]);

  // Capitalized and ending with a period.
  if (isupper(word[0]) && word[len-1] == '.')
    features.add(CapPeriod);

  // Single capital letter.
  if (len == 1 && isupper(word[0]))
    features.add(SingleCap);

  if (containsDigit(word))
    features.add(ContainsDigit);

  if (allDigits(word)) {
    if (len == 2)
      features.add(TwoDigits);
    else if (len == 4)
      features.add(FourDigits);
  }

  if (digitSlash(word))
    features.add(DigitSlash);

  if (strchr(word, '$'))
    features.add(ContainsDollar);

  if (strchr(word, '%'))
    features.add(ContainsPercent);

  if (strpbrk(word, "1234567890") && strchr(word, '.'))
    features.add(DigitsDot);

  // Normalized (uppercased, digits replaced) forms of the adjacent words.
  string upNext(nextWord ? nextWord : "");
  to_upper(upNext);
  upNext = number.replace(upNext, numRep, true);
  string upPrev(prevWord ? prevWord : "");
  to_upper(upPrev);
  upPrev = number.replace(upPrev, numRep, true);

  // Capitalization of the current word combined with the following word.
  if (nextWord) {
    if (isupper(word[0]))
      features.add(Cap + upNext + "Next");
    else
      features.add(Low + upNext + "Next");
  } else if (isupper(word[0]))
    features.add(CapNoNext);
  else
    features.add(LowNoNext);

  // Capitalization of the current word combined with the preceding word.
  if (prevWord) {
    if (isupper(word[0]))
      features.add(Cap + upPrev + "Prev");
    else
      features.add(Low + upPrev + "Prev");
  } else if (isupper(word[0]))
    features.add(CapNoPrev);
  else
    features.add(LowNoPrev);

  if (nextWord && isupper(nextWord[0]))
    features.add(nextCap[zone]);

  if (prevWord && isupper(prevWord[0]))
    features.add(prevCap[zone]);

  // Sequence of three capitalized words centered on the current one.
  if (prevWord && nextWord &&
      isupper(prevWord[0]) && isupper(word[0]) && isupper(nextWord[0]))
    features.add(seqCap);
  else
    features.add(notSeqCap);

  // Hyphenated words: combine the capitalization of the two parts.
  if (strchr(word, '-')) {
    char copy[800];
    strncpy(copy, word, sizeof(copy));
    copy[sizeof(copy)-1] = '\0';        // ensure termination for long tokens
    char* next = 0;
    char* word1 = strtok_r(copy, "-", &next);
    char* word2 = strtok_r(0, "-", &next);
    if (word1 && word2) {
      bool up1 = isupper(word1[0]);
      bool up2 = isupper(word2[0]);
      if (next[0] == '\0') {            // no third hyphen-separated part
        if (up1 && up2)
          features.add(HyphenCapCap);
        else {
          to_upper(word1);
          to_upper(word2);
          if (up1) {
            features.add(HyphenCapLow);
            features.add(string("U-") + word1);
            features.add(string("L_") + word2);
          } else if (up2) {
            features.add(HyphenLowCap);
            features.add(string("L-") + word1);
            features.add(string("U_") + word2);
          }
        }
      }
    }
  }

  // Normalized form of the current word.
  string upper(word);
  to_upper(upper);
  upper = number.replace(upper, numRep, true);

  // Attribute features from the configured templates: for each attribute and
  // each offset, emit a feature combining the attribute id, the offset and
  // the attribute value of the token at that offset.
  char feature[1024];
  FOR_EACH (FeatureSpecs, featureSpecs, fit) {
    char const* attrName = fit->name;
    int attrIndex = token->attrIndex(attrName);
    char featId = 'A' + attrIndex;
    FOR_EACH (vector<int>, fit->tokens, tit) {
      int dx = *tit;
      int idx = position + dx;
      Token* tok = (0 <= idx && idx < (int)sentLen) ? (*sentence)[idx] : 0;
      if (tok) {
        string const* item = tok->get(attrName);
        if (item && !item->empty()) {
          if (dx < 0)
            snprintf(feature, sizeof(feature), "%d%c%s", -dx, featId, item->c_str());
          else
            snprintf(feature, sizeof(feature), "%c%d%s", featId, dx, item->c_str());
          features.add(feature);
        }
      }
    }
  }

  // Whether the same (normalized) word occurred capitalized or not elsewhere
  // in the document.
  unordered_map<string, bool>::iterator it = capitalized.find(upper);
  if (it != capitalized.end()) {
    if (it->second)
      features.add(otherCapitalized);
    else
      features.add(otherNotCapitalized);
  } else if (position > 0)
    capitalized[upper] = isupper(word[0]);

  // Collect sequences of capitalized initials as candidate acronyms.
  if (nextWord && isupper(word[0])) {
    string acronym;
    for (int i = position; i < (int)sentLen; i++) {
      char c = (*sentence)[i]->form[0];
      if (isupper(c))
        acronym.push_back(c);
      else
        break;
    }
    if (acronym.size() > 1)
      acronyms.insert(acronym.c_str());
  }

  // Class-memory features: the word appears in the otherLast set or was
  // previously assigned one of the supersense classes.
  for (int i = 0; i < Resources::nClasses; i++) {
    if (otherLast[i].contains(word))
      features.add(otherLastTags[i]);
  }

  for (int i = 0; i < Resources::nClasses; i++) {
    if (prevClass[i].contains(word))
      features.add(prevTags[i]);
  }

  // Token surrounded by quotes or brackets.
  if (prevWord && nextWord &&
      strpbrk(prevWord, "\"()[]{}'<>") &&
      strpbrk(nextWord, "\"()[]{}'<>"))
    features.add(withinQuotes);

  if (allQuotes(word))
    insideQuotes = !insideQuotes;
}

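// Prepare for feature extraction on a sentence: remember the sentence and
// its zone, and categorize each token.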
void SstFeatureExtractor::analyze(Sentence* sent, int zone)
{
  sentence = sent;
  this->zone = zone;
  int len = sentence->size();
  tokenTypes.resize(len);
  insideQuotes = false;
  int i = 0;
  for (Sentence::const_iterator sit = sentence->begin();
       sit != sentence->end(); ++sit, ++i) {
    Token* tok = *sit;
    tokenTypes[i] = tokenCategorizer.analyze(tok->form.c_str());
  }
}

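// Clear the per-document state (word sets, acronyms, capitalization memory).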
void SstFeatureExtractor::reset()
{
  insideQuotes = false;

  for (int i = 0; i < Resources::nClasses; ++i) {
    prevClass[i].clear();
    otherLast[i].clear();
  }
  acronyms.clear();
  capitalized.clear();
}

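// Record the class assigned to the token at position and store it in the
// token's SSTAG attribute.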
void SstFeatureExtractor::classified(int position, char const* className)
{
  Token* token = (*sentence)[position];
  char const* word = token->form.c_str();
  designated(word, className);

  token->set("SSTAG", className);
}

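// Remember that the word received the given supersense tag: the two leading
// characters of the tag (its segment prefix) are skipped when matching the
// class name, and the word is added to the corresponding prevClass set.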
void SstFeatureExtractor::designated(char const* word, char const* ssTag)
{
  if (strlen(ssTag) < 2)
    return;
  for (int i = 0; i < Resources::nClasses; ++i) {
    if (!strcmp(ssTag+2, resources.classNames[i]))
      prevClass[i].insert(word);
  }
}

} // namespace SST
} // namespace Tanl