00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "NerFeatureExtractor.h"
00026
00027
00028 #include <sstream>
00029
00030
00031 #include "conf/conf_bool.h"
00032 #include "conf/conf_string.h"
00033
00034 using namespace Tanl::Classifier;
00035 using namespace Tanl::Text;
00036
00037 namespace Tanl {
00038 namespace NER {
00039
00040
// Configuration option "OldIOB": when true, analyze() converts incoming
// NE tags from the IOB1 scheme to IOB2 (an entity-initial I- tag whose
// predecessor is O or absent is rewritten as B-).
IXE::conf<bool> oldIOB("OldIOB", true);

// Configuration option "Refine": when true, analyze() refines IOB2 tags
// into IOBEU (U- for single-token entities, E- for entity-final tokens).
IXE::conf<bool> refine("Refine", true);
00044
00045
00046
// Per-class "UNI-<class>" feature names; filled by Resources::resize().
static vector<string> dictTags;
// Gazetteer feature labels. NOTE(review): none of these is referenced in
// the visible code of this file — presumably emitted elsewhere for
// money/names/time dictionary hits; confirm before removing.
static string const Money = "D1";
static string const Name = "D2";
static string const Time = "D4";
static string const PrevName = "D5";
static string const NextPerson = "D6";
00056
00057
00058
00059
00060
00061
00062
// Printable names of token/entity types, indexed in typeName() by
// EntityType::type or EntityType::Subtype.
// NOTE(review): "TIME" appears twice (indices 4 and 17) — confirm this
// table mirrors the EntityType enumeration exactly.
char const* TokenTypes[] = {
  "UNKNOWN",
  "WORD",
  "QUANTITY",
  "NAME",
  "TIME",
  "SYMBOL",
  "PUNCT",
  "ADDRESS",
  "PERSON",
  "ORGANIZATION",
  "LOCATION",
  "PRODUCT",
  "NUMBER",
  "MONEY",
  "MEASURE",
  "DURATION",
  "DATE",
  "TIME",
  "URL",
  "EMAIL"
};
00085
00086
00087
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
// Capitalized word, not sentence-initial; indexed by document zone.
// The suffixes suggest zones HL/AU/DL/TXT (presumably headline, author,
// dateline, running text — confirm against the caller of analyze()).
char const* Capitalized[4] = {"initCaps_HL", "initCaps_AU", "initCaps_DL",
                              "initCaps_TXT"};

// Sentence-initial capitalized word, per zone.
char const* firstWordCap[4] = {"firstWordInitCaps_HL",
                               "firstWordInitCaps_AU",
                               "firstWordInitCaps_DL",
                               "firstWordInitCaps_TXT"};

// Sentence-initial non-capitalized word, per zone.
char const* firstWordLow[4] = {"firstWordNotInitCaps_HL",
                               "firstWordNotInitCaps_AU",
                               "firstWordNotInitCaps_DL",
                               "firstWordNotInitCaps_TXT"};

// Shape of two-part hyphenated words (U = capitalized, L = lowercase).
string const HyphenCapCap = "U-U";
string const HyphenLowCap = "L-U";
string const HyphenCapLow = "U-L";

// Token inside (seqCap) or outside (notSeqCap) a run of three
// capitalized words.
string const seqCap = "I";
string const notSeqCap = "NI";
// Capitalization when the token has no next/previous word.
string const CapNoNext = "initCaps__Next";
string const LowNoNext = "nonInitCaps__Next";
string const CapNoPrev = "initCaps__Prev";
string const LowNoPrev = "nonInitCaps__Prev";

// Prefixes of capitalization features conjoined with a context word.
string const Cap = "initCaps_";
string const Low = "nonInitCaps_";

// Token occurs within quotes / is not in the frequent-word list.
string const withinQuotes = "quote";
string const rare = "rare";

// Per-class feature-name caches, all filled by Resources::resize().
// Preceding-bigram features.
vector<string> bigramTags;

// Class-specific prefix features.
vector<string> prefixTags;

// Class-specific suffix features.
vector<string> suffixTags;

// Word seen as the first word of an entity of the class.
vector<string> firstWordTags;

// Word seen as the last word of an entity of the class.
vector<string> lastWordTags;

// Lowercase-sequence features (not referenced in this file's visible
// code — presumably used elsewhere; confirm).
vector<string> lowerSeqTags;

// Lowercase word internal to an entity of the class.
vector<string> lowerIntermTags;

// Word previously classified as belonging to the class.
vector<string> prevTags;

// Word previously seen starting an entity of the class.
vector<string> otherFirstTags;

// Word previously seen ending an entity of the class.
vector<string> otherLastTags;

// Capitalization of this word at its first occurrence in the document.
string const otherCapitalized = "otherInitCap";

string const otherNotCapitalized = "otherNotInitCap";

// Acronym features. NOTE(review): AcronymBegin/Continue/End are not
// referenced in this file's visible code — confirm they are used
// elsewhere before removing.
string const Acronym = "acronym";
string const AcronymBegin = "cI";
string const AcronymContinue = "cJ";
string const AcronymEnd = "cK";
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
/**
 *  @return true iff every character of @a s is an uppercase letter
 *  (vacuously true for the empty string).
 *
 *  The argument to isupper() is cast to unsigned char: passing a plain
 *  (possibly negative) char for bytes >= 0x80 is undefined behavior.
 */
bool allUpper(char const* s)
{
  for (char const* scan = s; *scan; ++scan)
    if (!isupper((unsigned char)*scan))
      return false;
  return true;
}
00223
/**
 *  @return true iff @a s consists solely of double-quote characters
 *  (vacuously true for the empty string).
 */
bool allQuotes(char const* s)
{
  for (char const* p = s; *p; ++p)
    if (*p != '"')
      return false;
  return true;
}
00228
00229
00230
// Configured set of entity class names (configuration variable "Classes").
IXE::conf_set<std::string> Resources::entityTypes("Classes");
00232
00233 void Resources::resize()
00234 {
00235 size_t nClasses = entityTypes->size();
00236 dict.resize(nClasses);
00237 designators.resize(nClasses);
00238 preBigrams.resize(nClasses);
00239 prefixes.resize(nClasses);
00240 suffixes.resize(nClasses);
00241 firstWords.resize(nClasses);
00242 lastWords.resize(nClasses);
00243 lowerInterm.resize(nClasses);
00244 dictTags.resize(nClasses);
00245 bigramTags.resize(nClasses);
00246 prefixTags.resize(nClasses);
00247 suffixTags.resize(nClasses);
00248 firstWordTags.resize(nClasses);
00249 lastWordTags.resize(nClasses);
00250 lowerSeqTags.resize(nClasses);
00251 lowerIntermTags.resize(nClasses);
00252 prevTags.resize(nClasses);
00253 otherFirstTags.resize(nClasses);
00254 otherLastTags.resize(nClasses);
00255 int i = 0;
00256 FOR_EACH (set<string>, *entityTypes, it) {
00257 dictTags[i] = "UNI-" + *it;
00258 bigramTags[i] = "BI-" + *it;
00259 prefixTags[i] = "Pref-" + *it;
00260 suffixTags[i] = "Suf-" + *it;
00261 firstWordTags[i] = "FW-" + *it;
00262 lastWordTags[i] = "LW-" + *it;
00263 lowerSeqTags[i] = "lS" + *it;
00264 lowerIntermTags[i] = "lI" + *it;
00265 prevTags[i] = "PrevOccur-" + *it;
00266 otherFirstTags[i] = "OtherFirst-" + *it;
00267 otherLastTags[i] = "OtherLast-" + *it;
00268 i++;
00269 }
00270 }
00271
/**
 *  Build the resources for the given @a locale, loading every word list
 *  from @a resourceDir. @a POStag and @a NEtag name the token attributes
 *  that hold the part-of-speech and named-entity tags.
 */
Resources::Resources(string& resourceDir, string& locale,
                     char const* POStag, char const* NEtag) :
  language(locale.c_str()),
  POStag(POStag),
  NEtag(NEtag)
{
  load(resourceDir);
}
00283
/**
 *  Load all resource word lists from @a resourceDir.
 */
void Resources::load(string& resourceDir)
{
  // Allocate per-class containers and feature-name caches.
  resize();
  // Map each configured class name to a dense class id, in set order
  // (the same order resize() uses to build the feature labels).
  int i = 0;
  FOR_EACH (std::set<std::string>, *entityTypes, it)
    classId[it->c_str()] = i++;

  // Per-class dictionary (file lines: "<class> <entry>").
  load(dict, (resourceDir + "/Dict.lst").c_str());
  // Flat gazetteers.
  moneyDict.load((resourceDir + "/money.lst").c_str());
  namesDict.load((resourceDir + "/names.lst").c_str());
  timeDict.load((resourceDir + "/time.lst").c_str());

  // Frequent-word list: in extract(), words NOT in it get the "rare"
  // feature.
  FWL.load((resourceDir + "/FrequentWords.lst").c_str());

  // Per-class lists consumed by the feature extractor.
  load(designators, (resourceDir + "/Designators.lst").c_str());

  load(preBigrams, (resourceDir + "/Bigrams.lst").c_str());

  load(prefixes, (resourceDir + "/Prefix.lst").c_str());

  load(suffixes, (resourceDir + "/Suffix.lst").c_str());

  load(firstWords, (resourceDir + "/First.lst").c_str());

  load(lastWords, (resourceDir + "/Last.lst").c_str());

  load(lowerInterm, (resourceDir + "/LowerIn.lst").c_str());
}
00322
00323 template<class WordSet>
00324 void Resources::load(vector<WordSet>& sets, char const* file)
00325 {
00326 ifstream ifs(file);
00327 char line[MAX_LINE_LEN];
00328
00329 while (ifs.getline(line, MAX_LINE_LEN)) {
00330
00331 char* next = line;
00332 char* className = strtok_r(0, " \t", &next);
00333 if (!className)
00334 continue;
00335 char* word = next + strspn(next, " \t");
00336 if (strlen(word) == 0)
00337 continue;
00338
00339 int cid = classId[className];
00340 sets[cid].insert(word);
00341 }
00342 }
00343
00344 char const* Resources::typeName(EntityType et)
00345 {
00346 if (et.subtype != (EntityType::Subtype)EntityType::NoType)
00347 return TokenTypes[et.subtype];
00348 else if (et.type != EntityType::NoType)
00349 return TokenTypes[et.type];
00350 else
00351 return 0;
00352 }
00353
// Matches digit runs; used to replace numbers with +NUM+ so features
// generalize over specific numeric values.
Text::RegExp::Pattern number("\\d+");
string numRep("+NUM+");
00356
00357
00358
/**
 *  Bind the extractor to its @a resources and size the per-class
 *  document-level caches of previously seen/classified words.
 */
NerFeatureExtractor::NerFeatureExtractor(Resources& resources) :
  resources(resources),
  tokenCategorizer(resources.language),
  insideQuotes(false)
{
  prevClass.resize(resources.typesCount());
  otherFirst.resize(resources.typesCount());
  otherLast.resize(resources.typesCount());
}
00368
00369 void NerFeatureExtractor::extract(Classifier::Features& features, const int& position)
00370 {
00371 Token* token = (*sentence)[position];
00372 char const* word = token->form.c_str();
00373 char const* prevWord = (position > 0) ? (*sentence)[position-1]->form.c_str() : 0;
00374 char const* nextWord = (position+1 < sentence->size()) ? (*sentence)[position+1]->form.c_str() : 0;
00375
00376
00377
00378
00379 size_t len = strlen(word);
00380 size_t sentLen = sentence->size();
00381
00382
00383 if (!prevWord && isupper(word[0]))
00384 features.add(firstWordCap[zone]);
00385
00386
00387 if (prevWord && isupper(word[0]))
00388 features.add(Capitalized[zone]);
00389
00390
00391 if (!prevWord && !isupper(word[0]))
00392 features.add(firstWordLow[zone]);
00393
00394
00395 if (allUpper(word)) {
00396 if (acronyms.contains(word))
00397 features.add(Acronym);
00398 }
00399
00400
00401
00402 string upNext(nextWord ? nextWord : "");
00403 to_upper(upNext);
00404 number.replace(upNext, numRep, true);
00405 string upPrev(prevWord ? prevWord : "");
00406 to_upper(upPrev);
00407 number.replace(upPrev, numRep, true);
00408
00409 string nextPos(nextWord ? (*sentence)[position+1]->get(resources.POStag)->c_str() : "");
00410 string prevPos(position > 0 ? (*sentence)[position-1]->get(resources.POStag)->c_str() : "");
00411
00412
00413 if (nextWord) {
00414 if (isupper(word[0]))
00415 features.add(Cap + upNext + "Next");
00416
00417 else
00418 features.add(Low + upNext + "Next");
00419
00420 } else if (isupper(word[0]))
00421 features.add(CapNoNext);
00422 else
00423 features.add(LowNoNext);
00424
00425
00426 if (prevWord) {
00427 if (isupper(word[0]))
00428 features.add(Cap + upPrev + "Prev");
00429
00430 else
00431 features.add(Low + upPrev + "Prev");
00432
00433 } else if (isupper(word[0]))
00434 features.add(CapNoPrev);
00435 else
00436 features.add(LowNoPrev);
00437
00438
00439 if (prevWord && nextWord &&
00440 isupper(prevWord[0]) && isupper(word[0]) && isupper(nextWord[0]))
00441 features.add(seqCap);
00442 else
00443 features.add(notSeqCap);
00444
00445
00446 if (strchr(word, '-')) {
00447 char copy[800];
00448 strncpy(copy, word, sizeof(copy));
00449 char* next = copy;
00450 char* word1 = strtok_r(0, "-", &next);
00451 char* word2 = strtok_r(0, "-", &next);
00452 if (word1 && word2) {
00453 bool up1 = isupper(word1[0]);
00454 bool up2 = isupper(word2[0]);
00455 if (next[0] == '\0') {
00456 if (up1 && up2)
00457 features.add(HyphenCapCap);
00458 else {
00459 to_upper(word1);
00460 string upper1(word1);
00461 number.replace(upper1, numRep, true);
00462 to_upper(word2);
00463 string upper2(word2);
00464 number.replace(upper2, numRep, true);
00465 if (up1) {
00466 features.add(HyphenCapLow);
00467 features.add(string("U-") + upper1);
00468 features.add(string("L_") + upper2);
00469 } else if (up2) {
00470 features.add(HyphenLowCap);
00471 features.add(string("L-") + upper1);
00472 features.add(string("U_") + upper2);
00473 }
00474 }
00475 }
00476 }
00477 }
00478
00479 string upper(word);
00480 to_upper(upper);
00481 number.replace(upper, numRep, true);
00482
00483
00484 if (!resources.FWL.contains(upper))
00485 features.add(rare);
00486
00487 size_t nClasses = resources.typesCount();
00488
00489 if (position > 1) {
00490 string bigram = (*sentence)[position-2]->form + '_' + prevWord;
00491 to_upper(bigram);
00492 for (int i = 0; i < nClasses; ++i) {
00493 if (resources.preBigrams[i].contains(bigram))
00494 features.add(bigramTags[i]);
00495 }
00496 }
00497
00498 for (int i = 0; i < nClasses; ++i) {
00499 if (resources.firstWords[i].contains(word))
00500 features.add(firstWordTags[i]);
00501 }
00502
00503 #ifdef EXTRA
00504 for (int i = 0; i < nClasses; ++i) {
00505 if (resources.lastWords[i].contains(word))
00506 features.add(lastWordTags[i]);
00507 }
00508
00509 for (int i = 0; i < nClasses; ++i) {
00510 if (resources.lowerInterm[i].contains(word))
00511 features.add(lowerIntermTags[i]);
00512 }
00513 #endif
00514
00515
00516
00517
00518 if (position > 0) {
00519 for (int i = 0; i < nClasses; ++i) {
00520 if (resources.dict[i].contains(word))
00521 features.add(dictTags[i]);
00522 }
00523 }
00524
00525
00526 char feature[256];
00527 int featIndex = 0;
00528
00529 FOR_EACH (FeatureSpecs, *morphFeatureSpecs, fit) {
00530 string const& attrName = (*fit)->name;
00531 Text::RegExp::Pattern const& p = (*fit)->pattern;
00532 char featId = 'A' + featIndex++;
00533 FOR_EACH (vector<int>, (*fit)->tokens, tit) {
00534
00535 int dx = *tit;
00536 int idx = position + dx;
00537 Token* tok = (0 <= idx && idx < sentLen) ? (*sentence)[idx] : 0;
00538 if (tok) {
00539 string const* item = tok->get(attrName.c_str());
00540 if (item && !item->empty() && p.test(*item)) {
00541
00542 if (dx < 0)
00543 snprintf(feature, sizeof(feature), "%dM%c", -dx, featId);
00544 else
00545 snprintf(feature, sizeof(feature), "M%c%d", featId, dx);
00546 features.add(feature);
00547 }
00548 }
00549 }
00550 }
00551
00552 FOR_EACH (FeatureSpecs, *featureSpecs, fit) {
00553 string const& attrName = (*fit)->name;
00554 Text::RegExp::Pattern const& p = (*fit)->pattern;
00555 int attrIndex = token->attrIndex(attrName.c_str());
00556 char featId = 'A' + attrIndex;
00557 FOR_EACH (vector<int>, (*fit)->tokens, tit) {
00558
00559 int dx = *tit;
00560 int idx = position + dx;
00561 Token* tok = (0 <= idx && idx < sentLen) ? (*sentence)[idx] : 0;
00562 if (tok) {
00563 string const* item = tok->get(attrName.c_str());
00564 if (item && !item->empty() && p.test(*item)) {
00565
00566 if (dx < 0)
00567 snprintf(feature, sizeof(feature), "%d%c%s", -dx, featId, item->c_str());
00568 else
00569 snprintf(feature, sizeof(feature), "%c%d%s", featId, dx, item->c_str());
00570 features.add(feature);
00571 }
00572 }
00573 }
00574 }
00575
00576 features.add("word=" + upper);
00577
00578
00579 unordered_map<string, bool>::iterator it = capitalized.find(upper);
00580 if (it != capitalized.end()) {
00581 if (it->second)
00582 features.add(otherCapitalized);
00583 else
00584 features.add(otherNotCapitalized);
00585 } else if (position > 0)
00586 capitalized[upper] = isupper(word[0]);
00587
00588
00589 if (nextWord && isupper(word[0])) {
00590 string acronym;
00591 for (int i = position; i < sentLen; i++) {
00592 char c = (*sentence)[i]->form[0];
00593 if (isupper(c))
00594 acronym.push_back(c);
00595 else
00596 break;
00597 }
00598 if (acronym.size() > 1)
00599 acronyms.insert(acronym.c_str());
00600 }
00601
00602 #ifdef HYP_BUG
00603
00604
00605 if (position < sentLen-1) {
00606 if (isupper(word[0])) {
00607
00608 int endPos;
00609 for (endPos = position + 1; endPos < sentLen; endPos++)
00610 if (!isupper(((*sentence)[endPos]->form)[0]))
00611 break;
00612
00613 for (int j = position + 1; j < endPos; j++) {
00614 for (int i = 0; i < nClasses; i++) {
00615 char const* other = (*sentence)[j]->form.c_str();
00616 if (resources.lastWords[i].contains(other)) {
00617 features.add(otherLastTags[i]);
00618 otherLast[i].insert(other);
00619 break;
00620 }
00621 }
00622 }
00623 }
00624 }
00625 #endif
00626
00627
00628 for (int i = 0; i < nClasses; i++) {
00629 if (otherFirst[i].contains(word))
00630 features.add(otherFirstTags[i]);
00631 }
00632
00633
00634 for (int i = 0; i < nClasses; i++) {
00635 if (otherLast[i].contains(word))
00636 features.add(otherLastTags[i]);
00637 }
00638
00639
00640 for (int i = 0; i < nClasses; i++) {
00641 if (prevClass[i].contains(upper))
00642 features.add(prevTags[i]);
00643 }
00644
00645
00646 if (upper.size() > 3) {
00647 for (int i = 0; i < nClasses; ++i) {
00648 if (resources.suffixes[i].match(upper))
00649 features.add(suffixTags[i]);
00650 }
00651 }
00652
00653
00654 if (upper.size() > 3) {
00655 upper.erase(3);
00656 for (int i = 0; i < nClasses; ++i) {
00657 if (resources.prefixes[i].contains(upper))
00658 features.add(prefixTags[i]);
00659 }
00660 }
00661
00662
00663 if (insideQuotes)
00664 features.add(withinQuotes);
00665
00666
00667
00668
00669
00670
00671
00672 if (allQuotes(word))
00673 insideQuotes = !insideQuotes;
00674 }
00675
/**
 *  Prepare @a sent for feature extraction: categorize each token and
 *  normalize its NE tags (IOB1 -> IOB2 when OldIOB is set, then
 *  IOB2 -> IOBEU when Refine is set). @a zone selects the document
 *  zone used for the capitalization features. Called once per sentence
 *  before extract().
 */
void NerFeatureExtractor::analyze(Sentence* sent, int zone)
{
  sentence = sent;
  this->zone = zone;
  int len = sentence->size();
  tokenTypes.resize(len);
  insideQuotes = false;
  int i = 0;
  for (Sentence::const_iterator sit = sentence->begin();
       sit != sentence->end(); ++sit, ++i) {
    Token* tok = *sit;
    tokenTypes[i] = tokenCategorizer.analyze(tok->form.c_str());
    if (oldIOB) {
      // IOB1 -> IOB2: an I- tag that starts an entity (previous tag is
      // O or missing) is rewritten as B-.
      string const* tag = tok->get(resources.NEtag);
      if (!tag)
        continue;
      if (*tag == "O" || (*tag)[0] == 'B' || tag->empty())
        continue;
      string const* prevTag = (i > 0) ? (*sent)[i-1]->get(resources.NEtag) : 0;
      string nt(*tag);
      if (!prevTag || (*prevTag)[0] == 'O') {
        nt[0] = 'B';
        tok->set(resources.NEtag, nt);
      }
    }
    if (refine) {
      // IOB2 -> IOBEU: B- on a single-token entity becomes U-; the
      // last token of a multi-token entity becomes E-.
      string const* tag = tok->get(resources.NEtag);
      if (!tag)
        continue;
      if (*tag == "O" || tag->empty())
        continue;
      string const* nextTag = (i < len-1) ? (*sent)[i+1]->get(resources.NEtag) : 0;
      string nt(*tag);
      if (nextTag) {
        if ((*nextTag)[0] == 'B' || (*nextTag)[0] == 'O') {
          // The entity ends at this token.
          if ((*tag)[0] == 'B') {
            nt[0] = 'U';        // single-token entity
            tok->set(resources.NEtag, nt);
          } else {
            nt[0] = 'E';        // entity-final token
            tok->set(resources.NEtag, nt);
          }
        }
      } else if ((*tag)[0] == 'B') {
        // Sentence-final token: same refinement with no next tag.
        nt[0] = 'U';
        tok->set(resources.NEtag, nt);
      } else {
        nt[0] = 'E';
        tok->set(resources.NEtag, nt);
      }
    }
  }
}
00731
00732 void NerFeatureExtractor::reset()
00733 {
00734 insideQuotes = false;
00735
00736
00737 for (int i = 0; i < resources.typesCount(); ++i) {
00738 prevClass[i].clear();
00739 otherFirst[i].clear();
00740 otherLast[i].clear();
00741 }
00742 acronyms.clear();
00743 capitalized.clear();
00744 }
00745
00746 void NerFeatureExtractor::classified(int position, char const* className)
00747 {
00748 Token* token = (*sentence)[position];
00749 char const* word = token->form.c_str();
00750 designated(word, className);
00751
00752 token->set(resources.NEtag, className);
00753 }
00754
00755 void NerFeatureExtractor::designated(char const* word, char const* neTag)
00756 {
00757 if (strlen(neTag) < 2)
00758 return;
00759 int i = 0;
00760 FOR_EACH (set<string>, *resources.entityTypes, it) {
00761 if (neTag+2 == *it) {
00762 string upper(word);
00763 to_upper(upper);
00764 number.replace(upper, numRep, true);
00765 prevClass[i].insert(upper.c_str());
00766 }
00767 i++;
00768 }
00769 }
00770
00771 }
00772 }