00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "NerFeatureExtractor.h"
00026
00027
00028 #include <sstream>
00029
00030
00031 #include "conf/conf_bool.h"
00032 #include "conf/conf_string.h"
00033
00034 using namespace Tanl::Classifier;
00035 using namespace Tanl::Text;
00036
00037 namespace Tanl {
00038 namespace NER {
00039
00040
// Configuration option "OldIOB": when true, analyze() converts incoming
// NE tags from the IOB1 scheme to IOB2 (an entity-initial I- tag whose
// predecessor is O or absent is rewritten as B-).
IXE::conf<bool> oldIOB("OldIOB", true);

// Configuration option "Refine": when true, analyze() refines IOB2 tags
// into IOBEU (U- for single-token entities, E- for entity-final tokens).
IXE::conf<bool> refine("Refine", true);
00044
00045
00046
// Per-class "UNI-<class>" feature names; filled by Resources::resize().
static vector<string> dictTags;
// Gazetteer feature labels. NOTE(review): none of these is referenced in
// the visible code of this file — presumably emitted elsewhere for
// money/names/time dictionary hits; confirm before removing.
static string const Money = "D1";
static string const Name = "D2";
static string const Time = "D4";
static string const PrevName = "D5";
static string const NextPerson = "D6";
00056
00057
00058
00059
00060
00061
00062
// Printable names of token/entity types, indexed in typeName() by
// EntityType::type or EntityType::Subtype.
// NOTE(review): "TIME" appears twice (indices 4 and 17) — confirm this
// table mirrors the EntityType enumeration exactly.
char const* TokenTypes[] = {
  "UNKNOWN",
  "WORD",
  "QUANTITY",
  "NAME",
  "TIME",
  "SYMBOL",
  "PUNCT",
  "ADDRESS",
  "PERSON",
  "ORGANIZATION",
  "LOCATION",
  "PRODUCT",
  "NUMBER",
  "MONEY",
  "MEASURE",
  "DURATION",
  "DATE",
  "TIME",
  "URL",
  "EMAIL"
};
00085
00086
00087
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
// Capitalized word, not sentence-initial; indexed by document zone.
// The suffixes suggest zones HL/AU/DL/TXT (presumably headline, author,
// dateline, running text — confirm against the caller of analyze()).
char const* Capitalized[4] = {"initCaps_HL", "initCaps_AU", "initCaps_DL",
                              "initCaps_TXT"};

// Sentence-initial capitalized word, per zone.
char const* firstWordCap[4] = {"firstWordInitCaps_HL",
                               "firstWordInitCaps_AU",
                               "firstWordInitCaps_DL",
                               "firstWordInitCaps_TXT"};

// Sentence-initial non-capitalized word, per zone.
char const* firstWordLow[4] = {"firstWordNotInitCaps_HL",
                               "firstWordNotInitCaps_AU",
                               "firstWordNotInitCaps_DL",
                               "firstWordNotInitCaps_TXT"};

// Shape of two-part hyphenated words (U = capitalized, L = lowercase).
string const HyphenCapCap = "U-U";
string const HyphenLowCap = "L-U";
string const HyphenCapLow = "U-L";

// Token inside (seqCap) or outside (notSeqCap) a run of three
// capitalized words.
string const seqCap = "I";
string const notSeqCap = "NI";
// Capitalization when the token has no next/previous word.
string const CapNoNext = "initCaps__Next";
string const LowNoNext = "nonInitCaps__Next";
string const CapNoPrev = "initCaps__Prev";
string const LowNoPrev = "nonInitCaps__Prev";

// Prefixes of capitalization features conjoined with a context word.
string const Cap = "initCaps_";
string const Low = "nonInitCaps_";

// Token occurs within quotes / is not in the frequent-word list.
string const withinQuotes = "quote";
string const rare = "rare";

// Per-class feature-name caches, all filled by Resources::resize().
// Preceding-bigram features.
vector<string> bigramTags;

// Class-specific prefix features.
vector<string> prefixTags;

// Class-specific suffix features.
vector<string> suffixTags;

// Word seen as the first word of an entity of the class.
vector<string> firstWordTags;

// Word seen as the last word of an entity of the class.
vector<string> lastWordTags;

// Lowercase-sequence features (not referenced in this file's visible
// code — presumably used elsewhere; confirm).
vector<string> lowerSeqTags;

// Lowercase word internal to an entity of the class.
vector<string> lowerIntermTags;

// Word previously classified as belonging to the class.
vector<string> prevTags;

// Word previously seen starting an entity of the class.
vector<string> otherFirstTags;

// Word previously seen ending an entity of the class.
vector<string> otherLastTags;

// Capitalization of this word at its first occurrence in the document.
string const otherCapitalized = "otherInitCap";

string const otherNotCapitalized = "otherNotInitCap";

// Acronym features. NOTE(review): AcronymBegin/Continue/End are not
// referenced in this file's visible code — confirm they are used
// elsewhere before removing.
string const Acronym = "acronym";
string const AcronymBegin = "cI";
string const AcronymContinue = "cJ";
string const AcronymEnd = "cK";
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
/**
 *  @return true iff every character of @a s is an uppercase letter
 *  (vacuously true for the empty string).
 *
 *  The argument to isupper() is cast to unsigned char: passing a plain
 *  (possibly negative) char for bytes >= 0x80 is undefined behavior.
 */
bool allUpper(char const* s)
{
  for (char const* scan = s; *scan; ++scan)
    if (!isupper((unsigned char)*scan))
      return false;
  return true;
}
00223
/**
 *  @return true iff @a s consists solely of double-quote characters
 *  (vacuously true for the empty string).
 */
bool allQuotes(char const* s)
{
  for (char const* p = s; *p; ++p)
    if (*p != '"')
      return false;
  return true;
}
00228
00229
00230
// Configured set of entity class names (configuration variable "Classes").
IXE::conf_set<std::string> Resources::entityTypes("Classes");
00232
00233 void Resources::resize()
00234 {
00235 size_t nClasses = entityTypes->size();
00236 dict.resize(nClasses);
00237 designators.resize(nClasses);
00238 preBigrams.resize(nClasses);
00239 prefixes.resize(nClasses);
00240 suffixes.resize(nClasses);
00241 firstWords.resize(nClasses);
00242 lastWords.resize(nClasses);
00243 lowerInterm.resize(nClasses);
00244 dictTags.resize(nClasses);
00245 bigramTags.resize(nClasses);
00246 prefixTags.resize(nClasses);
00247 suffixTags.resize(nClasses);
00248 firstWordTags.resize(nClasses);
00249 lastWordTags.resize(nClasses);
00250 lowerSeqTags.resize(nClasses);
00251 lowerIntermTags.resize(nClasses);
00252 prevTags.resize(nClasses);
00253 otherFirstTags.resize(nClasses);
00254 otherLastTags.resize(nClasses);
00255 int i = 0;
00256 FOR_EACH (set<string>, *entityTypes, it) {
00257 dictTags[i] = "UNI-" + *it;
00258 bigramTags[i] = "BI-" + *it;
00259 prefixTags[i] = "Pref-" + *it;
00260 suffixTags[i] = "Suf-" + *it;
00261 firstWordTags[i] = "FW-" + *it;
00262 lastWordTags[i] = "LW-" + *it;
00263 lowerSeqTags[i] = "lS" + *it;
00264 lowerIntermTags[i] = "lI" + *it;
00265 prevTags[i] = "PrevOccur-" + *it;
00266 otherFirstTags[i] = "OtherFirst-" + *it;
00267 otherLastTags[i] = "OtherLast-" + *it;
00268 i++;
00269 }
00270 }
00271
/**
 *  Build the resources for the given @a locale, loading every word list
 *  from @a resourceDir. @a POStag and @a NEtag name the token attributes
 *  that hold the part-of-speech and named-entity tags.
 */
Resources::Resources(string& resourceDir, string& locale,
                     char const* POStag, char const* NEtag) :
  language(locale.c_str()),
  POStag(POStag),
  NEtag(NEtag)
{
  load(resourceDir);
}
00283
/**
 *  Load all resource word lists from @a resourceDir.
 */
void Resources::load(string& resourceDir)
{
  // Allocate per-class containers and feature-name caches.
  resize();
  // Map each configured class name to a dense class id, in set order
  // (the same order resize() uses to build the feature labels).
  int i = 0;
  FOR_EACH (std::set<std::string>, *entityTypes, it)
    classId[it->c_str()] = i++;

  // Per-class dictionary (file lines: "<class> <entry>").
  load(dict, (resourceDir + "/Dict.lst").c_str());
  // Flat gazetteers.
  moneyDict.load((resourceDir + "/money.lst").c_str());
  namesDict.load((resourceDir + "/names.lst").c_str());
  timeDict.load((resourceDir + "/time.lst").c_str());

  // Frequent-word list: in extract(), words NOT in it get the "rare"
  // feature.
  FWL.load((resourceDir + "/FrequentWords.lst").c_str());

  // Per-class lists consumed by the feature extractor.
  load(designators, (resourceDir + "/Designators.lst").c_str());

  load(preBigrams, (resourceDir + "/Bigrams.lst").c_str());

  load(prefixes, (resourceDir + "/Prefix.lst").c_str());

  load(suffixes, (resourceDir + "/Suffix.lst").c_str());

  load(firstWords, (resourceDir + "/First.lst").c_str());

  load(lastWords, (resourceDir + "/Last.lst").c_str());

  load(lowerInterm, (resourceDir + "/LowerIn.lst").c_str());
}
00322
00323 template<class WordSet>
00324 void Resources::load(vector<WordSet>& sets, char const* file)
00325 {
00326 ifstream ifs(file);
00327 char line[MAX_LINE_LEN];
00328
00329 while (ifs.getline(line, MAX_LINE_LEN)) {
00330
00331 char* next = line;
00332 char* className = strtok_r(0, " \t", &next);
00333 if (!className)
00334 continue;
00335 char* word = next + strspn(next, " \t");
00336 if (strlen(word) == 0)
00337 continue;
00338
00339 int cid = classId[className];
00340 sets[cid].insert(word);
00341 }
00342 }
00343
00344 char const* Resources::typeName(EntityType et)
00345 {
00346 if (et.subtype != (EntityType::Subtype)EntityType::NoType)
00347 return TokenTypes[et.subtype];
00348 else if (et.type != EntityType::NoType)
00349 return TokenTypes[et.type];
00350 else
00351 return 0;
00352 }
00353
// Matches digit runs; used to replace numbers with +NUM+ so features
// generalize over specific numeric values.
Text::RegExp::Pattern number("\\d+");
string numRep("+NUM+");
00356
00357
00358
/**
 *  Bind the extractor to its @a resources and size the per-class
 *  document-level caches of previously seen/classified words.
 */
NerFeatureExtractor::NerFeatureExtractor(Resources& resources) :
  resources(resources),
  tokenCategorizer(resources.language),
  insideQuotes(false)
{
  prevClass.resize(resources.typesCount());
  otherFirst.resize(resources.typesCount());
  otherLast.resize(resources.typesCount());
}
00368
00369 void NerFeatureExtractor::extract(Classifier::Features& features, const int& position)
00370 {
00371 Token* token = (*sentence)[position];
00372 char const* word = token->form.c_str();
00373 char const* prevWord = (position > 0) ? (*sentence)[position-1]->form.c_str() : 0;
00374 char const* nextWord = (position+1 < sentence->size()) ? (*sentence)[position+1]->form.c_str() : 0;
00375
00376
00377
00378
00379 size_t len = strlen(word);
00380 size_t sentLen = sentence->size();
00381
00382
00383 if (!prevWord && isupper(word[0]))
00384 features.add(firstWordCap[zone]);
00385
00386
00387 if (prevWord && isupper(word[0]))
00388 features.add(Capitalized[zone]);
00389
00390
00391 if (!prevWord && !isupper(word[0]))
00392 features.add(firstWordLow[zone]);
00393
00394
00395 if (allUpper(word)) {
00396 if (acronyms.contains(word))
00397 features.add(Acronym);
00398 }
00399
00400
00401
00402 string upNext(nextWord ? nextWord : "");
00403 to_upper(upNext);
00404 number.replace(upNext, numRep, true);
00405 string upPrev(prevWord ? prevWord : "");
00406 to_upper(upPrev);
00407 number.replace(upPrev, numRep, true);
00408
00409 string nextPos(nextWord ? (*sentence)[position+1]->get(resources.POStag)->c_str() : "");
00410 string prevPos(position > 0 ? (*sentence)[position-1]->get(resources.POStag)->c_str() : "");
00411
00412
00413 if (nextWord) {
00414 if (isupper(word[0]))
00415 features.add(Cap + upNext + "Next");
00416
00417 else
00418 features.add(Low + upNext + "Next");
00419
00420 } else if (isupper(word[0]))
00421 features.add(CapNoNext);
00422 else
00423 features.add(LowNoNext);
00424
00425
00426 if (prevWord) {
00427 if (isupper(word[0]))
00428 features.add(Cap + upPrev + "Prev");
00429
00430 else
00431 features.add(Low + upPrev + "Prev");
00432
00433 } else if (isupper(word[0]))
00434 features.add(CapNoPrev);
00435 else
00436 features.add(LowNoPrev);
00437
00438
00439 if (prevWord && nextWord &&
00440 isupper(prevWord[0]) && isupper(word[0]) && isupper(nextWord[0]))
00441 features.add(seqCap);
00442 else
00443 features.add(notSeqCap);
00444
00445
00446 if (strchr(word, '-')) {
00447 char copy[800];
00448 strncpy(copy, word, sizeof(copy));
00449 char* next = copy;
00450 char* word1 = strtok_r(0, "-", &next);
00451 char* word2 = strtok_r(0, "-", &next);
00452 if (word1 && word2) {
00453 bool up1 = isupper(word1[0]);
00454 bool up2 = isupper(word2[0]);
00455 if (next[0] == '\0') {
00456 if (up1 && up2)
00457 features.add(HyphenCapCap);
00458 else {
00459 to_upper(word1);
00460 string upper1(word1);
00461 number.replace(upper1, numRep, true);
00462 to_upper(word2);
00463 string upper2(word2);
00464 number.replace(upper2, numRep, true);
00465 if (up1) {
00466 features.add(HyphenCapLow);
00467 features.add(string("U-") + upper1);
00468 features.add(string("L_") + upper2);
00469 } else if (up2) {
00470 features.add(HyphenLowCap);
00471 features.add(string("L-") + upper1);
00472 features.add(string("U_") + upper2);
00473 }
00474 }
00475 }
00476 }
00477 }
00478
00479 string upper(word);
00480 to_upper(upper);
00481 number.replace(upper, numRep, true);
00482
00483
00484 if (!resources.FWL.contains(upper))
00485 features.add(rare);
00486
00487 size_t nClasses = resources.typesCount();
00488
00489 if (position > 1) {
00490 string bigram = (*sentence)[position-2]->form + '_' + prevWord;
00491 to_upper(bigram);
00492 for (int i = 0; i < nClasses; ++i) {
00493 if (resources.preBigrams[i].contains(bigram))
00494 features.add(bigramTags[i]);
00495 }
00496 }
00497
00498 for (int i = 0; i < nClasses; ++i) {
00499 if (resources.firstWords[i].contains(word))
00500 features.add(firstWordTags[i]);
00501 }
00502
00503 #ifdef EXTRA
00504 for (int i = 0; i < nClasses; ++i) {
00505 if (resources.lastWords[i].contains(word))
00506 features.add(lastWordTags[i]);
00507 }
00508
00509 for (int i = 0; i < nClasses; ++i) {
00510 if (resources.lowerInterm[i].contains(word))
00511 features.add(lowerIntermTags[i]);
00512 }
00513 #endif
00514
00515
00516
00517
00518 if (position > 0) {
00519 for (int i = 0; i < nClasses; ++i) {
00520 if (resources.dict[i].contains(word))
00521 features.add(dictTags[i]);
00522 }
00523 }
00524
00525
00526 char feature[256];
00527 int featIndex = 0;
00528
00529 FOR_EACH (FeatureSpecs, *morphFeatureSpecs, fit) {
00530 string const& attrName = (*fit)->name;
00531 Text::RegExp::Pattern const& p = (*fit)->pattern;
00532 char featId = 'A' + featIndex++;
00533 FOR_EACH (vector<int>, (*fit)->tokens, tit) {
00534
00535 int dx = *tit;
00536 int idx = position + dx;
00537 Token* tok = (0 <= idx && idx < sentLen) ? (*sentence)[idx] : 0;
00538 if (tok) {
00539 string const* item = tok->get(attrName.c_str());
00540 if (item && !item->empty() && p.test(*item)) {
00541
00542 if (dx < 0)
00543 snprintf(feature, sizeof(feature), "%dM%c", -dx, featId);
00544 else
00545 snprintf(feature, sizeof(feature), "M%c%d", featId, dx);
00546 features.add(feature);
00547 }
00548 }
00549 }
00550 }
00551
00552 FOR_EACH (FeatureSpecs, *featureSpecs, fit) {
00553 string const& attrName = (*fit)->name;
00554 Text::RegExp::Pattern const& p = (*fit)->pattern;
00555 int attrIndex = token->attrIndex(attrName.c_str());
00556 char featId = 'A' + attrIndex;
00557 FOR_EACH (vector<int>, (*fit)->tokens, tit) {
00558
00559 int dx = *tit;
00560 int idx = position + dx;
00561 Token* tok = (0 <= idx && idx < sentLen) ? (*sentence)[idx] : 0;
00562 if (tok) {
00563 string const* item = tok->get(attrName.c_str());
00564 if (item && !item->empty() && p.test(*item)) {
00565
00566 if (dx < 0)
00567 snprintf(feature, sizeof(feature), "%d%c%s", -dx, featId, item->c_str());
00568 else
00569 snprintf(feature, sizeof(feature), "%c%d%s", featId, dx, item->c_str());
00570 features.add(feature);
00571 }
00572 }
00573 }
00574 }
00575
00576 features.add("word=" + upper);
00577
00578
00579 unordered_map<string, bool>::iterator it = capitalized.find(upper);
00580 if (it != capitalized.end()) {
00581 if (it->second)
00582 features.add(otherCapitalized);
00583 else
00584 features.add(otherNotCapitalized);
00585 } else if (position > 0)
00586 capitalized[upper] = isupper(word[0]);
00587
00588
00589 if (nextWord && isupper(word[0])) {
00590 string acronym;
00591 for (int i = position; i < sentLen; i++) {
00592 char c = (*sentence)[i]->form[0];
00593 if (isupper(c))
00594 acronym.push_back(c);
00595 else
00596 break;
00597 }
00598 if (acronym.size() > 1)
00599 acronyms.insert(acronym.c_str());
00600 }
00601
00602 #ifdef HYP_BUG
00603
00604
00605 if (position < sentLen-1) {
00606 if (isupper(word[0])) {
00607
00608 int endPos;
00609 for (endPos = position + 1; endPos < sentLen; endPos++)
00610 if (!isupper(((*sentence)[endPos]->form)[0]))
00611 break;
00612
00613 for (int j = position + 1; j < endPos; j++) {
00614 for (int i = 0; i < nClasses; i++) {
00615 char const* other = (*sentence)[j]->form.c_str();
00616 if (resources.lastWords[i].contains(other)) {
00617 features.add(otherLastTags[i]);
00618 otherLast[i].insert(other);
00619 break;
00620 }
00621 }
00622 }
00623 }
00624 }
00625 #endif
00626
00627
00628 for (int i = 0; i < nClasses; i++) {
00629 if (otherFirst[i].contains(word))
00630 features.add(otherFirstTags[i]);
00631 }
00632
00633
00634 for (int i = 0; i < nClasses; i++) {
00635 if (otherLast[i].contains(word))
00636 features.add(otherLastTags[i]);
00637 }
00638
00639
00640 for (int i = 0; i < nClasses; i++) {
00641 if (prevClass[i].contains(upper))
00642 features.add(prevTags[i]);
00643 }
00644
00645
00646 if (upper.size() > 3) {
00647 for (int i = 0; i < nClasses; ++i) {
00648 if (resources.suffixes[i].match(upper))
00649 features.add(suffixTags[i]);
00650 }
00651 }
00652
00653
00654 if (upper.size() > 3) {
00655 upper.erase(3);
00656 for (int i = 0; i < nClasses; ++i) {
00657 if (resources.prefixes[i].contains(upper))
00658 features.add(prefixTags[i]);
00659 }
00660 }
00661
00662
00663 if (insideQuotes)
00664 features.add(withinQuotes);
00665
00666
00667
00668
00669
00670
00671
00672 if (allQuotes(word))
00673 insideQuotes = !insideQuotes;
00674 }
00675
/**
 *  Prepare @a sent for feature extraction: categorize each token and
 *  normalize its NE tags (IOB1 -> IOB2 when OldIOB is set, then
 *  IOB2 -> IOBEU when Refine is set). @a zone selects the document
 *  zone used for the capitalization features. Called once per sentence
 *  before extract().
 */
void NerFeatureExtractor::analyze(Sentence* sent, int zone)
{
  sentence = sent;
  this->zone = zone;
  int len = sentence->size();
  tokenTypes.resize(len);
  insideQuotes = false;
  int i = 0;
  for (Sentence::const_iterator sit = sentence->begin();
       sit != sentence->end(); ++sit, ++i) {
    Token* tok = *sit;
    tokenTypes[i] = tokenCategorizer.analyze(tok->form.c_str());
    if (oldIOB) {
      // IOB1 -> IOB2: an I- tag that starts an entity (previous tag is
      // O or missing) is rewritten as B-.
      string const* tag = tok->get(resources.NEtag);
      if (!tag)
        continue;
      if (*tag == "O" || (*tag)[0] == 'B' || tag->empty())
        continue;
      string const* prevTag = (i > 0) ? (*sent)[i-1]->get(resources.NEtag) : 0;
      string nt(*tag);
      if (!prevTag || (*prevTag)[0] == 'O') {
        nt[0] = 'B';
        tok->set(resources.NEtag, nt);
      }
    }
    if (refine) {
      // IOB2 -> IOBEU: B- on a single-token entity becomes U-; the
      // last token of a multi-token entity becomes E-.
      string const* tag = tok->get(resources.NEtag);
      if (!tag)
        continue;
      if (*tag == "O" || tag->empty())
        continue;
      string const* nextTag = (i < len-1) ? (*sent)[i+1]->get(resources.NEtag) : 0;
      string nt(*tag);
      if (nextTag) {
        if ((*nextTag)[0] == 'B' || (*nextTag)[0] == 'O') {
          // The entity ends at this token.
          if ((*tag)[0] == 'B') {
            nt[0] = 'U';        // single-token entity
            tok->set(resources.NEtag, nt);
          } else {
            nt[0] = 'E';        // entity-final token
            tok->set(resources.NEtag, nt);
          }
        }
      } else if ((*tag)[0] == 'B') {
        // Sentence-final token: same refinement with no next tag.
        nt[0] = 'U';
        tok->set(resources.NEtag, nt);
      } else {
        nt[0] = 'E';
        tok->set(resources.NEtag, nt);
      }
    }
  }
}
00731
00732 void NerFeatureExtractor::reset()
00733 {
00734 insideQuotes = false;
00735
00736
00737 for (int i = 0; i < resources.typesCount(); ++i) {
00738 prevClass[i].clear();
00739 otherFirst[i].clear();
00740 otherLast[i].clear();
00741 }
00742 acronyms.clear();
00743 capitalized.clear();
00744 }
00745
00746 void NerFeatureExtractor::classified(int position, char const* className)
00747 {
00748 Token* token = (*sentence)[position];
00749 char const* word = token->form.c_str();
00750 designated(word, className);
00751
00752 token->set(resources.NEtag, className);
00753 }
00754
00755 void NerFeatureExtractor::designated(char const* word, char const* neTag)
00756 {
00757 if (strlen(neTag) < 2)
00758 return;
00759 int i = 0;
00760 FOR_EACH (set<string>, *resources.entityTypes, it) {
00761 if (neTag+2 == *it) {
00762 string upper(word);
00763 to_upper(upper);
00764 number.replace(upper, numRep, true);
00765 prevClass[i].insert(upper.c_str());
00766 }
00767 i++;
00768 }
00769 }
00770
00771 }
00772 }