#include "SstFeatureExtractor.h"

// Standard headers used directly below; some may already be provided by
// SstFeatureExtractor.h.
#include <cctype>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <sstream>

#include "conf/conf_bool.h"
#include "conf/conf_string.h"

using namespace Tanl::Classifier;
using namespace Tanl::Text;

namespace Tanl {
namespace SST {

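// Configurable feature templates: each value is a space-separated list of
// token offsets (relative to the current position) at which the given
// attribute is extracted.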
IXE::conf<string> FORM("FORM", "0");
IXE::conf<string> POSTAG("POSTAG", "-2 -1 0 1 2");
IXE::conf<string> CPOSTAG("CPOSTAG", "-1 0");
IXE::conf<string> SSTAG("SSTAG", "");

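// Parse the space-separated list of offsets into the tokens vector.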
FeatureSpec::FeatureSpec(char const* name, char const* list) :
  name(name)
{
  stringstream ss(list);
  int n;
  while (ss >> n)
    tokens.push_back(n);
}

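// Build the default set of feature templates from the configuration values.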
FeatureSpecs::FeatureSpecs()
{
  push_back(FeatureSpec("FORM", FORM));
  push_back(FeatureSpec("POSTAG", POSTAG));
  push_back(FeatureSpec("CPOSTAG", CPOSTAG));
  push_back(FeatureSpec("SSTAG", SSTAG));
}

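// Names of word-shape (orthographic) features.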
static string const AllAlpha = "W1";
static string const AllDigits = "W2";
static string const CapPeriod = "initCapPeriod";
static string const SingleCap = "oneCap";
static string const SingleS = "W4";
static string const ContainsApo = "W5";
static string const ContainsDot = "W6";
static string const ContainsComma = "W7";
static string const DigitSlash = "digitSlash";
static string const ContainsPercent = "percent";
static string const DigitsDot = "digitPeriod";
static string const ContainsDollar = "dollar";
static string const ContainsDigit = "containDigit";
static string const TwoDigits = "twoDigit";
static string const FourDigits = "fourDigit";
static string const AllQuoting = "Wf";

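// Capitalization features, indexed by the zone (HL, AU, DL, TXT) passed
// to analyze().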
char const* AllUpper[4] = {"allCaps_HL", "allCaps_AU", "allCaps_DL",
                           "allCaps_TXT"};
char const* Capitalized[4] = {"initCaps_HL", "initCaps_AU", "initCaps_DL",
                              "initCaps_TXT"};

char const* firstWordCap[4] = {"firstWordInitCaps_HL",
                               "firstWordInitCaps_AU",
                               "firstWordInitCaps_DL",
                               "firstWordInitCaps_TXT"};

char const* firstWordLow[4] = {"firstWordNotInitCaps_HL",
                               "firstWordNotInitCaps_AU",
                               "firstWordNotInitCaps_DL",
                               "firstWordNotInitCaps_TXT"};

char const* MixedCase[4] = {"mixedCaps_HL", "mixedCaps_AU",
                            "mixedCaps_DL", "mixedCaps_TXT"};

string const HyphenCapCap = "U-U";
string const HyphenLowCap = "L-U";
string const HyphenCapLow = "U-L";

char const* prevCap[4] = {"initCapsPrev_HL", "initCapsPrev_AU",
                          "initCapsPrev_DL", "initCapsPrev_TXT"};

char const* nextCap[4] = {"initCapsNext_HL", "initCapsNext_AU",
                          "initCapsNext_DL", "initCapsNext_TXT"};

string const seqCap = "I";
string const notSeqCap = "NI";
string const CapNext = "initCaps_Next";
string const LowNext = "nonInitCaps_Next";
string const CapNoNext = "initCaps__Next";
string const LowNoNext = "nonInitCaps__Next";
string const CapPrev = "initCaps_Prev";
string const LowPrev = "nonInitCaps_Prev";
string const CapNoPrev = "initCaps__Prev";
string const LowNoPrev = "nonInitCaps__Prev";

string const Cap = "initCaps_";
string const Low = "nonInitCaps_";

string const withinQuotes = "quote";
string const rare = "rare";

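// Features fired when the token was assigned the given supersense class
// earlier in the document (see designated() and the prevClass word sets).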
char const* prevTags[] = {
  "NCPrevOccur-adj.all", "NCPrevOccur-adj.pert", "NCPrevOccur-adj.ppl",
  "NCPrevOccur-adv.all", "NCPrevOccur-noun.Tops", "NCPrevOccur-noun.act",
  "NCPrevOccur-noun.animal", "NCPrevOccur-noun.artifact",
  "NCPrevOccur-noun.attribute", "NCPrevOccur-noun.body",
  "NCPrevOccur-noun.cognition", "NCPrevOccur-noun.communication",
  "NCPrevOccur-noun.event", "NCPrevOccur-noun.feeling",
  "NCPrevOccur-noun.food", "NCPrevOccur-noun.group",
  "NCPrevOccur-noun.location", "NCPrevOccur-noun.motive",
  "NCPrevOccur-noun.object", "NCPrevOccur-noun.other",
  "NCPrevOccur-noun.person", "NCPrevOccur-noun.phenomenon",
  "NCPrevOccur-noun.plant", "NCPrevOccur-noun.possession",
  "NCPrevOccur-noun.process", "NCPrevOccur-noun.quantity",
  "NCPrevOccur-noun.relation", "NCPrevOccur-noun.shape",
  "NCPrevOccur-noun.state", "NCPrevOccur-noun.substance",
  "NCPrevOccur-noun.time", "NCPrevOccur-verb.body",
  "NCPrevOccur-verb.change", "NCPrevOccur-verb.cognition",
  "NCPrevOccur-verb.communication", "NCPrevOccur-verb.competition",
  "NCPrevOccur-verb.consumption", "NCPrevOccur-verb.contact",
  "NCPrevOccur-verb.creation", "NCPrevOccur-verb.emotion",
  "NCPrevOccur-verb.motion", "NCPrevOccur-verb.perception",
  "NCPrevOccur-verb.possession", "NCPrevOccur-verb.social",
  "NCPrevOccur-verb.stative", "NCPrevOccur-verb.weather"
};

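// Parallel feature names used with the otherLast word sets.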
char const* otherLastTags[] = {
  "OtherNCS-adj.all", "OtherNCS-adj.pert", "OtherNCS-adj.ppl",
  "OtherNCS-adv.all", "OtherNCS-noun.Tops", "OtherNCS-noun.act",
  "OtherNCS-noun.animal", "OtherNCS-noun.artifact", "OtherNCS-noun.attribute",
  "OtherNCS-noun.body", "OtherNCS-noun.cognition",
  "OtherNCS-noun.communication", "OtherNCS-noun.event",
  "OtherNCS-noun.feeling", "OtherNCS-noun.food", "OtherNCS-noun.group",
  "OtherNCS-noun.location", "OtherNCS-noun.motive", "OtherNCS-noun.object",
  "OtherNCS-noun.other", "OtherNCS-noun.person", "OtherNCS-noun.phenomenon",
  "OtherNCS-noun.plant", "OtherNCS-noun.possession", "OtherNCS-noun.process",
  "OtherNCS-noun.quantity", "OtherNCS-noun.relation", "OtherNCS-noun.shape",
  "OtherNCS-noun.state", "OtherNCS-noun.substance", "OtherNCS-noun.time",
  "OtherNCS-verb.body", "OtherNCS-verb.change", "OtherNCS-verb.cognition",
  "OtherNCS-verb.communication", "OtherNCS-verb.competition",
  "OtherNCS-verb.consumption", "OtherNCS-verb.contact",
  "OtherNCS-verb.creation", "OtherNCS-verb.emotion", "OtherNCS-verb.motion",
  "OtherNCS-verb.perception", "OtherNCS-verb.possession",
  "OtherNCS-verb.social", "OtherNCS-verb.stative", "OtherNCS-verb.weather"
};

string const otherCapitalized = "otherInitCap";
string const otherNotCapitalized = "otherNotInitCap";

string const Acronym = "acronym";
string const AcronymBegin = "cI";
string const AcronymContinue = "cJ";
string const AcronymEnd = "cK";

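// Orthographic predicates on the raw token string.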
bool allUpper(char const* s)
{
  for (char const* scan = s; *scan; scan++)
    if (!isupper(*scan))
      return false;
  return true;
}

Text::RegExp::Pattern patMix("^[a-z]+[A-Z]([a-zA-Z])*$");

bool mixedCase(char const* s)
{
  return patMix.test(s);
}

bool noLetter(char const* s)
{
  for (char const* scan = s; *scan; scan++)
    if (isalpha(*scan))
      return false;
  return true;
}

bool containsDigit(char const* s)
{
  return ::strpbrk(s, "0123456789") != 0;
}

bool allDigits(char const* s)
{
  return ::strspn(s, "0123456789") == ::strlen(s);
}

bool digitSlash(char const* s)
{
  return containsDigit(s) && strchr(s, '/');
}

bool allQuotes(char const* s)
{
  return ::strspn(s, "\"") == ::strlen(s);
}

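// Supersense class labels (WordNet lexicographer file names).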
char const* Resources::classNames[] = {
  "adj.all", "adj.pert", "adj.ppl", "adv.all", "noun.Tops", "noun.act",
  "noun.animal", "noun.artifact", "noun.attribute", "noun.body",
  "noun.cognition", "noun.communication", "noun.event", "noun.feeling",
  "noun.food", "noun.group", "noun.location", "noun.motive", "noun.object",
  "noun.other", "noun.person", "noun.phenomenon", "noun.plant",
  "noun.possession", "noun.process", "noun.quantity", "noun.relation",
  "noun.shape", "noun.state", "noun.substance", "noun.time", "verb.body",
  "verb.change", "verb.cognition", "verb.communication", "verb.competition",
  "verb.consumption", "verb.contact", "verb.creation", "verb.emotion",
  "verb.motion", "verb.perception", "verb.possession", "verb.social",
  "verb.stative", "verb.weather"
};

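// Load the resources for the given locale from resourceDir.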
Resources::Resources(string& resourceDir, string& locale) :
  language(locale.c_str())
{
  load(resourceDir);
}

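// Assign a numeric id to each supersense class name.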
void Resources::load(string& resourceDir)
{
  for (int i = 0; i < sizeof(classNames)/sizeof(char*); i++)
    classId[classNames[i]] = i;
}

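// Read a word list: each line holds a class name followed by a word or
// phrase, which is added to the set for that class.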
template<class WordSet>
void Resources::load(WordSet* sets, char const* file)
{
  ifstream ifs(file);
  char line[MAX_LINE_LEN];

  while (ifs.getline(line, MAX_LINE_LEN)) {
    char* next = 0;
    char* className = strtok_r(line, " \t", &next);
    if (!className)
      continue;
    char* word = next + strspn(next, " \t");
    if (strlen(word) == 0)
      continue;

    int cid = classId[className];
    sets[cid].insert(word);
  }
}

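// Digit sequences are normalized to +NUM+ when word forms are used inside
// context features.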
Text::RegExp::Pattern number("\\d+");
string numRep("+NUM+");

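// The extractor keeps a reference to the shared resources and a token
// categorizer for the resource language.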
SstFeatureExtractor::SstFeatureExtractor(Resources& resources) :
  resources(resources),
  tokenCategorizer(resources.language),
  insideQuotes(false)
{ }

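// Extract the classifier features for the token at the given position of the
// current sentence (set by analyze()).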
void SstFeatureExtractor::extract(Classifier::Features& features, const int& position)
{
  Token* token = (*sentence)[position];
  char const* word = token->form.c_str();
  char const* prevWord = (position > 0) ? (*sentence)[position-1]->form.c_str() : 0;
  char const* nextWord = (position+1 < sentence->size()) ? (*sentence)[position+1]->form.c_str() : 0;

  size_t len = strlen(word);
  size_t sentLen = sentence->size();

  // First word of the sentence and capitalized.
  if (!prevWord && isupper(word[0]))
    features.add(firstWordCap[zone]);

  // Capitalized, but not the first word.
  if (prevWord && isupper(word[0]))
    features.add(Capitalized[zone]);

  // First word of the sentence, not capitalized.
  if (!prevWord && !isupper(word[0]))
    features.add(firstWordLow[zone]);

  if (allUpper(word)) {
    features.add(AllUpper[zone]);
    if (acronyms.contains(word))
      features.add(Acronym);
  } else if (mixedCase(word))
    features.add(MixedCase[zone]);

  // Capitalized and ending with a period.
  if (isupper(word[0]) && word[len-1] == '.')
    features.add(CapPeriod);

  // Single capital letter.
  if (len == 1 && isupper(word[0]))
    features.add(SingleCap);

  if (containsDigit(word))
    features.add(ContainsDigit);

  if (allDigits(word)) {
    if (len == 2)
      features.add(TwoDigits);
    else if (len == 4)
      features.add(FourDigits);
  }

  if (digitSlash(word))
    features.add(DigitSlash);

  if (strchr(word, '$'))
    features.add(ContainsDollar);

  if (strchr(word, '%'))
    features.add(ContainsPercent);

  if (strpbrk(word, "1234567890") && strchr(word, '.'))
    features.add(DigitsDot);

  // Normalized (uppercased, digits replaced) forms of the adjacent words.
  string upNext(nextWord ? nextWord : "");
  to_upper(upNext);
  upNext = number.replace(upNext, numRep, true);
  string upPrev(prevWord ? prevWord : "");
  to_upper(upPrev);
  upPrev = number.replace(upPrev, numRep, true);

  // Capitalization of the current word combined with the following word.
  if (nextWord) {
    if (isupper(word[0]))
      features.add(Cap + upNext + "Next");
    else
      features.add(Low + upNext + "Next");
  } else if (isupper(word[0]))
    features.add(CapNoNext);
  else
    features.add(LowNoNext);

  // Capitalization of the current word combined with the preceding word.
  if (prevWord) {
    if (isupper(word[0]))
      features.add(Cap + upPrev + "Prev");
    else
      features.add(Low + upPrev + "Prev");
  } else if (isupper(word[0]))
    features.add(CapNoPrev);
  else
    features.add(LowNoPrev);

  if (nextWord && isupper(nextWord[0]))
    features.add(nextCap[zone]);

  if (prevWord && isupper(prevWord[0]))
    features.add(prevCap[zone]);

  // Sequence of three capitalized words centered on the current one.
  if (prevWord && nextWord &&
      isupper(prevWord[0]) && isupper(word[0]) && isupper(nextWord[0]))
    features.add(seqCap);
  else
    features.add(notSeqCap);

  // Hyphenated words: combine the capitalization of the two parts.
  if (strchr(word, '-')) {
    char copy[800];
    strncpy(copy, word, sizeof(copy));
    copy[sizeof(copy)-1] = '\0';        // ensure termination for long tokens
    char* next = 0;
    char* word1 = strtok_r(copy, "-", &next);
    char* word2 = strtok_r(0, "-", &next);
    if (word1 && word2) {
      bool up1 = isupper(word1[0]);
      bool up2 = isupper(word2[0]);
      if (next[0] == '\0') {            // no third hyphen-separated part
        if (up1 && up2)
          features.add(HyphenCapCap);
        else {
          to_upper(word1);
          to_upper(word2);
          if (up1) {
            features.add(HyphenCapLow);
            features.add(string("U-") + word1);
            features.add(string("L_") + word2);
          } else if (up2) {
            features.add(HyphenLowCap);
            features.add(string("L-") + word1);
            features.add(string("U_") + word2);
          }
        }
      }
    }
  }

  // Normalized form of the current word.
  string upper(word);
  to_upper(upper);
  upper = number.replace(upper, numRep, true);

  // Attribute features from the configured templates: for each attribute and
  // each offset, emit a feature combining the attribute id, the offset and
  // the attribute value of the token at that offset.
  char feature[1024];
  FOR_EACH (FeatureSpecs, featureSpecs, fit) {
    char const* attrName = fit->name;
    int attrIndex = token->attrIndex(attrName);
    char featId = 'A' + attrIndex;
    FOR_EACH (vector<int>, fit->tokens, tit) {
      int dx = *tit;
      int idx = position + dx;
      Token* tok = (0 <= idx && idx < (int)sentLen) ? (*sentence)[idx] : 0;
      if (tok) {
        string const* item = tok->get(attrName);
        if (item && !item->empty()) {
          if (dx < 0)
            snprintf(feature, sizeof(feature), "%d%c%s", -dx, featId, item->c_str());
          else
            snprintf(feature, sizeof(feature), "%c%d%s", featId, dx, item->c_str());
          features.add(feature);
        }
      }
    }
  }

  // Whether the same (normalized) word occurred capitalized or not elsewhere
  // in the document.
  unordered_map<string, bool>::iterator it = capitalized.find(upper);
  if (it != capitalized.end()) {
    if (it->second)
      features.add(otherCapitalized);
    else
      features.add(otherNotCapitalized);
  } else if (position > 0)
    capitalized[upper] = isupper(word[0]);

  // Collect sequences of capitalized initials as candidate acronyms.
  if (nextWord && isupper(word[0])) {
    string acronym;
    for (int i = position; i < (int)sentLen; i++) {
      char c = (*sentence)[i]->form[0];
      if (isupper(c))
        acronym.push_back(c);
      else
        break;
    }
    if (acronym.size() > 1)
      acronyms.insert(acronym.c_str());
  }

  // Class-memory features: the word appears in the otherLast set or was
  // previously assigned one of the supersense classes.
  for (int i = 0; i < Resources::nClasses; i++) {
    if (otherLast[i].contains(word))
      features.add(otherLastTags[i]);
  }

  for (int i = 0; i < Resources::nClasses; i++) {
    if (prevClass[i].contains(word))
      features.add(prevTags[i]);
  }

  // Token surrounded by quotes or brackets.
  if (prevWord && nextWord &&
      strpbrk(prevWord, "\"()[]{}'<>") &&
      strpbrk(nextWord, "\"()[]{}'<>"))
    features.add(withinQuotes);

  if (allQuotes(word))
    insideQuotes = !insideQuotes;
}

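// Prepare for feature extraction on a sentence: remember the sentence and
// its zone, and categorize each token.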
void SstFeatureExtractor::analyze(Sentence* sent, int zone)
{
  sentence = sent;
  this->zone = zone;
  int len = sentence->size();
  tokenTypes.resize(len);
  insideQuotes = false;
  int i = 0;
  for (Sentence::const_iterator sit = sentence->begin();
       sit != sentence->end(); ++sit, ++i) {
    Token* tok = *sit;
    tokenTypes[i] = tokenCategorizer.analyze(tok->form.c_str());
  }
}

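// Clear the per-document state (word sets, acronyms, capitalization memory).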
void SstFeatureExtractor::reset()
{
  insideQuotes = false;

  for (int i = 0; i < Resources::nClasses; ++i) {
    prevClass[i].clear();
    otherLast[i].clear();
  }
  acronyms.clear();
  capitalized.clear();
}

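// Record the class assigned to the token at position and store it in the
// token's SSTAG attribute.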
void SstFeatureExtractor::classified(int position, char const* className)
{
  Token* token = (*sentence)[position];
  char const* word = token->form.c_str();
  designated(word, className);

  token->set("SSTAG", className);
}

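// Remember that the word received the given supersense tag: the two leading
// characters of the tag (its segment prefix) are skipped when matching the
// class name, and the word is added to the corresponding prevClass set.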
void SstFeatureExtractor::designated(char const* word, char const* ssTag)
{
  if (strlen(ssTag) < 2)
    return;
  for (int i = 0; i < Resources::nClasses; ++i) {
    if (!strcmp(ssTag+2, resources.classNames[i]))
      prevClass[i].insert(word);
  }
}

} // namespace SST
} // namespace Tanl