tanl: tag/NER/NerFeatureExtractor.h Source File

00001 /*
00002 **  Tanl
00003 **  NER/NerFeatureExtractor.h
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of Tanl.
00009 **
00010 **  Tanl is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  Tanl is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #ifndef Tanl_NER_NerFeatureExtractor_H
00025 #define Tanl_NER_NerFeatureExtractor_H
00026 
00027 // Library
00028 #include "conf/conf_set.h"
00029 #include "text/WordIndex.h"
00030 #include "text/WordSet.h"
00031 #include "text/Suffixes.h"
00032 
00033 // Local
00034 #include "TokenCategorizer.h"
00035 #include "FeatureSpecs.h"
00036 #include "Corpus/Token.h"
00037 #include "classifier/Classifier.h"
00038 #include "classifier/FeatureExtractor.h"
00039 
00040 // Standard
00041 #include <cstring>
00042 #include <string>
00043 #include <vector>
00044 
00045 namespace Tanl {
00046 namespace NER {
00047 
00048 //======================================================================
00049 
00050 // Used to get the encoding of a tag corresponding to previous/next positions.
00051 typedef unordered_map<char const*, std::string> TagSet;
00052 
00053 struct Resources
00054 {
00056   static IXE::conf_set<std::string>     entityTypes;
00057 
00059   Text::WordIndex       classId;
00060 
00061   Resources(char const* POStag, char const* NEtag) :
00062     POStag(POStag),
00063     NEtag(NEtag)
00064   { resize(); }
00065 
00066   Resources(std::string& locale, char const* POStag, char const* NEtag) :
00067     language(locale.c_str()),
00068     POStag(POStag),
00069     NEtag(NEtag)
00070   { resize(); }
00071 
00072   Resources(std::string& resourceDir, std::string& locale,
00073             char const* POStag, char const* NEtag);
00074 
00075   char const*           language;
00076   char const*           POStag;
00077   char const*           NEtag;
00078 
00079   size_t        typesCount() { return entityTypes->size(); }
00080 
00081   // Token Types
00082   char const*   typeName(EntityType et);
00083   TagSet        prevTokenType; // feature Type for prev word
00084   TagSet        nextTokenType; // feature Type for next word
00085 
00086   // Dictionaries
00087   std::vector<Tanl::Text::NormWordSet>  dict; // uni
00088   Tanl::Text::NormWordSet       moneyDict;
00089   Tanl::Text::NormWordSet       namesDict;
00090   Tanl::Text::NormWordSet       timeDict;
00091   Tanl::Text::NormWordSet       prodDict;
00092 
00094   Tanl::Text::NormWordSet       FWL;            // Frequent Word List
00095 
00099   std::vector<Tanl::Text::NormWordSet>  designators; // func
00100 
00104   std::vector<Tanl::Text::NormWordSet>  preBigrams; // ubi
00105 
00108   std::vector<Tanl::Text::NormWordSet>  prefixes; // wordPre
00109 
00112   std::vector<Tanl::Text::Suffixes>     suffixes; // wordSuf
00113 
00116   std::vector<Tanl::Text::NormWordSet> firstWords; // classPref
00117 
00120   std::vector<Tanl::Text::NormWordSet> lastWords; // classSuf
00121 
00125   //std::vector<Tanl::Text::NormWordSet>        afterWords;
00126 
00132   std::vector<Tanl::Text::NormWordSet> lowerInterm;
00133 
00135   void load(std::string& resourceDir);
00136 
00143   template<class WordSet>
00144   void          load(std::vector<WordSet>& sets, char const* file);
00145 
00146 private:
00147   void          resize();
00148 
00149 };
00150 
00151 //======================================================================
00155 //======================================================================
00156 
00157 class NerFeatureExtractor :
00158  public Classifier::FeatureExtractor<Classifier::Features, const int>
00159 {
00160 public:
00161   NerFeatureExtractor(Resources& resources);
00162 
00163   ~NerFeatureExtractor() { reset(); }
00164 
00166   void          analyze(Sentence* sent, int zone);
00167 
00179   void  extract(Classifier::Features& feats, const int& pos);
00180 
00185   void  reset();
00186 
00191   void  classified(int position, char const* className);
00192 
00193   Resources&            resources;
00194 
00195 protected:
00196   bool                  insideQuotes;
00197   TokenCategorizer      tokenCategorizer;       
00198   std::vector<EntityType>       tokenTypes; 
00199 
00200   Sentence*             sentence; 
00201 
00202   // Global Maps
00203 
00205   unordered_map<string, bool>   capitalized; // firstInitCap
00206 
00208   std::vector<Tanl::Text::NormWordSet>  prevClass; // NCPrevOccur
00209 
00211   std::vector<Tanl::Text::NormWordSet>  otherFirst; // otherNCF
00212 
00214   std::vector<Tanl::Text::NormWordSet>  otherLast; // otherNCL
00215 
00217   Tanl::Text::NormWordSet       acronyms;
00218 
00219 private:
00220   void designated(char const* word, char const* neTag);
00221   int           zone;   // zone within document
00222 };
00223 
// Tag string declared here and defined in another translation unit.
// NOTE(review): presumably the tag used when no class is known -- confirm.
extern const char* UnknownTag;
00225 
00226 } // namespace NER
00227 } // namespace Tanl
00228 
00229 #endif // Tanl_NER_NerFeatureExtractor_H