tanl: tanl: tag/SST/SstFeatureExtractor.h Source File

00001 /*
00002 **  Tanl
00003 **  SST/SstFeatureExtractor.h
00004 **  ----------------------------------------------------------------------
00005 **  Copyright (c) 2005  Giuseppe Attardi (attardi@di.unipi.it).
00006 **  ----------------------------------------------------------------------
00007 **
00008 **  This file is part of Tanl.
00009 **
00010 **  Tanl is free software; you can redistribute it and/or modify it
00011 **  under the terms of the GNU General Public License, version 3,
00012 **  as published by the Free Software Foundation.
00013 **
00014 **  Tanl is distributed in the hope that it will be useful,
00015 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017 **  GNU General Public License for more details.
00018 **
00019 **  You should have received a copy of the GNU General Public License
00020 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00021 **  ----------------------------------------------------------------------
00022 */
00023 
00024 #ifndef Tanl_SST_SstFeatureExtractor_H
00025 #define Tanl_SST_SstFeatureExtractor_H
00026 
00027 // Library
00028 #include "text/WordIndex.h"
00029 #include "text/WordSet.h"
00030 #include "text/Suffixes.h"
00031 
00032 // Local
00033 #include "TokenCategorizer.h"
00034 #include "Corpus/Token.h"
00035 #include "classifier/Classifier.h"
00036 #include "classifier/FeatureExtractor.h"
00037 
00038 // Standard
00039 #include <cstring>
00040 #include <string>
00041 #include <vector>
00042 
00043 namespace Tanl {
00044 namespace SST {
00045 
00046 // Representation of feature model: a feature name paired with a list of integers (`tokens`).
00047 struct FeatureSpec {
00048   FeatureSpec(char const* name, char const* tokens); // defined outside this header; NOTE(review): presumably parses the `tokens` string into the vector below -- confirm
00049   char const* name;             // feature name; raw pointer -- NOTE(review): ownership/lifetime of the string is not visible here
00050   std::vector<int>      tokens; // NOTE(review): presumably token positions relative to the current token -- confirm
00051 };
00052 // A vector of FeatureSpec entries with its own default constructor.
00053 struct FeatureSpecs : public std::vector<FeatureSpec>
00054 {
00055   FeatureSpecs(); // defined outside this header; NOTE(review): presumably fills in the feature model -- confirm
00056 };
00057 
00058 //======================================================================
00059 
00060 // Used to get the encoding of a tag corresponding to previous/next positions (see Resources::prevTokenType / nextTokenType).
00061 typedef unordered_map<char const*, std::string> TagSet; // NOTE(review): keyed by char const* -- confirm the hash/equality in use compare string contents, not addresses
00062 // Number of classes: sizes the per-class arrays in Resources and SstFeatureExtractor and is the value of Resources::nClasses.
00063 #define NUM_CLASSES 46
00064 // Resources handed to SstFeatureExtractor: class names, tag sets and word lists; per-class tables hold NUM_CLASSES entries.
00065 struct Resources
00066 {
00068   static char const* classNames[]; // array of class names; defined outside this header
00069   // NOTE(review): presumably maps a class name to its numeric id -- confirm against Text::WordIndex.
00071   Text::WordIndex       classId;
00072   // Number of classes; the same value sizes the per-class arrays below.
00074   static const int nClasses = NUM_CLASSES;
00075   // Default construction sets no language: `language` is nulled so it never holds an indeterminate pointer.
00076   Resources() : language(0) {}
00077   // Keeps locale.c_str() without copying: `locale` must outlive this object and must not be modified meanwhile.
00078   Resources(std::string& locale) : language(locale.c_str())
00079   { }
00080   // Defined outside this header.
00081   Resources(std::string& resourceDir, std::string& locale);
00082   // Language name; not owned by this object (the one-argument constructor points it into the caller's string).
00083   char const*           language;
00084 
00085   // Token Types
00086   char const*   typeName(EntityType et);
00087   TagSet        prevTokenType; // feature Type for prev word
00088   TagSet        nextTokenType; // feature Type for next word
00089 
00091   Tanl::Text::NormWordSet       FWL;    // Frequent Word List
00092   // Per-class tables follow, one entry per class; the short trailing names are the original author's labels.
00096   Tanl::Text::NormWordSet       designators[NUM_CLASSES]; // func
00097 
00101   Tanl::Text::NormWordSet       preBigrams[NUM_CLASSES]; // ubi
00102 
00105   Tanl::Text::Suffixes          suffixes[NUM_CLASSES]; // wordSuf
00106 
00109   Tanl::Text::NormWordSet       lastWords[NUM_CLASSES]; // classSuf
00110 
00116   Tanl::Text::NormWordSet       lowerInterm[NUM_CLASSES];
00117   // Defined outside this header; NOTE(review): presumably loads the tables above from resourceDir -- confirm.
00119   void load(std::string& resourceDir);
00120   // Defined outside this header; NOTE(review): presumably fills the array `sets` from `file` -- confirm.
00127   template<class WordSet>
00128   void          load(WordSet* sets, char const* file);
00129 };
00130 
00131 //======================================================================
00135 //======================================================================
00136 // Feature extractor for SST: a Classifier::FeatureExtractor over Classifier::Features and const int. Members are defined outside this header unless shown inline.
00137 class SstFeatureExtractor :
00138  public Classifier::FeatureExtractor<Classifier::Features, const int>
00139 {
00140 public:
00141   SstFeatureExtractor(Resources& resources);
00142   // Destruction calls reset().
00143   ~SstFeatureExtractor() { reset(); }
00144   // NOTE(review): presumably records `sent` and `zone` and prepares per-sentence state before extract() -- confirm.
00146   void          analyze(Sentence* sent, int zone);
00147   // NOTE(review): presumably fills `feats` with the features of the token at `pos` -- confirm.
00159   void  extract(Classifier::Features& feats, const int& pos);
00160   // Also invoked by the destructor; NOTE(review): presumably clears accumulated state -- confirm.
00165   void  reset();
00166   // NOTE(review): presumably reports the class assigned to the token at `position` -- confirm.
00171   void  classified(int position, char const* className);
00172 
00173 protected:
00174   Resources&            resources;      // held by reference: the Resources object must outlive this extractor
00175   bool                  insideQuotes;
00176   TokenCategorizer      tokenCategorizer;       
00177   std::vector<EntityType>       tokenTypes; 
00178 
00179   Sentence*             sentence;       // raw pointer, same type as analyze()'s `sent`
00180 
00181   // Global Maps
00182   // Key type is written std::string, matching the qualification used elsewhere in this header.
00184   unordered_map<std::string, bool>   capitalized; // firstInitCap
00185 
00187   Tanl::Text::NormWordSet       prevClass[NUM_CLASSES]; // NCPrevOccur
00188 
00190   Tanl::Text::NormWordSet       otherLast[NUM_CLASSES]; // otherNCS
00191 
00193   Tanl::Text::NormWordSet       acronyms;
00194 
00195 private:
00196   void designated(char const* word, char const* ssTag);
00197   FeatureSpecs  featureSpecs;
00198   int           zone;   // zone within document
00199 };
00200 // Predicates over a C string `s`, defined outside this header; exact semantics (e.g. for empty or non-ASCII input) are not visible here.
00201 bool allUpper(char const* s);
00202 bool mixedCase(char const* s);
00203 bool noLetter(char const* s);
00204 bool containsDigit(char const* s);
00205 bool allDigits(char const* s);
00206 bool allQuotes(char const* s);
00207 // Declarations only: these objects are defined in another translation unit.
00208 extern char const* UnknownTag;
00209 extern std::string const otherPrevLoc;
00210 extern std::string const otherPrevOrg;
00211 extern std::string const otherPrevPers;
00212 
00213 } // namespace SST
00214 } // namespace Tanl
00215 
00216 #endif // Tanl_SST_SstFeatureExtractor_H