00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_SST_SstFeatureExtractor_H
00025 #define Tanl_SST_SstFeatureExtractor_H
00026
00027
00028 #include "text/WordIndex.h"
00029 #include "text/WordSet.h"
00030 #include "text/Suffixes.h"
00031
00032
00033 #include "TokenCategorizer.h"
00034 #include "Corpus/Token.h"
00035 #include "classifier/Classifier.h"
00036 #include "classifier/FeatureExtractor.h"
00037
00038
00039 #include <cstring>
00040 #include <string>
00041 #include <vector>
00042
00043 namespace Tanl {
00044 namespace SST {
00045
00046
/**
 * Specification of one feature: a name plus a list of integers.
 *
 * NOTE(review): the integers are presumably token positions parsed from the
 * constructor's textual `tokens` argument -- confirm against the constructor
 * definition, which is not part of this header.
 */
struct FeatureSpec
{
    /// Builds a spec from a feature name and a textual token list
    /// (defined out of line).
    FeatureSpec(const char* name, const char* tokens);

    /// Feature name, held as a raw C-string pointer.
    const char* name;

    /// Integer values attached to this feature.
    std::vector<int> tokens;
};
00052
00053 struct FeatureSpecs : public std::vector<FeatureSpec>
00054 {
00055 FeatureSpecs();
00056 };
00057
00058
00059
00060
00061 typedef unordered_map<char const*, std::string> TagSet;
00062
00063 #define NUM_CLASSES 46
00064
00065 struct Resources
00066 {
00068 static char const* classNames[];
00069
00071 Text::WordIndex classId;
00072
00074 static const int nClasses = NUM_CLASSES;
00075
00076 Resources() {}
00077
00078 Resources(std::string& locale) : language(locale.c_str())
00079 { }
00080
00081 Resources(std::string& resourceDir, std::string& locale);
00082
00083 char const* language;
00084
00085
00086 char const* typeName(EntityType et);
00087 TagSet prevTokenType;
00088 TagSet nextTokenType;
00089
00091 Tanl::Text::NormWordSet FWL;
00092
00096 Tanl::Text::NormWordSet designators[NUM_CLASSES];
00097
00101 Tanl::Text::NormWordSet preBigrams[NUM_CLASSES];
00102
00105 Tanl::Text::Suffixes suffixes[NUM_CLASSES];
00106
00109 Tanl::Text::NormWordSet lastWords[NUM_CLASSES];
00110
00116 Tanl::Text::NormWordSet lowerInterm[NUM_CLASSES];
00117
00119 void load(std::string& resourceDir);
00120
00127 template<class WordSet>
00128 void load(WordSet* sets, char const* file);
00129 };
00130
00131
00135
00136
00137 class SstFeatureExtractor :
00138 public Classifier::FeatureExtractor<Classifier::Features, const int>
00139 {
00140 public:
00141 SstFeatureExtractor(Resources& resources);
00142
00143 ~SstFeatureExtractor() { reset(); }
00144
00146 void analyze(Sentence* sent, int zone);
00147
00159 void extract(Classifier::Features& feats, const int& pos);
00160
00165 void reset();
00166
00171 void classified(int position, char const* className);
00172
00173 protected:
00174 Resources& resources;
00175 bool insideQuotes;
00176 TokenCategorizer tokenCategorizer;
00177 std::vector<EntityType> tokenTypes;
00178
00179 Sentence* sentence;
00180
00181
00182
00184 unordered_map<string, bool> capitalized;
00185
00187 Tanl::Text::NormWordSet prevClass[NUM_CLASSES];
00188
00190 Tanl::Text::NormWordSet otherLast[NUM_CLASSES];
00191
00193 Tanl::Text::NormWordSet acronyms;
00194
00195 private:
00196 void designated(char const* word, char const* ssTag);
00197 FeatureSpecs featureSpecs;
00198 int zone;
00199 };
00200
// Character-class predicates over a C string; all are defined elsewhere.
// NOTE(review): the meanings below are read off the names only -- confirm
// against the definitions (in particular how an empty string is treated).
bool allUpper(const char* s);       // presumably: every letter is upper case
bool mixedCase(const char* s);      // presumably: both upper- and lower-case letters occur
bool noLetter(const char* s);       // presumably: no alphabetic character occurs
bool containsDigit(const char* s);  // presumably: at least one digit occurs
bool allDigits(const char* s);      // presumably: every character is a digit
bool allQuotes(const char* s);      // presumably: every character is a quote mark
00207
// Constants shared with other translation units; the definitions are not in
// this header.
extern const char* UnknownTag;

// NOTE(review): presumably labels for location / organization / person
// contexts -- confirm against the definitions.
extern const std::string otherPrevLoc;
extern const std::string otherPrevOrg;
extern const std::string otherPrevPers;
00212
00213 }
00214 }
00215
00216 #endif // Tanl_SST_SstFeatureExtractor_H