00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_NER_NerFeatureExtractor_H
00025 #define Tanl_NER_NerFeatureExtractor_H
00026
00027
00028 #include "conf/conf_set.h"
00029 #include "text/WordIndex.h"
00030 #include "text/WordSet.h"
00031 #include "text/Suffixes.h"
00032
00033
00034 #include "TokenCategorizer.h"
00035 #include "FeatureSpecs.h"
00036 #include "Corpus/Token.h"
00037 #include "classifier/Classifier.h"
00038 #include "classifier/FeatureExtractor.h"
00039
00040
00041 #include <cstring>
00042 #include <string>
00043 #include <vector>
00044
00045 namespace Tanl {
00046 namespace NER {
00047
00048
00049
00050
00051 typedef unordered_map<char const*, std::string> TagSet;
00052
00053 struct Resources
00054 {
00056 static IXE::conf_set<std::string> entityTypes;
00057
00059 Text::WordIndex classId;
00060
00061 Resources(char const* POStag, char const* NEtag) :
00062 POStag(POStag),
00063 NEtag(NEtag)
00064 { resize(); }
00065
00066 Resources(std::string& locale, char const* POStag, char const* NEtag) :
00067 language(locale.c_str()),
00068 POStag(POStag),
00069 NEtag(NEtag)
00070 { resize(); }
00071
00072 Resources(std::string& resourceDir, std::string& locale,
00073 char const* POStag, char const* NEtag);
00074
00075 char const* language;
00076 char const* POStag;
00077 char const* NEtag;
00078
00079 size_t typesCount() { return entityTypes->size(); }
00080
00081
00082 char const* typeName(EntityType et);
00083 TagSet prevTokenType;
00084 TagSet nextTokenType;
00085
00086
00087 std::vector<Tanl::Text::NormWordSet> dict;
00088 Tanl::Text::NormWordSet moneyDict;
00089 Tanl::Text::NormWordSet namesDict;
00090 Tanl::Text::NormWordSet timeDict;
00091 Tanl::Text::NormWordSet prodDict;
00092
00094 Tanl::Text::NormWordSet FWL;
00095
00099 std::vector<Tanl::Text::NormWordSet> designators;
00100
00104 std::vector<Tanl::Text::NormWordSet> preBigrams;
00105
00108 std::vector<Tanl::Text::NormWordSet> prefixes;
00109
00112 std::vector<Tanl::Text::Suffixes> suffixes;
00113
00116 std::vector<Tanl::Text::NormWordSet> firstWords;
00117
00120 std::vector<Tanl::Text::NormWordSet> lastWords;
00121
00125
00126
00132 std::vector<Tanl::Text::NormWordSet> lowerInterm;
00133
00135 void load(std::string& resourceDir);
00136
00143 template<class WordSet>
00144 void load(std::vector<WordSet>& sets, char const* file);
00145
00146 private:
00147 void resize();
00148
00149 };
00150
00151
00155
00156
00157 class NerFeatureExtractor :
00158 public Classifier::FeatureExtractor<Classifier::Features, const int>
00159 {
00160 public:
00161 NerFeatureExtractor(Resources& resources);
00162
00163 ~NerFeatureExtractor() { reset(); }
00164
00166 void analyze(Sentence* sent, int zone);
00167
00179 void extract(Classifier::Features& feats, const int& pos);
00180
00185 void reset();
00186
00191 void classified(int position, char const* className);
00192
00193 Resources& resources;
00194
00195 protected:
00196 bool insideQuotes;
00197 TokenCategorizer tokenCategorizer;
00198 std::vector<EntityType> tokenTypes;
00199
00200 Sentence* sentence;
00201
00202
00203
00205 unordered_map<string, bool> capitalized;
00206
00208 std::vector<Tanl::Text::NormWordSet> prevClass;
00209
00211 std::vector<Tanl::Text::NormWordSet> otherFirst;
00212
00214 std::vector<Tanl::Text::NormWordSet> otherLast;
00215
00217 Tanl::Text::NormWordSet acronyms;
00218
00219 private:
00220 void designated(char const* word, char const* neTag);
00221 int zone;
00222 };
00223
00224 extern char const* UnknownTag;
00225
00226 }
00227 }
00228
00229 #endif // Tanl_NER_NerFeatureExtractor_H