00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_Classifier_MaxEnt_H
00025 #define Tanl_Classifier_MaxEnt_H
00026
00027 #include "Classifier.h"
00028 #include "include/unordered_map.h"
00029 #include "text/strings.h"
00030
00031
00032 #include <list>
00033
00034
00035 #define SLACK
00036
00040 BEGIN_NAMESPACE_HASH
00041 template<> struct hash<std::pair<unsigned, unsigned> > {
00042 size_t operator()(const std::pair<unsigned, unsigned>& x) const {
00043 unsigned a = x.first + ~(x.first << 13);
00044 unsigned b = x.second + ~(x.second << 9);
00045 a += (b >> 13);
00046 return a ^ b;
00047
00048 }
00049 };
00050
00051 template<> struct hash<std::pair<unsigned, std::vector<unsigned> > > {
00052 size_t operator()(const std::pair<unsigned, std::vector<unsigned> >& x) const {
00053 const std::vector<unsigned>& v = x.second;
00054 int n = x.first;
00055 for (int i = 0; i < v.size(); ++i)
00056 n += v[i] * (i+1);
00057 return n + v.size() * 47;
00058 }
00059 };
00060 END_NAMESPACE_HASH
00061
00062 namespace Tanl {
00063 namespace Classifier {
00064
/// A (PID, ClassID) pair.  NOTE(review): presumably a predicate ID paired
/// with an outcome/class ID -- confirm against Classifier.h.
00065 typedef std::pair<PID, ClassID> Feature;
/// Hash map from a Feature to a double value; this is the type of MaxEnt::lambda.
00067 typedef unordered_map<Feature, double> FeatureMap;
/// Hash map from a string to an integer; this is the type of MaxEnt::counter.
00068 typedef unordered_map<std::string, int> WordCounts;
00069
00079 class MaxEnt : public Classifier
00080 {
00081
00082 public:
00083
00084 MaxEnt() { }
00085
00086 MaxEnt(int iterations, int cutoff) :
00087 iterations(iterations), cutoff(cutoff), pID(0),
00088 correctionConstant(1),
00089 correctionParam(0.0)
00090 { }
00091
00092 MaxEnt(char const* file);
00093
00094 MaxEnt(std::istream& ifs);
00095
00096 virtual ~MaxEnt();
00097
00109 void estimate(Context& context, double prob[]) {
00110 estimate((std::vector<PID>&)context, prob);
00111 }
00112
00124 void estimate(Features& features, double prob[]) {
00125 Context context(features, predIndex);
00126 estimate((std::vector<PID>&)context, prob);
00127 }
00128
00136 ClassID BestOutcome(double* ocs) const;
00137
00143 void load(std::istream& is);
00144
00148 void save(char const* file);
00149
00159 void read(EventStream& eventStream);
00160
00161 protected:
00162
00166 void readEvent(Event* ev);
00167
00174 ClassID estimate(const std::vector<PID>& predicates, double alpha[]);
00175
00176 FeatureMap lambda;
00177 PID numPreds;
00178 int numTokens;
00179 typedef unordered_map<std::pair<ClassID, std::vector<PID> >, int> EventMap;
00180
00181 EventMap eventMap;
00182
00183 unsigned cutoff;
00184 int iterations;
00185
00186 size_t correctionConstant;
00187 double correctionParam;
00188
00189
00190 std::list<Event*> events;
00191 WordCounts counter;
00192 int pID;
00193
00195 void loadZhang(std::istream& is);
00196
00197 friend std::ostream& operator <<(std::ostream& s, MaxEnt const& m);
00198
00199
00200
00201
00202 static int buildIndex(std::list<Event*>& events,
00203 Text::WordIndex& predIndex,
00204 EventMap& eventMap,
00205 std::vector<char const*>& outcomeLabels,
00206 int evCutoff, bool verbose);
00207 };
00208
/// Stream-insertion operator for MaxEnt (also declared a friend of MaxEnt);
/// defined elsewhere.
00209 std::ostream& operator <<(std::ostream& s, MaxEnt const& m);
00210
00211 }
00212 }
00213
00214 #endif // Tanl_Classifier_MaxEnt_H