00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_POS_suffix_guesser_H
00025 #define Tanl_POS_suffix_guesser_H
00026
00027 #include "Tag.h"
00028
00029
00030 #include <map>
00031 #include <string>
00032 #include <vector>
00033
00034 namespace Tanl { namespace POS {
00035
/**
 * Expands to a heap allocation of a default-constructed TrieNode and yields
 * the raw pointer. Whoever stores the pointer is responsible for releasing
 * it; a pointer stored as a child inside another TrieNode's map is deleted
 * by that node's destructor.
 *
 * NOTE: being a preprocessor macro, this name is NOT scoped by the
 * enclosing Tanl::POS namespace and is visible to every file that includes
 * this header.
 */
00036 #define EMPTY_NODE (new TrieNode())
00037
00038
/**
 * Ordered map from a tag identifier to an integer value.
 * Presumably the value is the number of times the tag was observed --
 * confirm against the definition of Counts::update.
 */
00047 typedef std::map<TagID, int> TagCounts;
00048
00056 struct Counts
00057 {
00063 Counts() : count(0) {}
00064
00072 Counts(Counts* cpy) :
00073 count(cpy->count)
00074 {
00075 TagCounts temp(cpy->tagCounts);
00076 tagCounts = temp;
00077 }
00078
00084 void serialize(std::ostream &out);
00085
00091 void serialize(std::istream &in);
00092
00104 Counts* update(TagID tag, int count);
00105
00106 int count;
00107 TagCounts tagCounts;
00108 };
00109
00119 struct TrieNode : public std::map<char, TrieNode*>
00120 {
00121 Counts* tag_info;
00122 bool terminal;
00123
00129 TrieNode() : tag_info(0), terminal(false) { }
00130
00131 ~TrieNode()
00132 {
00133 if (tag_info) delete tag_info;
00134 FOR_EACH(TrieNode, (*this), it)
00135 delete it->second;
00136 }
00137
00145 void set_tag_info(Counts* tag);
00146
00152 void serialize(std::ostream &out);
00153
00159 void serialize(std::istream &in);
00160
00206 TrieNode* add_char(Counts* legacy_counts, bool after_branch, int ix,
00207 int stop, std::string& word, TagID tag, int count);
00208
00216 bool empty_node();
00217
00223 struct counts_iterator
00224 {
00225 std::string& word;
00226 int stop;
00227 int ix;
00228 TrieNode* node;
00229 Counts* legacy_counts;
00230 Counts empty_counts;
00231
00240 counts_iterator(std::string& word, TrieNode* node) :
00241 word(word),
00242 ix(word.length() - 1),
00243 stop(0),
00244 node(node),
00245 legacy_counts(&empty_counts)
00246 { }
00247
00253 Counts* next()
00254 {
00255
00256 if (!node)
00257 return 0;
00258
00259
00260 Counts* ct;
00261 if (node->tag_info)
00262 ct = node->tag_info;
00263 else
00264 ct = legacy_counts;
00265
00266
00267 if (ix >= stop)
00268 {
00269 legacy_counts = ct;
00270 TrieNode::const_iterator tit = node->find(word[ix]);
00271 if (tit == node->end())
00272 node = 0;
00273 else
00274 node = tit->second;
00275 ix--;
00276 }
00277 else
00278 node = 0;
00279
00280
00281 return ct;
00282 }
00283 };
00284 };
00285
00298 struct SuffixGuesser
00299 {
00300 double theta;
00301 TrieNode trie;
00302 Counts empty_counts;
00303
00309 void serialize(std::ostream &out);
00310
00316 void serialize(std::istream &in);
00317
00326 void add_word(int n, std::string& word, TagID tag, int count);
00327
00331 double tagprob(std::string& word, int tagid);
00332
00336 double tagprobs(std::string& word, std::vector<double>& probs);
00337
00341 static double calculate_theta(std::vector<double>& apriori_tag_probs);
00342 };
00343
00344 }
00345 }
00346
00347 #endif // Tanl_POS_suffix_guesser_H