00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "SuffixGuesser.h"
00025 #include "serialization.h"
00026
00027
00028 #include <limits>
00029 #include <cmath>
00030
00031 using namespace std;
00032
00033 namespace Tanl { namespace POS {
00034
00035 Counts* Counts::update(TagID tag, int count)
00036 {
00037 if (tagCounts.find(tag) == tagCounts.end())
00038 tagCounts[tag] = count;
00039 else
00040 tagCounts[tag] += count;
00041 this->count += count;
00042 return this;
00043 }
00044
00045 void Counts::serialize(ostream &out)
00046 {
00047 POS::serialize(count, out);
00048 POS::serialize(tagCounts, out);
00049 }
00050
00051 void Counts::serialize(istream &in)
00052 {
00053 POS::serialize(count, in);
00054 POS::serialize(tagCounts, in);
00055 }
00056
00057
00058
00059 void TrieNode::serialize(std::ostream &out)
00060 {
00061 POS::serialize(terminal, out);
00062 if (tag_info) {
00063 POS::serialize(true, out);
00064 tag_info->serialize(out);
00065 } else
00066 POS::serialize(false, out);
00067
00068 POS::serialize(size(), out);
00069 FOR_EACH (TrieNode, (*this), it) {
00070 POS::serialize(it->first, out);
00071 it->second->serialize(out);
00072 }
00073 }
00074
00075 void TrieNode::serialize(std::istream &in)
00076 {
00077 POS::serialize(terminal, in);
00078 bool istag = false;
00079 POS::serialize(istag, in);
00080 if (istag) {
00081 tag_info = new Counts();
00082 tag_info->serialize(in);
00083 } else
00084 tag_info = 0;
00085
00086 size_t size;
00087 POS::serialize(size, in);
00088 for (size_t i = 0; i < size; i++) {
00089 char first;
00090 TrieNode *second = new TrieNode;
00091 POS::serialize(first, in);
00092 second->serialize(in);
00093 (*this)[first] = second;
00094 }
00095 }
00096
00097 bool TrieNode::empty_node()
00098 {
00099 return size() == 0 && tag_info == 0 && !terminal;
00100 }
00101
00102 void TrieNode::set_tag_info(Counts* tag)
00103 {
00104 if (tag_info && tag != tag_info)
00105 delete tag_info;
00106 tag_info = tag;
00107 }
00108
00109 TrieNode* TrieNode::add_char(Counts* legacy_counts, bool after_branch, int ix,
00110 int stop, string& word, TagID tag, int count)
00111 {
00112 if (!legacy_counts)
00113 legacy_counts = new Counts();
00114 Counts* inherited_counts = tag_info ? tag_info : legacy_counts;
00115
00116 if (empty_node() && ix < stop) {
00117
00118 terminal = true;
00119 } else if (size() == 1 && ix < stop) {
00120
00121 TrieNode* child = begin()->second;
00122 if (!child->empty_node() && !child->tag_info)
00123 child->tag_info = new Counts(inherited_counts);
00124 } else if (size() > 1 && ix < stop) {
00125
00126 } else if (empty_node()) {
00127
00128 Counts* temp = new Counts(inherited_counts);
00129 TrieNode* child = EMPTY_NODE->add_char(temp, false, ix - 1, stop, word, tag, count);
00130 (*this)[word[ix]] = child;
00131 } else if (terminal) {
00132
00133 TrieNode* child = EMPTY_NODE->add_char(0, true, ix - 1, stop, word, tag, count);
00134 terminal = false;
00135 (*this)[word[ix]] = child;
00136 } else if (size() == 1 && begin()->first == word[ix]) {
00137
00138 char c = word[ix];
00139 TrieNode* child = (*this)[c];
00140 child->add_char(new Counts(inherited_counts), false, ix - 1, stop, word, tag, count);
00141 } else if (size() == 1) {
00142
00143 TrieNode* child = begin()->second;
00144 if (!child->empty_node() && !child->tag_info)
00145 child->tag_info = new Counts(inherited_counts);
00146 TrieNode* newchild = EMPTY_NODE->add_char(0, true, ix - 1, stop, word, tag, count);
00147 (*this)[word[ix]] = newchild;
00148 } else {
00149
00150 char c = word[ix];
00151 if (find(c) == end())
00152 (*this)[c] = EMPTY_NODE->add_char(0, true, ix - 1, stop, word, tag, count);
00153 else
00154 (*this)[c]->add_char(new Counts(inherited_counts), true, ix - 1, stop, word, tag, count);
00155 }
00156
00157 if (after_branch)
00158 set_tag_info(inherited_counts->update(tag, count));
00159 if (tag_info != legacy_counts)
00160 delete legacy_counts;
00161
00162 return this;
00163 }
00164
00165
00166
00167 void SuffixGuesser::serialize(std::ostream &out)
00168 {
00169 POS::serialize(theta, out);
00170 trie.serialize(out);
00171 }
00172
00173 void SuffixGuesser::serialize(std::istream &in)
00174 {
00175 POS::serialize(theta, in);
00176 trie.serialize(in);
00177 }
00178
00179 void SuffixGuesser::add_word(int n, string& word, int tag, int count)
00180 {
00181 int start = word.length() - 1;
00182 int stop = max(0, start - n + 1);
00183 trie = *(trie.add_char(0, true, start, stop, word, tag, count));
00184 }
00185
00186 double SuffixGuesser::tagprob(string& word, int tagid)
00187 {
00188 double theta_plus_one = theta + 1.0;
00189 double accu = 0.0;
00190 TrieNode::counts_iterator it(word, &trie);
00191 Counts* tag_info;
00192 while ((tag_info = it.next())) {
00193 int suff_count = tag_info->count;
00194 TagCounts& tag_counts = tag_info->tagCounts;
00195 TagCounts::iterator cit = tag_counts.find(tagid);
00196 double tag_prob = 0.0;
00197 if (cit != tag_counts.end())
00198 tag_prob = cit->second / suff_count;
00199 accu = (accu + tag_prob * theta) / theta_plus_one;
00200 }
00201 return log(accu);
00202 }
00203
00204 double SuffixGuesser::tagprobs(string& word, vector<double>& probs)
00205 {
00206 double theta_plus_one = theta + 1.0;
00207
00208 for (unsigned int i = 0; i < probs.size(); ++i)
00209 probs[i] = 0.0;
00210
00211 TrieNode::counts_iterator it(word, &trie);
00212 Counts* tag_info;
00213 while ((tag_info = it.next())) {
00214 int suff_count = tag_info->count;
00215 TagCounts& tag_counts = tag_info->tagCounts;
00216 FOR_EACH (TagCounts, tag_counts, tagit) {
00217 int tagid = tagit->first;
00218 int freq = tagit->second;
00219 double tag_prob = double(freq) / suff_count;
00220 probs[tagid] = (probs[tagid] + tag_prob * theta) / theta_plus_one;
00221 }
00222 }
00223
00224
00225
00226
00227
00228
00229 double max = -1 * numeric_limits<double>::infinity();
00230 for (unsigned int i = 0; i < probs.size(); ++i) {
00231
00232
00233
00234
00235 double prob = log(probs[i]);
00236 probs[i] = prob;
00237 if (prob > max)
00238 max = prob;
00239 }
00240 return max;
00241 }
00242
00243 double SuffixGuesser::calculate_theta(vector<double>& apriori_tag_probs)
00244 {
00245
00246
00247
00248 double p_av = 0.0;
00249 for (unsigned int i = 0; i < apriori_tag_probs.size(); i++) {
00250 double tagp = apriori_tag_probs[i];
00251 p_av += tagp * tagp;
00252 }
00253 double theta = 0.0;
00254 for (unsigned int i = 0; i < apriori_tag_probs.size(); i++) {
00255 double tagp = apriori_tag_probs[i];
00256 double d = tagp - p_av;
00257 theta += tagp * d * d;
00258 }
00259 return sqrt(theta);
00260 }
00261
00262 }
00263 }