00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "Tag.h"
00025 #include <cmath>
00026
00027 namespace Tanl { namespace POS {
00028
00029 template <class CT>
00030 void Context<CT>::add_word(std::vector<int>::const_iterator context, int n,
00031 CT& word)
00032 {
00033
00034 words[word]++;
00035 freq++;
00036 if (n > 0) {
00037 Context<CT>* cmap = childs[*context];
00038 if (!cmap) {
00039 cmap = new Context<CT>();
00040 childs[*context] = cmap;
00041 }
00042 cmap->add_word(++context, n-1, word);
00043 }
00044 }
00045
00046 template <class CT>
00047 void Context<CT>::serialize(std::ostream &out)
00048 {
00049 POS::serialize(freq, out);
00050 POS::serialize(childs.size(), out);
00051 FOR_EACH (typename CMap, childs, cit) {
00052 POS::serialize(cit->first, out);
00053 cit->second->serialize(out);
00054 }
00055 POS::serialize(words.size(), out);
00056 FOR_EACH (typename WordFreq, words, wit) {
00057 POS::serialize(wit->first, out);
00058 POS::serialize(wit->second, out);
00059 }
00060 }
00061
00062 template <class CT>
00063 void Context<CT>::serialize(std::istream &in)
00064 {
00065 POS::serialize(freq, in);
00066 size_t size;
00067 POS::serialize(size, in);
00068 for (size_t i = 0; i < size; i++) {
00069 int first;
00070 POS::serialize(first, in);
00071 Context<CT>* second = new Context<CT>;
00072 second->serialize(in);
00073 childs[first] = second;
00074 }
00075 POS::serialize(size, in);
00076 for (size_t i = 0; i < size; i++) {
00077 CT first;
00078 POS::serialize(first, in);
00079 double second;
00080 POS::serialize(second, in);
00081 words[first] = second;
00082 }
00083 }
00084
00085
00086
00087
00088 template <class CT>
00089 void Context<CT>::adjust_lambdas(std::vector<double>& lambdas,
00090 std::vector<Context<CT>*>& context_nodes,
00091 int max_level)
00092 {
00093 if (context_nodes.size() < max_level) {
00094
00095 FOR_EACH (typename CMap, this->childs, it) {
00096 Context<CT>* child = it->second;
00097 std::vector<Context<CT>*> path1(context_nodes);
00098 path1.push_back(child);
00099 child->adjust_lambdas(lambdas, path1, max_level);
00100 }
00101 return;
00102 }
00103
00104 WordFreq words = context_nodes[0]->get_words();
00105 FOR_EACH (typename WordFreq, words, wit) {
00106 CT const& word = wit->first;
00107 double freq = wit->second;
00108 double max = 0.0;
00109 int maxi = 0;
00110
00111 for (int i = 0; i < context_nodes.size(); ++i) {
00112 Context<CT>& node = *context_nodes[i];
00113 double word_freq = node.words[word];
00114
00115
00116
00117
00118 double ratio = (node.freq == 1.0 || word_freq == 1.0)
00119 ? -1.0
00120 : (word_freq - 1.0) / (node.freq - 1.0);
00121
00122 if (ratio > max) {
00123 max = ratio; maxi = i;
00124 }
00125 }
00126 lambdas[maxi] += freq;
00127 }
00128 }
00129
00134 template <class CT>
00135 std::vector<double> Context<CT>::calculate_lambdas(int level)
00136 {
00137 std::vector<double> lambdas(level+2);
00138 std::vector<Context<CT>*> path;
00139 adjust_lambdas(lambdas, path, level);
00140 lambdas[0] = 0.0;
00141
00142
00143 double sum = 0.0;
00144 for (int i = 0; i < lambdas.size(); i++)
00145 sum += lambdas[i];
00146 for (int i = 0; i < lambdas.size(); i++)
00147 lambdas[i] /= sum;
00148 return lambdas;
00149 }
00150
00166 template <class CT>
00167 void Context<CT>::counts_to_prob(std::vector<double>& lambdas)
00168 {
00169 if (words.empty())
00170 return;
00171 if (lambdas.empty())
00172 throw ProbError("Context::counts_to_prob: empty lambdas");
00173
00174
00175
00176
00177 TO_EACH (typename WordFreq, words, wit) {
00178 CT const& word = wit->first;
00179 double wfreq = wit->second;
00180 wit->second = lambdas[0] + lambdas[1] * (wfreq / freq);
00181 }
00182 std::vector<double>::const_iterator lit = lambdas.begin();
00183 lit += 2;
00184 FOR_EACH (typename CMap, childs, cit)
00185 cit->second->estimate_at_context(words, lit);
00186 }
00187
00188 template <class CT>
00189 void Context<CT>::estimate_at_context(WordFreq& parent_words,
00190 std::vector<double>::const_iterator lambdas)
00191 {
00192
00193
00194 double l = *lambdas;
00195
00196 TO_EACH (typename WordFreq, words, wit) {
00197 CT const& word = wit->first;
00198 double wfreq = wit->second;
00199 wit->second = parent_words[word] + l * (wfreq / freq);
00200 }
00201
00202
00203 FOR_EACH (typename CMap, childs, cit)
00204 cit->second->estimate_at_context(words, ++lambdas);
00205 }
00206
00207
00208 template <class CT>
00209 double Context<CT>::wordprob(CT const& word, std::vector<int>& context)
00210 {
00211 double prob = 0.0;
00212 Context<CT>* node = this;
00213 int i = 0;
00214 do {
00215 typename WordFreq::const_iterator wit = node->words.find(word);
00216 if (wit != node->words.end())
00217 prob = wit->second;
00218
00219 if (i < context.size() &&
00220 node->childs.find(context[i]) != node->childs.end())
00221 node = node->childs[context[i++]];
00222 else
00223 break;
00224 } while (true);
00225 return log(prob);
00226 }
00227
00228 }
00229 }