00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef Tanl_POS_hmm_tagger_H
00025 #define Tanl_POS_hmm_tagger_H
00026
00027 #include "linear_interpolated_lm.h"
00028 #include "lexicon.h"
00029 #include "MorphTable.h"
00030 #include "Viterbi.h"
00031 #include "Vocab.h"
00032 #include "SuffixGuesser.h"
00033
00034
00035
00036 namespace Tanl { namespace POS {
00037
// TagWeight pairs an int with a double; TagWeights is a vector of such pairs.
// NOTE(review): presumably (tag id, weight/probability) — neither alias is
// used elsewhere in this header, so confirm against the callers.
00038 typedef std::pair<int, double> TagWeight;
00039 typedef std::vector<TagWeight> TagWeights;
00040
// Model: publicly derives from HMM and aggregates the data members listed
// below (ProbLM, Lexicon, Vocab, SuffixGuesser instances plus scalar
// parameters). Only declarations appear in this header; every member function
// is defined elsewhere.
// NOTE(review): HMM, State, Observation and Transition are not declared in
// this header — presumably they come from "Viterbi.h"; confirm.
00041 struct Model : public HMM
00042 {
// Constructs a model from two int order parameters.
// NOTE(review): presumably stores them in the same-named members tag_order /
// emission_order and starts with empty statistics — confirm in the definition.
00043 Model(int tag_order, int emission_order);
00044
// Constructs a model from two C-string file names.
// NOTE(review): presumably loads a previously saved model from model_file and
// fills `morphtable` from morph_file — confirm in the definition.
00046 Model(char const* model_file, char const* morph_file);
00047
// Returns a vector of Transition for the given state and observation.
// Both arguments are taken by non-const reference.
// NOTE(review): presumably implements/overrides an HMM interface used by
// Viterbi — the base class is not visible here, verify.
00048 std::vector<Transition> transitions(State& from, Observation& obs);
00049
// Two overloads sharing one name, distinguished only by stream direction.
// NOTE(review): presumably the istream overload reads the model and the
// ostream overload writes it — confirm.
00050 void serialize(std::istream&);
00051 void serialize(std::ostream&);
00052
00053
// Order parameters; same names as the first constructor's arguments.
00054 int tag_order;
00055 int emission_order;
// tag_lm is keyed on int, obs_lm and spec_lm on std::string.
// NOTE(review): presumably tags are handled as int ids (see tag_vocab) while
// observations stay as strings; "spec" is not explained here — confirm.
00056 ProbLM<int> tag_lm;
00057 ProbLM<std::string> obs_lm;
00058 ProbLM<std::string> spec_lm;
// Lexicons paired by name with obs_lm / spec_lm.
00059 Lexicon obs_lex;
00060 Lexicon spec_lex;
00061 Vocab tag_vocab;
// NOTE(review): presumably suffix guessers for lower- and upper-case words
// (compare rare_low / rare_upp in HmmTagger::train_stat) — confirm.
00062 SuffixGuesser low_suffixes;
00063 SuffixGuesser upp_suffixes;
// NOTE(review): presumably ids of the end-/beginning-of-sentence markers;
// their values are assigned outside this header.
00064 int eos;
00065 int bos;
00066 std::vector<double> apriori_tag_probs;
00067 double theta;
00068
00069
// NOTE(review): HmmTagger also declares a private max_guessed_tags; how the
// two are kept in sync is not visible here.
00070 int max_guessed_tags;
00071 std::vector<double> suffix_accu;
00072
00073 private:
// Helper returning a vector of Transition built from a list of tag strings,
// a source state and an emission value.
// NOTE(review): presumably used by transitions(); confirm in the definition.
00074 std::vector<Transition> tags2transition(std::vector<std::string>& tags, State& from, double emission);
00075
// NOTE(review): type is spelled `Morphtable` while the included header is
// "MorphTable.h" — check that the spelling matches the declared class.
00076 Morphtable morphtable;
00077 };
00078
// HmmTagger: owns a Model, a train_stat counter block and a Viterbi object,
// and exposes functions for adding sentences, computing probabilities,
// building suffix tries, saving, and tagging. All member functions except
// train_stat's constructor are defined outside this header.
00079 class HmmTagger
00080 {
00081
00082 public:
// Constructs a tagger from two int order parameters (same parameter list as
// Model's first constructor).
// NOTE(review): presumably the training-mode constructor — confirm.
00084 HmmTagger(int tag_order, int emission_order);
00085
// Constructs a tagger from a model file and a morphology file (same first two
// parameters as Model's second constructor) plus max_guessed_tags and
// logtheta, which share names with the private members declared below.
// NOTE(review): presumably the tagging-mode constructor that loads a saved
// model — confirm in the definition.
00087 HmmTagger(char const* model_file, char const* morph_file,
00088 int max_guessed_tags, double logtheta);
00089
// Takes a word vector and a tag vector by non-const reference; returns bool.
// NOTE(review): presumably adds one tagged training sentence and returns
// false on rejection (e.g. size mismatch) — the meaning of the result is not
// visible here, confirm.
00095 bool add_sentence(std::vector<std::string>& words,
00096 std::vector<std::string>& tags);
00097
// NOTE(review): presumably finalizes model probabilities after all sentences
// have been added — confirm.
00098 void calculate_probs();
00099
00100 void build_suffixtries(int maxfreq, int maxlength);
00101
// NOTE(review): presumably prints the counters held in `stat` — confirm.
00102 void print_stat();
00103
00104 void save(char const* file_name);
00105
// Tags a sentence. `words` and `tags` are non-const references;
// `observations` is an optional pointer that defaults to 0 (null).
// NOTE(review): presumably `tags` is the output and `observations`, when
// non-null, receives the per-word observations — confirm.
00114 void tag_sentence(std::vector<std::string>& words,
00115 std::vector<std::string>& tags,
00116 std::vector<HMM::Observation>* observations = 0);
00117
00118 private:
00119
// Plain counter bundle; the default constructor sets every counter to 0.
00120 struct train_stat {
00121 int tokens;
00122 int sentences;
00123 int types;
00124 int rare_low;
00125 int rare_upp;
00126
00127 train_stat() :
00128 tokens(0),
00129 sentences(0),
00130 types(0),
00131 rare_low(0),
00132 rare_upp(0)
00133 { }
00134 };
00135
// Model declares only constructors that take arguments, so it has no default
// constructor: both HmmTagger constructors must initialize `m` explicitly in
// their member-initializer lists.
00136 Model m;
00137 train_stat stat;
00138 Viterbi StateViterbi;
00139
00140
// Same names as the trailing parameters of the second constructor.
00141 int max_guessed_tags;
00142 double logtheta;
00143
00144 };
00145
00146 }
00147 }
00148
00149 #endif // Tanl_POS_hmm_tagger_H