00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifdef _WIN32
00025 #include "lib/strtok_r.h"
00026 #endif
00027
00028
00029 #include "conf/conf_int.h"
00030 #include "conf/conf_float.h"
00031 #include "text/WordIndex.h"
00032
00033 #include "Parser.h"
00034 #include "ap.h"
00035 #include "EventStream.h"
00036
00037
00038 #include <list>
00039
00040
00041
00042
00043 using namespace std;
00044
00045 namespace Parser {
00046
00048 IXE::conf<int> apIterations("ApIterations", 20);
00050 IXE::conf<float> apPercent("ApPercent", 0.1F);
00052 IXE::conf<int> partitionSize("ApPartition", 1);
00053
00056
00062 struct ApParser : public Parser
00063 {
00064 ApParser(char const* modelFile, int iter = 0);
00065
00066 void train(SentenceReader* sentenceReader, char const* modelFile);
00067 Sentence* parse(Sentence* sentence);
00068 void revise(SentenceReader* sentenceReader, char const* actionFile = 0);
00069
00070 APSV ap;
00071 int iter;
00072 };
00073
00079 Parser* ApParserFactory(char const* modelFile = 0)
00080 {
00081 return new ApParser(modelFile, apIterations);
00082 }
00083
00084 REGISTER_PARSER(AP, ApParserFactory);
00085
00086 ApParser::ApParser(char const* modelFile, int iter) :
00087 Parser(ap.predIndex),
00088 iter(iter)
00089 {
00090 AP::verbose = Parser::verbose;
00091 AP::updatePercent = apPercent;
00092
00093
00094 if (!modelFile)
00095 return;
00096 ifstream ifs(modelFile);
00097 if (!ifs)
00098 throw IXE::FileError(string("Missing model file: ") + modelFile);
00099
00100 readHeader(ifs);
00101 ap.load(ifs);
00102
00103 info.load(ifs);
00104 ifs.close();
00105 }
00106
00107 void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
00108 {
00109 WordIndex labelIndex;
00110 vector<string> labels;
00111
00112 vector<string> predLabels;
00113
00114
00115 list<Tanl::Classifier::Event*> events;
00116
00117 WordCounts predCount;
00118 int evCount = 0;
00119 Tanl::Classifier::PID pID = 1;
00120
00121
00122 EventStream eventStream(sentenceReader, &info);
00123 while (eventStream.hasNext()) {
00124 Tanl::Classifier::Event* ev = eventStream.next();
00125 events.push_back(ev);
00126 evCount++;
00127 if (verbose) {
00128 if (evCount % 10000 == 0)
00129 cerr << '+' << flush;
00130 else if (evCount % 1000 == 0)
00131 cerr << '.' << flush;
00132 }
00133 vector<string>& ec = ev->features;
00134 for (unsigned j = 0; j < ec.size(); j++) {
00135 string& pred = ec[j];
00136
00137 if (predIndex.find(pred.c_str()) == predIndex.end()) {
00138
00139 WordCounts::iterator wcit = predCount.find(pred);
00140
00141 int count;
00142 if (wcit == predCount.end())
00143 count = predCount[pred] = 1;
00144 else
00145 count = ++wcit->second;
00146 if (count >= featureCutoff) {
00147 predLabels.push_back(pred);
00148 predIndex[pred.c_str()] = pID++;
00149 predCount.erase(pred);
00150 }
00151 }
00152 }
00153 }
00154 if (verbose)
00155 cerr << endl;
00156
00157
00158 Cases cases;
00159 cases.reserve(evCount);
00160 int n = 0;
00161 Tanl::Classifier::ClassID oID = 0;
00162 while (!events.empty()) {
00163 Tanl::Classifier::Event* ev = events.front();
00164 events.pop_front();
00165 cases.push_back(Case());
00166 X& x = cases[n].first;
00167
00168 vector<string>& ec = ev->features;
00169 char const* c = ev->className.c_str();
00170 for (unsigned j = 0; j < ec.size(); j++) {
00171 string& pred = ec[j];
00172 WordIndex::const_iterator pit = predIndex.find(pred.c_str());
00173 if (pit != predIndex.end()) {
00174 x.push_back(pit->second);
00175 }
00176 }
00177 if (x.size()) {
00178 if (labelIndex.find(c) == labelIndex.end()) {
00179 labelIndex[c] = oID++;
00180 labels.push_back(c);
00181 }
00182 cases[n].second = labelIndex[c];
00183 n++;
00184 if (verbose) {
00185 if (n % 10000 == 0)
00186 cerr << '+' << flush;
00187 else if (n % 1000 == 0)
00188 cerr << '.' << flush;
00189 }
00190 x.push_back(0);
00191 }
00192 delete ev;
00193 }
00194 cases.resize(n);
00195 if (verbose)
00196 cerr << endl;
00197
00198 int predSize = predLabels.size();
00199 predSize++;
00200 APSV ap(labels.size(), predSize);
00201
00202 ofstream ofs(modelFile, ios::binary | ios::trunc);
00203
00204 writeHeader(ofs);
00205
00206 ofs << labels.size() << endl;
00207 FOR_EACH (vector<string>, labels, pit)
00208 ofs << *pit << endl;
00209
00210 ofs << predLabels.size() << endl;
00211 FOR_EACH (vector<string>, predLabels, pit)
00212 ofs << *pit << endl;
00213
00214 predIndex.clear();
00215 WordIndex().swap(predIndex);
00216 labelIndex.clear();
00217 WordIndex().swap(labelIndex);
00218
00219 info.clearRareEntities();
00220
00221 ap.train(cases, iter);
00222
00223 ap.save(ofs);
00224
00225 info.save(ofs);
00226 }
00227
00228 Sentence* ApParser::parse(Sentence* sentence)
00229 {
00230 preprocess(sentence);
00231 ParseState state(*sentence, &info, predIndex);
00232
00233 while (state.hasNext()) {
00234 Tanl::Classifier::Context& pIDs = *state.next();
00235 X x;
00236 x.push_back(0);
00237
00238 for (unsigned i = 0; i < pIDs.size(); i++) {
00239 x.push_back(pIDs[i]);
00240 }
00241 Y prediction = ap.predict(x);
00242 string& outcome = ap.labels[prediction];
00243 if (!state.transition(outcome.c_str()))
00244 state.transition("S");
00245 }
00246 return state.getSentence();
00247 }
00248
00249 void ApParser::revise(SentenceReader* sentenceReader, char const* actionFile)
00250 {
00251 if (actionFile) {
00252
00253 ifstream ifs(actionFile);
00254
00255 ReviseContextStream contextStream(sentenceReader, predIndex);
00256
00257 char line[4000];
00258 while (contextStream.hasNext()) {
00259 ++contextStream.cur;
00260 ifs.getline(line, sizeof(line));
00261 char* next = line;
00262 char const* outcome = strtok_r(0, " \t", &next);
00263 contextStream.actions.push_back(outcome);
00264 }
00265 } else {
00266 ReviseContextStream contextStream(sentenceReader, predIndex);
00267
00268 while (contextStream.hasNext()) {
00269 Tanl::Classifier::Context& pIDs = *contextStream.next();
00270 X x;
00271 x.push_back(0);
00272
00273 for (unsigned i = 0; i < pIDs.size(); i++) {
00274 x.push_back(pIDs[i]);
00275 }
00276 FeatureID prediction = ap.predict(x);
00277 string& outcome = ap.labels[prediction];
00278 contextStream.actions.push_back(outcome);
00279 }
00280 }
00281 }
00282
00283 }