00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "conf/conf_bool.h"
00026 #include "conf/conf_int.h"
00027 #include "conf/conf_float.h"
00028 #include "conf/conf_string.h"
00029 #include "io/File.h"
00030
00031 #include "SST.h"
00032 #include "SstEventStream.h"
00033
00034
00035 #define MODEL GIS
00036 #if (MODEL == GIS)
00037 #include "classifier/GIS.h"
00038 #else
00039 #include "classifier/LBFGS.h"
00040 #endif
00041
00042
00043 #include <assert.h>
00044 #include <math.h>
00045 #include <limits>
00046
00047 using namespace std;
00048 using namespace Tanl::Text;
00049 using namespace Tanl::Text::RegExp;
00050 using namespace Tanl::Classifier;
00051 using namespace IXE;
00052
00053 namespace Tanl {
00054 namespace SST {
00055
00057
00059
00061 conf<string> resourceDir("ResourceDir", "Resource");
00063 conf<string> language("Language", "italian");
00065 conf<int> cutoff("Cutoff", 2);
00067 conf<int> iter("Iterations", 100);
00069 conf<float> alpha("Alpha", 0.0);
00071 conf<bool> verbose("Verbose", false);
00072
00073 SST::SST(char const* modelFile, char const* configFile) :
00074 model(0),
00075 resources(*language)
00076 {
00077
00078 if (configFile && IXE::io::File(configFile).exists())
00079 IXE::Configuration::load(configFile);
00080 if (IXE::io::File(modelFile).exists())
00081 model = new MaxEnt(modelFile);
00082 resources.load(*resourceDir);
00083 }
00084
00085 SST::~SST() { delete model; }
00086
00087 Text::RegExp::Pattern datePat("[12][0-9][0-9][0-9]-[0-9]+-[0-9]+");
00088
00089 bool checkDate(std::vector<Token*>& sent)
00090 {
00091
00092 for (int i = 0; i < sent.size(); i++)
00093 if (datePat.test(sent[i]->form))
00094 return true;
00095 return false;
00096 }
00097
00098 void SST::train(SentenceReader* sentenceReader, char const* modelFile)
00099 {
00100 MODEL trainer(iter, cutoff, alpha);
00101 trainer.verbose = verbose;
00102
00103
00104
00105 SstEventStream eventStream(resources);
00106
00107 while (sentenceReader->MoveNext()) {
00108 std::vector<Token*>* sent = sentenceReader->Current();
00109 if (sent->size() == 1 && (*sent)[0]->form == "-DOCSTART-") {
00110 FOR_EACH (std::vector<Token*>, *sent, sit)
00111 delete *sit;
00112 delete sent;
00113 eventStream.reset();
00114 continue;
00115 }
00116 eventStream.analyze(sent);
00117 trainer.read(eventStream);
00118
00119 FOR_EACH (std::vector<Token*>, *sent, sit)
00120 delete *sit;
00121 delete sent;
00122 }
00123
00124 trainer.train();
00125
00126
00127 trainer.save(modelFile);
00128 }
00129
00130 Enumerator<std::vector<Token*>*>* SST::pipe(Enumerator<std::vector<Token*>*>& sen)
00131 {
00132 return new SSTPipe(*this, sen);
00133 }
00134
00135
00136
/// Names of the supersense categories.
static char const* superSenses[] = {
    "adj.all", "adj.pert", "adj.ppl", "adv.all", "noun.Tops", "noun.act",
    "noun.animal", "noun.artifact", "noun.attribute", "noun.body",
    "noun.cognition", "noun.communication", "noun.event", "noun.feeling",
    "noun.food", "noun.group", "noun.location", "noun.motive",
    "noun.object", "noun.other", "noun.person", "noun.phenomenon",
    "noun.plant", "noun.possession", "noun.process", "noun.quantity",
    "noun.relation", "noun.shape", "noun.state", "noun.substance",
    "noun.time", "verb.body", "verb.change", "verb.cognition",
    "verb.communication", "verb.competition", "verb.consumption",
    "verb.contact", "verb.creation", "verb.emotion", "verb.motion",
    "verb.perception", "verb.possession", "verb.social",
    "verb.stative", "verb.weather"
};

/// Number of supersense categories.
static const int nSenses = sizeof(superSenses) / sizeof(superSenses[0]);
/// Number of tags: "O" plus a "B-" and an "I-" tag for each category.
static const int ssTags = 2 * nSenses + 1;
/// Tag table: [0] is "O", [1 .. nSenses] the "B-" tags and
/// [nSenses + 1 .. 2 * nSenses] the "I-" tags, in superSenses order.
static char const* tags[ssTags];

/// Return a heap copy (strdup) of @p prefix followed by @p sense.
static char const* makeTag(char const* prefix, char const* sense)
{
    std::string const label = std::string(prefix) + sense;
    return strdup(label.c_str());
}

/**
 * Fill the tag table.
 * The strings are allocated once and never released.
 * @return always 0; the value only serves to initialize dummyInitVar.
 */
static int initTags()
{
    tags[0] = "O";

    char const** beginTags = tags + 1;
    char const** insideTags = tags + 1 + nSenses;
    for (int s = 0; s < nSenses; ++s) {
        beginTags[s] = makeTag("B-", superSenses[s]);
        insideTags[s] = makeTag("I-", superSenses[s]);
    }
    return 0;
}

/// Its initializer runs initTags() during static initialization.
static int dummyInitVar = initTags();
00171
00172
00173
00174 SSTPipe::SSTPipe(SST& sst, Enumerator<std::vector<Token*>*>& se) :
00175 sst(sst),
00176 se(se),
00177 eventStream(sst.resources)
00178 {
00179
00180 int nTags = sst.model->NumOutcomes();
00181 for (int i = 0; i < nTags; ++i) {
00182 char const* tag = sst.model->OutcomeName(i);
00183 for (int j = 0; j < ssTags; ++j)
00184 if (!strcmp(tag, tags[j])) {
00185 outcomeId[j] = i;
00186 break;
00187 }
00188 }
00189 }
00190
00191 bool SSTPipe::MoveNext()
00192 {
00193 while (se.MoveNext()) {
00194 std::vector<Token*>* sent = se.Current();
00195 if (sent->size() == 1 && (*sent)[0]->form == "-DOCSTART-") {
00196 eventStream.reset();
00197 delete sent;
00198 } else
00199 return true;
00200 }
00201 return false;
00202 }
00203
00204
00205 std::vector<Token*>* SSTPipe::Current()
00206 {
00207 std::vector<Token*>* sentence = se.Current();
00208 int len = sentence->size();
00209
00210 for (int i = 0; i < len; i++)
00211 (*sentence)[i]->set("SSTAG", "");
00212 eventStream.analyze(sentence);
00213
00214 int nTags = sst.model->NumOutcomes();
00215 string predTags[len];
00216 double best[len];
00217 best[0] = -numeric_limits<double>::infinity();
00218 ClassID O = sst.model->OutcomeID("O");
00219 static const int beam = 5;
00220 string tag[len];
00221 double p[beam][nTags];
00222 int n = 0;
00223
00224 while (eventStream.hasNext()) {
00225 Classifier::Event* event = eventStream.next();
00226 # ifndef NO_BEAM_SEARCH
00227 sst.model->estimate(event->features, p[n % beam]);
00228 delete event;
00229
00230 if (n < beam-1 && n != len-1) {
00231 n++;
00232 continue;
00233 }
00234
00235
00236 int iFirst = (n < beam) ? 1 : n+1;
00237 for (int i = iFirst; i <= n+1; i++) {
00238
00239
00240 for (int j = i - 1; j >= 0 && j >= i-5; j--) {
00241 int w = i - j;
00242
00243
00244 for (int k = j; k < i; k++) {
00245 if (k == j)
00246 tag[k] = "B";
00247 else
00248 tag[k] = "I";
00249 }
00250
00251 double probStart = (j == 0) ? 0.0 : best[j-1];
00252 double probO = probStart;
00253 for (int k = j; k < i; k++) {
00254 probO += log(p[k % beam][O]);
00255 if (j != i-1 && probO < best[i-1])
00256 break;
00257 }
00258 if (j == i-1 || probO >= best[i-1]) {
00259 best[i-1] = probO;
00260 predTags[i-1] = "O";
00261 }
00262
00263
00264 for (int q = 0; q < nSenses; q++) {
00265 char const* classTag = superSenses[q];
00266
00267 double prob = probStart;
00268 for (int k = j; k < i; k++) {
00269 string tagk = tag[k] + '-' + classTag;
00270 ClassID cid = sst.model->OutcomeID(tagk.c_str());
00271 if (cid == (ClassID)-1)
00272 prob = -numeric_limits<double>::infinity();
00273 else
00274 prob += log(p[k % beam][cid]);
00275 if (prob < best[i-1])
00276 break;
00277 }
00278 if (prob < probStart && prob > best[i-1]) {
00279 best[i-1] = prob;
00280 for (int k = j; k < i; k++)
00281 predTags[k] = tag[k] + '-' + classTag;
00282 }
00283 }
00284 }
00285 }
00286 n++;
00287 # else // NO_BEAM_SEARCH
00288 sst.model->estimate(event->features, p[0]);
00289 predTags[n++] = sst.model->OutcomeName(sst.model->BestOutcome(p[0]));
00290 delete event;
00291 # endif
00292 }
00293 for (int i = 0; i < len; ++i) {
00294
00295 eventStream.predicted(predTags[i].c_str(), i);
00296 }
00297 return sentence;
00298 }
00299
00300 }
00301 }