00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "io/File.h"
00026
00027 #include "NER.h"
00028
00029
00030 #define MODEL GIS
00031 #if (MODEL == GIS)
00032 #include "classifier/GIS.h"
00033 #else
00034 #include "classifier/LBFGS.h"
00035 #endif
00036
00037
00038 #include <assert.h>
00039 #include <math.h>
00040 #include <limits>
00041
00042 using namespace std;
00043 using namespace Tanl::Text;
00044 using namespace Tanl::Classifier;
00045
00046 namespace Tanl {
00047 namespace NER {
00048
00050
00052
00054 extern IXE::conf<bool> oldIOB;
00056 extern IXE::conf<bool> refine;
00057
00060 NER::NER(char const* modelFile, char const* configFile,
00061 char const* POStag, char const* NEtag) :
00062 language("Language", "english"),
00063 resourceDir("ResourceDir", "Resource"),
00064 cutoff("Cutoff", 2),
00065 iter("Iterations", 100),
00066 alpha("Alpha", 0.0),
00067 verbose("Verbose", false),
00068 POStag(POStag),
00069 NEtag(NEtag),
00070 model(0),
00071 resources(POStag, NEtag)
00072 {
00073
00074 if (configFile && IXE::io::File(configFile).exists())
00075 IXE::Configuration::load(configFile);
00076 if (IXE::io::File(modelFile).exists())
00077 model = new MaxEnt(modelFile);
00078 resources.language = language->c_str();
00079 resources.load(*resourceDir);
00080 }
00081
00082 NER::~NER() { delete model; }
00083
00084 Text::RegExp::Pattern datePat("[12][0-9][0-9][0-9]-[0-9]+-[0-9]+");
00085
00086 bool checkDate(vector<Token*>& sent)
00087 {
00088
00089 for (int i = 0; i < sent.size(); i++)
00090 if (datePat.test(sent[i]->form))
00091 return true;
00092 return false;
00093 }
00094
00095 #ifdef ZONES
00096 void NER::train(SentenceReader* sentenceReader, char const* modelFile)
00097 {
00098 MODEL trainer(iter, cutoff, alpha);
00099 trainer.verbose = verbose;
00100
00101
00102
00103 NerEventStream eventStream(resources);
00104 int sentNum = 0;
00105 vector<Token*>* prev = 0;
00106 int zone = 0;
00107
00108 while (sentenceReader->MoveNext()) {
00109 vector<Token*>* sent = sentenceReader->Current();
00110 if (sent->size() == 1 && (*sent)[0]->form == "-DOCSTART-") {
00111 FOR_EACH (vector<Token*>, *sent, sit)
00112 delete *sit;
00113 delete sent;
00114 eventStream.reset();
00115 sentNum = 0;
00116 continue;
00117 }
00118 switch (sentNum) {
00119 case 0:
00120 zone = 0;
00121 break;
00122 case 1:
00123 if (checkDate(*sent)) {
00124 zone = 2;
00125 break;
00126 } else {
00127 prev = sent;
00128 sentNum++;
00129 continue;
00130 }
00131 case 2:
00132 if (prev) {
00133 if (checkDate(*sent)) {
00134
00135 zone = 1;
00136 } else {
00137
00138 zone = 3;
00139 }
00140 eventStream.analyze(prev, zone);
00141 trainer.read(eventStream);
00142 FOR_EACH (vector<Token*>, *prev, sit)
00143 delete *sit;
00144 delete prev;
00145 prev = 0;
00146 if (zone == 1)
00147 zone = 2;
00148 } else {
00149 zone = 3;
00150 }
00151 break;
00152 default:
00153 zone = 3;
00154 }
00155 eventStream.analyze(sent, zone);
00156 trainer.read(eventStream);
00157
00158 FOR_EACH (vector<Token*>, *sent, sit)
00159 delete *sit;
00160 delete sent;
00161 sentNum++;
00162 }
00163
00164 trainer.train();
00165
00166
00167 trainer.save(modelFile);
00168 }
00169 #else
00170
00171 void NER::train(SentenceReader* sentenceReader, char const* modelFile)
00172 {
00173 MODEL trainer(iter, cutoff, alpha);
00174 trainer.verbose = verbose;
00175
00176
00177
00178 NerEventStream eventStream(resources);
00179
00180 while (sentenceReader->MoveNext()) {
00181 vector<Token*>* sent = sentenceReader->Current();
00182 if (sent->size() == 1 && (*sent)[0]->form == "-DOCSTART-") {
00183 FOR_EACH (vector<Token*>, *sent, sit)
00184 delete *sit;
00185 delete sent;
00186 eventStream.reset();
00187 continue;
00188 }
00189 eventStream.analyze(sent);
00190 trainer.read(eventStream);
00191
00192 FOR_EACH (vector<Token*>, *sent, sit)
00193 delete *sit;
00194 delete sent;
00195 }
00196
00197 trainer.train();
00198
00199
00200 trainer.save(modelFile);
00201 }
00202 #endif // ZONES
00203
00204 vector<Token*>* NER::tag(std::vector<Token*>* sentence,
00205 NerEventStream* eventStream)
00206 {
00207
00208 int len = sentence->size();
00209
00210 for (int i = 0; i < len; i++)
00211 (*sentence)[i]->set(NEtag, "");
00212 bool newEventStream = false;
00213 if (!eventStream) {
00214
00215 eventStream = new NerEventStream(resources);
00216 newEventStream = true;
00217 }
00218 eventStream->analyze(sentence);
00219
00220 int nTags = model->NumOutcomes();
00221 string predTags[len];
00222 double best[len];
00223 best[0] = -numeric_limits<double>::infinity();
00224 # ifdef RL
00225 int bestLength[len];
00226 # endif
00227 static int nCateg = 4;
00228 ClassID O = model->OutcomeID("O");
00229 const int beam = 5;
00230 string tag[len];
00231 double p[beam][nTags];
00232 int n = 0;
00233
00234 while (eventStream->hasNext()) {
00235 Classifier::Event* event = eventStream->next();
00236 model->estimate(event->features, p[n % beam]);
00237 delete event;
00238
00239
00240 if (n < beam-1 && n != len-1) {
00241 n++;
00242 continue;
00243 }
00244
00245
00246 int iFirst = (n < beam) ? 1 : n+1;
00247 for (int i = iFirst; i <= n+1; i++) {
00248
00249
00250 for (int j = i - 1; j >= 0 && j >= i-5; j--) {
00251 int w = i - j;
00252
00253
00254 if (refine) {
00255 for (int k = j; k < i; k++) {
00256 if (k == j) {
00257 if (k == i - 1)
00258 tag[k] = "U";
00259 else
00260 tag[k] = "B";
00261 } else {
00262 if (k == i - 1)
00263 tag[k] = "E";
00264 else
00265 tag[k] = "I";
00266 }
00267 }
00268 } else {
00269 for (int k = j; k < i; k++) {
00270 if (k == j)
00271 tag[k] = "B";
00272 else
00273 tag[k] = "I";
00274 }
00275 }
00276
00277 double probStart = (j == 0) ? 0.0 : best[j-1];
00278 double probO = probStart;
00279 for (int k = j; k < i; k++) {
00280 probO += log(p[k % beam][O]);
00281 if (j != i-1 && probO < best[i-1])
00282 break;
00283 }
00284 if (j == i-1 || probO >= best[i-1]) {
00285 best[i-1] = probO;
00286 #ifdef RL
00287 bestLength[i-1] = i-j;
00288 #endif
00289 predTags[i-1] = "O";
00290 }
00291
00292
00293 FOR_EACH (set<string>, *Resources::entityTypes, it) {
00294 string const& classTag = *it;
00295
00296 double prob = probStart;
00297 for (int k = j; k < i; k++) {
00298 string tagk = tag[k] + '-' + classTag;
00299 ClassID cid = model->OutcomeID(tagk.c_str());
00300 if (cid == (ClassID)-1)
00301 prob = -numeric_limits<double>::infinity();
00302 else
00303 prob += log(p[k % beam][cid]);
00304 if (prob < best[i-1])
00305 break;
00306 }
00307 if (prob < probStart && prob > best[i-1]) {
00308 best[i-1] = prob;
00309 # ifdef RL
00310 bestLength[i-1] = i-j;
00311 predTags[i-1] = tag[i-1] + '-' + classTag;
00312 # else
00313 for (int k = j; k < i; k++)
00314 predTags[k] = tag[k] + '-' + classTag;
00315 # endif
00316 }
00317 }
00318 }
00319 }
00320 n++;
00321 }
00322 # ifdef RL
00323
00324 for (int i = len-1; i >= 0; ) {
00325 string predTag = predTags[i];
00326 if (predTag[0] != 'O')
00327 predTag[0] = 'I';
00328 for (int j = i; j > i - bestLength[i]; --j)
00329 predTags[j] = predTag;
00330 i -= bestLength[i];
00331 }
00332 # endif
00333 for (int i = 0; i < len; ++i) {
00334
00335 eventStream->predicted(predTags[i].c_str(), i);
00336 }
00337
00338 if (oldIOB) {
00339
00340 int i = 0;
00341 int len = sentence->size();
00342 for (Sentence::const_iterator sit = sentence->begin();
00343 sit != sentence->end(); ++sit, ++i) {
00344 Token* tok = *sit;
00345 string const* tag = tok->get(NEtag);
00346 if (!tag)
00347 continue;
00348 string const* prevTag = (i > 0) ? (*sentence)[i-1]->get(NEtag) : 0;
00349 string& nt = const_cast<string&>(*tag);
00350 if (nt[0] == 'B' && (!prevTag || (*prevTag)[0] == 'O')) {
00351 nt[0] = 'I';
00352 }
00353 }
00354 }
00355 if (refine) {
00356
00357 int i = 0;
00358 int len = sentence->size();
00359 for (Sentence::const_iterator sit = sentence->begin();
00360 sit != sentence->end(); ++sit, ++i) {
00361 Token* tok = *sit;
00362 string const* tag = tok->get(NEtag);
00363 if (!tag)
00364 continue;
00365 string& nt = const_cast<string&>(*tag);
00366 switch (nt[0]) {
00367 case 'U':
00368 if (oldIOB) {
00369 if (i > 0) {
00370 string const* prevTag = (*sentence)[i-1]->get(NEtag);
00371 if (!strcmp(prevTag->c_str()+1, tag->c_str()+1))
00372 nt[0] = 'B';
00373 else
00374 nt[0] = 'I';
00375 } else
00376 nt[0] = 'I';
00377 } else
00378 nt[0] = 'B';
00379 break;
00380 case 'E':
00381 nt[0] = 'I';
00382 break;
00383 }
00384 }
00385 }
00386 if (newEventStream)
00387 delete eventStream;
00388 return sentence;
00389 }
00390
00391 Enumerator<vector<Token*>*>* NER::pipe(Enumerator<vector<Token*>*>& se)
00392 {
00393 return new NerPipe(*this, se);
00394 }
00395
00396
00397
00398 NerPipe::NerPipe(NER& ner, Enumerator<vector<Token*>*>& se) :
00399 ner(ner),
00400 eventStream(ner.resources),
00401 se(se)
00402 {
00403 vector<string> iobTags;
00404 int entities = ner.resources.entityTypes->size();
00405 int neTags = 1 + 2 * entities;
00406 iobTags.resize(neTags);
00407 iobTags[0] = "O";
00408
00409 int i = 0;
00410 FOR_EACH (set<string>, *ner.resources.entityTypes, it) {
00411 iobTags[1 + i] = "B-" + *it;
00412 iobTags[1 + entities + i] = "I-" + *it;
00413 ++i;
00414 }
00415
00416
00417 int nTags = ner.model->NumOutcomes();
00418 for (int i = 0; i < nTags; ++i) {
00419 char const* tag = ner.model->OutcomeName(i);
00420 for (int j = 0; j < iobTags.size(); ++j)
00421 if (tag == iobTags[j]) {
00422 outcomeId[j] = i;
00423 break;
00424 }
00425 }
00426 }
00427
00428 void NerPipe::Dispose()
00429 {
00430 ner.decRef();
00431 delete this;
00432 }
00433
00434 bool NerPipe::MoveNext()
00435 {
00436 while (se.MoveNext()) {
00437 sent = se.Current();
00438 if (sent->size() == 1 && (*sent)[0]->form == "-DOCSTART-") {
00439 eventStream.reset();
00440 delete sent;
00441 } else
00442 return true;
00443 }
00444 return false;
00445 }
00446
00447 vector<Token*>* NerPipe::Current()
00448 {
00449 return ner.tag(sent, &eventStream);
00450 }
00451
00452 }
00453 }