00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #ifndef IXE_lexicon_H
00027 #define IXE_lexicon_H
00028
00029
00030 #include "platform.h"
00031
00032
00033 #include <algorithm>
00034 #include <iterator>
00035 #include <string>
00036
00037
00038 #include "include/config.h"
00039 #include "include/ixe.h"
00040 #include "io/FileHandle.h"
00041 #include "Common/IndexTable.h"
00042 #include "Common/LexEntry.h"
00043
00044 using namespace IXE::io;
00045
00046 namespace IXE {
00047
00048 class TermInfo {
00049 public:
00050 TermID ID;
00051 Count freq;
00052 };
00053
00054 class PostingList;
00055
00131 class Lexicon : public IndexTable<LexEntry>
00132 {
00133 public:
00134
00139 typedef std::pair<const_iterator, const_iterator> Range;
00140
00141
00142
00143
00144
00145
00146
00147
00148
00152 void access(mappedFile& mf, off32_t& offset);
00153
00157 void accessPostings(FileHandle* fh) {
00158 postingsFile = fh;
00159 }
00160
00166 Size getWord(TermID termID, char* term, Size size) const;
00167
00171 const_iterator find(char const* word);
00172
00177 const_iterator find(byte* word, Size len);
00178
00182 Range findPrefix(char const * term) {
00183 return findPrefix(term, ::strlen(term));
00184 }
00185
00186 Range findPrefix(char const * term, Size len);
00187
00188 Range findPrefix(wchar_t const * term, Size len) {
00189 return findPrefix((char const*)term, len * sizeof(wchar_t));
00190 }
00191
00195 Count frequency(char const* word);
00196
00197 friend class Cursor;
00198
00199 class Cursor
00200 {
00201 public:
00202 Cursor() { }
00203
00204 Cursor(Lexicon* l, int index = 0) :
00205 lexicon(l), word_index(index - 1), bigram_index(0) { }
00206
00207 bool atEnd() const {
00208
00209 return word_index >= (int)lexicon->size();
00210 }
00211
00212 bool hasNext() const {
00213
00214
00215 return word_index < (int)lexicon->size() - 1;
00216 }
00217
00218 LexEntry const* next() {
00219 return (*lexicon)[++word_index];
00220 }
00221
00222 LexEntry const* previous() {
00223 return (*lexicon)[--word_index];
00224 }
00225
00226 char const* next(char const* minWord) {
00227 if (::strcmp(word(), minWord) < 0 && word_index)
00228 word_index++;
00229 return word();
00230 }
00231
00232 char const* previous(char const* maxWord) {
00233 if (::strcmp(word(), maxWord) >= 0 && word_index)
00234 word_index--;
00235 return word();
00236 }
00237
00238 long get() { return word_index; }
00239
00240 void set(long index) {
00241 word_index = index;
00242 bigram_index = 0;
00243 }
00244
00245 void reset() { word_index = 0; }
00246
00247 LexEntry const* operator*() {
00248 return (*lexicon)[word_index];
00249 }
00250
00251 LexEntry const* operator [](int i) {
00252 return (*lexicon)[word_index + i];
00253 }
00254
00258 char const* word();
00259
00263 PostingList postingList();
00264 void postingList(PostingList&);
00265
00266 private:
00267 Lexicon* lexicon;
00268 long word_index;
00269 unsigned int bigram_index;
00270 char term[2*WordMaxSize+1];
00271 };
00272
00273 Cursor cursor(int i = 0) { return Cursor(this, i); }
00274
00275 BigramTable bigrams;
00276
00280 FileHandle* postingsFile;
00281
00282 private:
00283 friend struct EntryCompare;
00284
00288 char const* getPostfix(LexEntry const* c) const {
00289 return begin_ + c->word;
00290 }
00291
00292 # ifdef LEXLEN
00293
00296 char const* getPostfix(LexEntry const* c, Size& len) const {
00297 len = c->len;
00298 return begin_ + c->word;
00299 }
00300
00304 char const* getPostfix(TermID termID, Size& len) const {
00305 len = table[termID].len;
00306 return begin_ + table[termID].word;
00307 }
00308
00312 char const* getPostfix(const_iterator& cit, Size& len) const {
00313 len = table[cit.position()].len;
00314 return begin_ + table[cit.position()].word;
00315 }
00316 # endif
00317
00321 char const* getPostfix(TermID termID) const {
00322 return begin_ + table[termID].word;
00323 }
00324
00328 char const* getPostfix(const_iterator& cit) const {
00329 return begin_ + table[cit.position()].word;
00330 }
00331 };
00332
00333 }
00334
00335 #endif