00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include "Common/lexicon.h"
00027
00028
00029
00030
00031 #include "Common/util.h"
00032 #include "Common/PostingList.h"
00033
00034 using namespace IXE::io;
00035
00036 namespace IXE {
00037
00038 void Lexicon::access(mappedFile& indexFile, off32_t& offset)
00039 {
00040 bigrams.access(indexFile, offset);
00041 begin_ = indexFile.begin();
00042 off32_t const *p = reinterpret_cast<off32_t const *>(begin_ + offset);
00043 entries = p[0];
00044 table = reinterpret_cast<LexEntry const *>(&p[1]);
00045
00046 offset = table[entries].word;
00047 }
00048
00049 Size Lexicon::getWord(TermID termID, char* term, Size size) const
00050 {
00051 if (bigrams.size() == 0) {
00052 # ifdef LEXLEN
00053 Size len;
00054 char const* word = getPostfix(termID, len);
00055 # else
00056 char const* word = getPostfix(termID);
00057 Size len = ::strlen(word);
00058 # endif
00059 len = MIN(len, size);
00060 ::memcpy(term, word, len);
00061 term[len] = '\0';
00062 return len;
00063 }
00064 Lexicon* lex = (Lexicon*)this;
00065 TermID* bigramTable = (TermID*)&lex->bigrams[0];
00066
00067
00068 int bigram_index = 0;
00069 int first = 0;
00070 int last = num_bigrams - 1;
00071 int dist = last - first;
00072 while (dist > 0) {
00073 int half = dist / 2;
00074 int middle = first + half;
00075 TermID vmiddle = bigramTable[middle];
00076 if (vmiddle > termID) {
00077 last = middle;
00078 dist = half;
00079 } else if (termID < bigramTable[middle+1]) {
00080 bigram_index = middle;
00081 break;
00082 } else if (middle + 1== last) {
00083
00084 term[0] = '\0';
00085 return (Size)-1;
00086 } else {
00087 first = middle + 1;
00088 dist = last - first;
00089 }
00090 }
00091 term[0] = (char)(bigram_index >> 8);
00092 term[1] = (char)bigram_index;
00093 # ifdef LEXLEN
00094 Size len;
00095 char const* word = getPostfix(termID, len);
00096 # else
00097 char const* word = getPostfix(termID);
00098 Size len = ::strlen(word);
00099 # endif
00100 len = MIN(len+2, size);
00101 ::memcpy(term+2, word, len-2);
00102 term[len] = '\0';
00103 return len;
00104 }
00105
00114 Lexicon::const_iterator Lexicon::find(char const* word)
00115 {
00116 size_type begin, end;
00117 if (bigrams.size()) {
00118
00119
00120 unsigned short bigram = *(short *)(word);
00121 # ifndef WORDS_BIGENDIAN
00122 bigram = (bigram << 8) | (bigram >> 8);
00123 # endif
00124 begin = bigrams[bigram];
00125 end = bigrams[bigram + 1];
00126 if (begin == end)
00127 return this->end();
00128
00129 word = &word[2];
00130 } else {
00131 begin = 0;
00132 end = entries;
00133 }
00134
00135 const_iterator first = position_at(begin);
00136 const_iterator last = position_at(end);
00137 const_iterator middle;
00138 int len = last - first;
00139 unsigned char mchar, vchar;
00140 register const char *smiddle, *svalue;
00141
00142 while (len > 0) {
00143 int half = len / 2;
00144 middle = first + half;
00145
00146 smiddle = getPostfix(*middle);
00147 svalue = word;
00148 while ((mchar = *smiddle++) == (vchar = *svalue++))
00149 if (mchar == '\0')
00150 return middle;
00151 if (mchar < vchar) {
00152 first = middle + 1;
00153 len = len - half - 1;
00154 }
00155 else
00156 len = half;
00157 }
00158 return this->end();
00159 }
00160
00161 #ifdef LEXLEN
00162 Lexicon::const_iterator Lexicon::find(byte* word, Size len)
00163 {
00164 size_type begin, end;
00165 Size bglen;
00166 if (bigrams.size()) {
00167
00168
00169 unsigned short bigram = *(short *)(word);
00170 # ifndef WORDS_BIGENDIAN
00171 bigram = (bigram << 8) | (bigram >> 8);
00172 # endif
00173 begin = bigrams[bigram];
00174 end = bigrams[bigram + 1];
00175 if (begin == end)
00176 return this->end();
00177
00178 word = &word[2];
00179 bglen = 2;
00180 } else {
00181 begin = 0;
00182 end = entries;
00183 bglen = 0;
00184 }
00185
00186 const_iterator first = position_at(begin);
00187 const_iterator last = position_at(end);
00188 register Size lmiddle;
00189 unsigned char mchar, vchar;
00190
00191 int dist = last - first;
00192 while (dist > 0) {
00193 int half = dist / 2;
00194 const_iterator middle = first + half;
00195 char const* smiddle = getPostfix(*middle, lmiddle);
00196 byte* svalue = word;
00197 Size lvalue = len;
00198
00199 while (lvalue > bglen && lmiddle > bglen) {
00200 if ((mchar = *smiddle++) != (vchar = *svalue++))
00201 break;
00202 lvalue--; lmiddle--;
00203 }
00204 if (lvalue == bglen && lmiddle == bglen)
00205 return middle;
00206 if (mchar < vchar || lmiddle == bglen) {
00207 first = middle + 1;
00208 dist = dist - half - 1;
00209 } else
00210 dist = half;
00211 }
00212 return this->end();
00213 }
00214 #endif // LEXLEN
00215
00216 #ifdef LEXLEN
00217 struct EntryCompare :
00218 std::binary_function<LexEntry const*, char const*, bool>
00219 {
00220 EntryCompare(Size len, Lexicon const& lex) : len(len), lex(lex) { }
00221
00222 result_type
00223 operator ()(first_argument_type a, second_argument_type b) const {
00224 Size wl;
00225 char const* word = lex.getPostfix(a, wl);
00226 return ::memcmp(word, b, MIN(wl, len)) < 0;
00227 }
00228 result_type
00229 operator ()(second_argument_type a, first_argument_type b) const {
00230 Size wl;
00231 char const* word = lex.getPostfix(b, wl);
00232 return ::memcmp(a, word, MIN(len, wl)) < 0;
00233 }
00234 private:
00235 Size const len;
00236 Lexicon const& lex;
00237 };
00238 #else
00239 struct EntryCompare :
00240 std::binary_function<LexEntry const*, char const*, bool>
00241 {
00242 EntryCompare(int len, Lexicon const& lex) : len(len), lex(lex) { }
00243
00244 result_type
00245 operator ()(first_argument_type a, second_argument_type b) const {
00246 return ::strncmp(lex.getPostfix(a), b, len) < 0;
00247 }
00248 result_type
00249 operator ()(second_argument_type a, first_argument_type b) const {
00250 return ::strncmp(a, lex.getPostfix(b), len) < 0;
00251 }
00252 # ifdef _MSC_VER // why does it need this? Beppe
00253 result_type
00254 operator ()(first_argument_type a, first_argument_type b) const {
00255 return ::strncmp(lex.getPostfix(a), lex.getPostfix(b), len) < 0;
00256 }
00257 # endif
00258 private:
00259 int const len;
00260 Lexicon const& lex;
00261 };
00262 #endif // LEXLEN
00263
00274 Lexicon::Range Lexicon::findPrefix(char const * term, Size len)
00275 {
00276 Range range;
00277 if (bigrams.size()) {
00278 size_type begin, end;
00279 if (len > 1) {
00280
00281 unsigned short bigram = *(short *)term;
00282 # ifndef WORDS_BIGENDIAN
00283 bigram = (bigram << 8) | (bigram >> 8);
00284 # endif
00285 begin = bigrams[bigram];
00286 end = bigrams[bigram + 1];
00287
00288 EntryCompare const comparator(len - 2, *this);
00289 const char * rest = term + 2;
00290
00291 range = std::equal_range(position_at(begin),
00292 position_at(end),
00293 rest, comparator);
00294 } else {
00295 char first = term[0];
00296 begin = bigrams[first << 8];
00297 end = bigrams[(first + 1) << 8];
00298
00299 range.first = position_at(begin);
00300 range.second = position_at(end);
00301 }
00302 } else {
00303 EntryCompare const comparator(len, *this);
00304 range = std::equal_range(begin(), end(), term, comparator);
00305 }
00306 return range;
00307 }
00308
00309 Count Lexicon::frequency(char const* word)
00310 {
00311 # ifdef LEXLEN
00312 const_iterator it = find(word, strlen(word));
00313 # else
00314 const_iterator it = find(word);
00315 # endif
00316 if (it == end())
00317 return 0;
00318 PostingList postlist(postingsFile, it);
00319 return postlist.size();
00320 }
00321
00322 char const* Lexicon::Cursor::word()
00323 {
00324 if (lexicon->bigrams.size() == 0)
00325 # ifdef LEXLEN
00326 {
00327 Size len;
00328 char const* word = lexicon->getPostfix(word_index, len);
00329 ::memcpy(term, word, len);
00330 term[len] = '\0';
00331 return term;
00332 }
00333 # else
00334 return lexicon->getPostfix(word_index);
00335 # endif
00336 Size const* bigramTable = (Size const*)&lexicon->bigrams[0];
00337
00338
00339 int first = bigram_index;
00340 int last = num_bigrams - 1;
00341 int dist = last - first;
00342 while (dist > 0) {
00343 int half = dist / 2;
00344 int middle = first + half;
00345 long vmiddle = (long)bigramTable[middle];
00346 if (vmiddle > word_index) {
00347 last = middle;
00348 dist = half;
00349 } else if (word_index < (long)bigramTable[middle+1]) {
00350 bigram_index = middle;
00351 break;
00352 } else if (middle + 1== last) {
00353
00354 return "";
00355 } else {
00356 first = middle + 1;
00357 dist = last - first;
00358 }
00359 }
00360 term[0] = (char)(bigram_index >> 8);
00361 term[1] = (char)bigram_index;
00362 # ifdef LEXLEN
00363 Size len;
00364 char const* word = lexicon->getPostfix(word_index, len);
00365 len = MIN(len, 2*WordMaxSize);
00366 ::memcpy(term+2, word, len-2);
00367 term[len] = '\0';
00368 # else
00369 ::strncpy(term+2, lexicon->getPostfix(word_index), 2*WordMaxSize-2);
00370 term[2*WordMaxSize] = '\0';
00371 # endif
00372 return term;
00373 }
00374
00375
00376
00377
00378
00379
00380 PostingList Lexicon::Cursor::postingList()
00381 {
00382 off64_t postingStart = operator *()->postingsOffset();
00383 return PostingList(lexicon->postingsFile,
00384 postingStart,
00385 (*this)[1]->postingsOffset() - postingStart);
00386 }
00387
00388 void Lexicon::Cursor::postingList(PostingList& pl)
00389 {
00390 off64_t postingStart = operator *()->postingsOffset();
00391 pl.open(lexicon->postingsFile,
00392 postingStart,
00393 (*this)[1]->postingsOffset() - postingStart);
00394 }
00395
00396 }