TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 #ifdef DEBUG_MEMUSAGE
  23 #include "HeapProfiler.h" // FIXME remove
  24 #endif
  25
  26 #include <iostream>
  27 #include <map>
  28 #include <set>
  29 #include <vector>
  30 #include <utility>
  31 #include <stdexcept>
  32 #include <cassert>
  33 #include <cstring> // For strlen()
  34 using std::vector;
  35 using std::pair;
  36 using std::make_pair;
  37 using std::map;
  38
  39 namespace SXSI
  40 {
  41
  42 // Save file version info
  43 const uchar TCImplementation::versionFlag = 4;
  44
  45 /**
  46  * Constructor inits an empty dynamic FM-index.
  47  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  48  */
  49 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
  50     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  51       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  52 {
  53     makewavelet(bwt); // Deletes bwt!
  54     bwt = 0;
  55
  56     // Make sampling tables
  57     maketables();
  58 }
  59
  60 bool TCImplementation::EmptyText(DocId k) const
  61 {
  62     assert(k < (DocId)numberOfTexts);
  63     return false; // Empty texts are not indexed
  64 }
  65
  66 uchar* TCImplementation::GetText(DocId k) const
  67 {
  68     assert(k < (DocId)numberOfTexts);
  69     TextPosition i = k;
  70
  71     string result;
  72     // Reserve average string length to avoid reallocs
  73     result.reserve(n/numberOfTexts);
  74
  75     uchar c = alphabetrank->access(i);
  76     while (c != '\0')
  77     {
  78         result.push_back(c);
  79         i = C[c]+alphabetrank->rank(c,i)-1;
  80
  81         c = alphabetrank->access(i); // "next" char.
  82     }
  83
  84     // Convert to uchar (FIXME return string?)
  85     i = result.size();
  86     uchar* res = new uchar[i+1];
  87     res[i] = '\0';
  88     for (ulong j = 0; j < i; ++j)
  89         res[i-j-1] = result[j];
  90     return res;
  91 }
  92 /*
  93  * Not supported
  94 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
  95 {
  96     assert(k < (DocId)numberOfTexts);
  97     assert(j < (*textLength)[k]);
  98     assert(i <= j);
  99
 100     ulong textRank = 0;
 101
 102     // Start position of k'th text
 103     ulong start = (*textStartPos)[k];
 104
 105     return Substring(i + start, j-i+1);
 106     }*/
 107
 108 /******************************************************************
 109  * Existential queries
 110  */
 111 bool TCImplementation::IsPrefix(uchar const * pattern) const
 112 {
 113     TextPosition m = strlen((char *)pattern);
 114     if (m == 0)
 115         return true;
 116
 117     TextPosition sp = 0, ep = 0;
 118     Search(pattern, m, &sp, &ep);
 119
 120     // Check for end-marker(s) in result interval
 121     if (CountEndmarkers(sp, ep))
 122         return true;
 123     return false;
 124 }
 125
 126 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 127 {
 128     TextPosition m = strlen((char *)pattern);
 129     if (m == 0)
 130         return true;
 131
 132     TextPosition sp = 0, ep = 0;
 133     Search(pattern, m, &sp, &ep);
 134
 135     // Check for end-marker(s) in result interval
 136     if (CountEndmarkers(sp, ep, begin, end))
 137         return true;
 138     return false;
 139 }
 140
 141
 142 bool TCImplementation::IsSuffix(uchar const *pattern) const
 143 {
 144     // Here counting is as fast as IsSuffix():
 145     if (CountSuffix(pattern) > 0)
 146         return true;
 147     return false;
 148 }
 149
 150 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 151 {
 152     // Here counting is as fast as IsSuffix():
 153     if (CountSuffix(pattern, begin, end) > 0)
 154         return true;
 155     return false;
 156 }
 157
 158 bool TCImplementation::IsEqual(uchar const *pattern) const
 159 {
 160     TextPosition m = std::strlen((char *)pattern);
 161     if (m == 0)
 162         return false; // No empty texts exists
 163
 164     TextPosition sp = 0, ep = 0;
 165     // Match including end-marker
 166     Search(pattern, m+1, &sp, &ep);
 167
 168     // Check for end-marker(s) in result interval
 169     if (CountEndmarkers(sp, ep))
 170         return true;
 171     return false;
 172 }
 173
 174 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 175 {
 176     TextPosition m = std::strlen((char *)pattern);
 177     if (m == 0)
 178         return false; // No empty texts exists
 179
 180     TextPosition sp = 0, ep = 0;
 181     // Match including end-marker
 182     Search(pattern, m+1, &sp, &ep, begin, end);
 183
 184     // Check for end-marker(s) in result interval
 185     if (CountEndmarkers(sp, ep))
 186         return true;
 187     return false;
 188 }
 189
 190 bool TCImplementation::IsContains(uchar const * pattern) const
 191 {
 192     TextPosition m = strlen((char *)pattern);
 193     if (m == 0)
 194         return true;
 195
 196     TextPosition sp = 0, ep = 0;
 197     // Just check if pattern exists somewhere
 198     ulong count = Search(pattern, m, &sp, &ep);
 199
 200     if (count > 0)
 201         return true;
 202     return false;
 203 }
 204
 205 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 206 {
 207     // Here counting is as fast as existential querying
 208     if (CountContains(pattern, begin, end) > 0)
 209         return true; // FIXME No need to filter result set
 210     return false;
 211 }
 212
 213 bool TCImplementation::IsLessThan(uchar const * pattern) const
 214 {
 215     if (CountLessThan(pattern) > 0)
 216         return true;
 217     return false;
 218 }
 219
 220 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 221 {
 222     if (CountLessThan(pattern, begin, end) > 0)
 223         return true;
 224     return false;
 225 }
 226
 227 /******************************************************************
 228  * Counting queries
 229  */
 230 ulong TCImplementation::Count(uchar const * pattern) const
 231 {
 232     TextPosition m = strlen((char *)pattern);
 233     if (m == 0)
 234         return 0;
 235
 236     TextPosition sp = 0, ep = 0;
 237     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 238     return count;
 239 }
 240
 241 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 242 {
 243     TextPosition m = strlen((char *)pattern);
 244     if (m == 0)
 245         return numberOfTexts;
 246
 247     TextPosition sp = 0, ep = 0;
 248     Search(pattern, m, &sp, &ep);
 249
 250     // Count end-markers in result interval
 251     return CountEndmarkers(sp, ep);
 252 }
 253
 254 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 255 {
 256     TextPosition m = strlen((char *)pattern);
 257     if (m == 0)
 258         return numberOfTexts;
 259
 260     TextPosition sp = 0, ep = 0;
 261     Search(pattern, m, &sp, &ep);
 262
 263     // Count end-markers in result interval
 264     return CountEndmarkers(sp, ep, begin, end);
 265 }
 266
 267 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 268 {
 269     TextPosition m = strlen((char *)pattern);
 270     if (m == 0)
 271         return numberOfTexts;
 272
 273     TextPosition sp = 0, ep = 0;
 274     // Search with end-marker
 275     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 276
 277     return count;
 278 }
 279
 280 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 281 {
 282     TextPosition m = strlen((char *)pattern);
 283     if (m == 0)
 284         return numberOfTexts;
 285
 286     TextPosition sp = 0, ep = 0;
 287     // Search with end-marker
 288     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 289
 290     return count;
 291 }
 292
 293 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 294 {
 295     TextPosition m = strlen((char const *)pattern);
 296     if (m == 0)
 297         return 0; // No empty texts.
 298
 299     TextPosition sp = 0, ep = 0;
 300     // Match including end-marker
 301     Search(pattern, m+1, &sp, &ep);
 302
 303     // Count end-markers in result interval
 304     return CountEndmarkers(sp, ep);
 305 }
 306
 307 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 308 {
 309     TextPosition m = strlen((char const *)pattern);
 310     if (m == 0)
 311         return 0; // No empty texts.
 312
 313     TextPosition sp = 0, ep = 0;
 314     // Match including end-marker
 315     Search(pattern, m+1, &sp, &ep, begin, end);
 316
 317     // Count end-markers in result interval
 318     return CountEndmarkers(sp, ep); // Already within [begin, end]
 319 }
 320
 321 unsigned TCImplementation::CountContains(uchar const * pattern) const
 322 {
 323     TextPosition m = strlen((char const *)pattern);
 324     if (m == 0)
 325         return numberOfTexts; // Total number of texts.
 326
 327     // Here counting is as slow as fetching the result set
 328     // because we have to filter out occ's that fall within same document.
 329     TextCollection::document_result result = Contains(pattern);
 330     return result.size();
 331 }
 332
 333 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 334 {
 335     TextPosition m = strlen((char const *)pattern);
 336     if (m == 0)
 337         return numberOfTexts; // Total number of texts.
 338
 339     // Here counting is as slow as fetching the result set
 340     // because we have to filter out occ's that fall within same document.
 341     TextCollection::document_result result = Contains(pattern, begin, end);
 342     return result.size();
 343 }
 344
 345 // Less than or equal
 346 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 347 {
 348     TextPosition m = strlen((char const *)pattern);
 349     if (m == 0)
 350         return 0; // No empty texts.
 351
 352     TextPosition sp = 0, ep = 0;
 353     SearchLessThan(pattern, m, &sp, &ep);
 354
 355     // Count end-markers in result interval
 356     return CountEndmarkers(sp, ep);
 357 }
 358
 359 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 360 {
 361     TextPosition m = strlen((char const *)pattern);
 362     if (m == 0)
 363         return 0; // No empty texts.
 364
 365     TextPosition sp = 0, ep = 0;
 366     SearchLessThan(pattern, m, &sp, &ep);
 367
 368     // Count end-markers in result interval
 369     return CountEndmarkers(sp, ep, begin, end);
 370 }
 371
 372 /**
 373  * Document reporting queries
 374  */
 375 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 376 {
 377     TextPosition m = strlen((char *)pattern);
 378     if (m == 0)
 379         return TextCollection::document_result(); // FIXME Should return all 1...k
 380
 381     TextPosition sp = 0, ep = 0;
 382     Search(pattern, m, &sp, &ep);
 383
 384     // Iterate through end-markers in [sp,ep]:
 385     return EnumerateEndmarkers(sp, ep);
 386 }
 387
 388 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 389 {
 390     TextPosition m = strlen((char *)pattern);
 391     if (m == 0)
 392         return TextCollection::document_result(); // FIXME Should return all 1...k
 393
 394     TextPosition sp = 0, ep = 0;
 395     Search(pattern, m, &sp, &ep);
 396
 397     // Return end-markers in [sp,ep] and [begin, end]:
 398     return EnumerateEndmarkers(sp, ep, begin, end);
 399 }
 400
 401 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 402 {
 403     TextPosition m = strlen((char *)pattern);
 404     if (m == 0)
 405         return TextCollection::document_result(); // FIXME Should return all 1...k
 406
 407     TextPosition sp = 0, ep = 0;
 408     // Search with end-marker
 409     Search(pattern, m+1, &sp, &ep);
 410
 411     TextCollection::document_result result;
 412     result.reserve(ep-sp+1); // Try to avoid reallocation.
 413
 414     // Check each occurrence
 415     for (; sp <= ep; ++sp)
 416     {
 417         TextPosition i = sp;
 418
 419         uchar c = alphabetrank->access(i);
 420         while (c != '\0' && !sampled->access(i))
 421         {
 422             i = C[c]+alphabetrank->rank(c,i)-1;
 423             c = alphabetrank->access(i);
 424         }
 425         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 426
 427         if (c == '\0')
 428         {
 429             // Rank among the end-markers in BWT
 430             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 431             result.push_back(Doc->access(endmarkerRank));
 432         }
 433         else // Sampled position
 434         {
 435             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 436             result.push_back(docId);
 437         }
 438     }
 439
 440     return result;
 441 }
 442
 443 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 444 {
 445     TextPosition m = strlen((char *)pattern);
 446     if (m == 0)
 447         return TextCollection::document_result(); // FIXME Should return all 1...k
 448
 449     TextPosition sp = 0, ep = 0;
 450     // Search with end-marker
 451     Search(pattern, m+1, &sp, &ep, begin, end);
 452
 453     TextCollection::document_result result;
 454     result.reserve(ep-sp+1); // Try to avoid reallocation.
 455
 456     // Check each occurrence, already within [begin, end]
 457     for (; sp <= ep; ++sp)
 458     {
 459         TextPosition i = sp;
 460
 461         uchar c = alphabetrank->access(i);
 462         while (c != '\0' && !sampled->access(i))
 463         {
 464             i = C[c]+alphabetrank->rank(c,i)-1;
 465             c = alphabetrank->access(i);
 466         }
 467         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 468
 469         if (c == '\0')
 470         {
 471             // Rank among the end-markers in BWT
 472             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 473             result.push_back(Doc->access(endmarkerRank));
 474         }
 475         else // Sampled position
 476         {
 477             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 478             result.push_back(docId);
 479         }
 480     }
 481
 482     return result;
 483 }
 484
 485
 486 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 487 {
 488     TextPosition m = strlen((char const *)pattern);
 489     if (m == 0)
 490         return TextCollection::document_result(); // FIXME Should return all empty texts
 491
 492     TextPosition sp = 0, ep = 0;
 493     // Match including end-marker
 494     Search(pattern, m+1, &sp, &ep);
 495
 496     // Report end-markers in result interval
 497     return EnumerateEndmarkers(sp, ep);
 498 }
 499
 500 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 501 {
 502     TextPosition m = strlen((char const *)pattern);
 503     if (m == 0)
 504         return TextCollection::document_result(); // FIXME Should return all empty texts
 505
 506     TextPosition sp = 0, ep = 0;
 507     // Match including end-marker
 508     Search(pattern, m+1, &sp, &ep, begin, end);
 509
 510     // Report end-markers in result interval
 511     return EnumerateEndmarkers(sp, ep, begin, end);
 512 }
 513
 514
 515 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 516 {
 517     TextPosition m = strlen((char *)pattern);
 518     if (m == 0)
 519         return TextCollection::document_result();
 520
 521     TextPosition sp = 0, ep = 0;
 522     // Search all occurrences
 523     Search(pattern, m, &sp, &ep);
 524
 525     // We want unique document indentifiers, using std::set to collect them
 526     std::set<DocId> resultSet;
 527
 528     // Check each occurrence
 529     for (; sp <= ep; ++sp)
 530     {
 531         TextPosition i = sp;
 532         uchar c = alphabetrank->access(i);
 533         while (c != '\0' && !sampled->access(i))
 534         {
 535             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 536             c = alphabetrank->access(i);
 537         }
 538         if (c == '\0')
 539         {
 540             // Rank among the end-markers in BWT
 541             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 542             resultSet.insert(Doc->access(endmarkerRank));
 543         }
 544         else
 545         {
 546             DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 547             assert((unsigned)di < numberOfTexts);
 548             resultSet.insert(di);
 549         }
 550     }
 551
 552     // Convert std::set to std::vector
 553     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 554     return result;
 555 }
 556
 557 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 558 {
 559     TextPosition m = strlen((char *)pattern);
 560     if (m == 0)
 561         return TextCollection::document_result();
 562
 563     TextPosition sp = 0, ep = 0;
 564     // Search all occurrences
 565     Search(pattern, m, &sp, &ep);
 566
 567     // We want unique document indentifiers, using std::set to collect them
 568     std::set<DocId> resultSet;
 569
 570     // Check each occurrence
 571     for (; sp <= ep; ++sp)
 572     {
 573         TextPosition i = sp;
 574         uchar c = alphabetrank->access(i);
 575         while (c != '\0' && !sampled->access(i))
 576         {
 577             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 578             c = alphabetrank->access(i);
 579         }
 580         if (c == '\0')
 581         {
 582             // Rank among the end-markers in BWT
 583             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 584             DocId docId = Doc->access(endmarkerRank);
 585             if (docId >= begin && docId <= end)
 586                 resultSet.insert(docId);
 587         }
 588         else
 589         {
 590             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 591             assert((unsigned)docId < numberOfTexts);
 592             if (docId >= begin && docId <= end)
 593                 resultSet.insert(docId);
 594         }
 595     }
 596
 597     // Convert std::set to std::vector
 598     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 599     return result;
 600 }
 601
 602 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 603 {
 604     TextPosition m = strlen((char *)pattern);
 605     if (m == 0)
 606         return TextCollection::document_result(); // empty result set
 607
 608     TextPosition sp = 0, ep = 0;
 609     SearchLessThan(pattern, m, &sp, &ep);
 610
 611     // Report end-markers in result interval
 612     return EnumerateEndmarkers(sp, ep);
 613 }
 614
 615 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 616 {
 617     TextPosition m = strlen((char *)pattern);
 618     if (m == 0)
 619         return TextCollection::document_result(); // empty result set
 620
 621     TextPosition sp = 0, ep = 0;
 622     SearchLessThan(pattern, m, &sp, &ep);
 623
 624     // Iterate through end-markers in [sp,ep] and [begin, end]:
 625     return EnumerateEndmarkers(sp, ep, begin, end);
 626 }
 627
 628 /**
 629  * Full result set queries
 630  */
 631 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 632 {
 633     TextPosition m = strlen((char *)pattern);
 634     if (m == 0)
 635         return full_result(); // FIXME Throw exception?
 636
 637     TextPosition sp = 0, ep = 0;
 638     // Search all occurrences
 639     Search(pattern, m, &sp, &ep);
 640
 641     full_result result;
 642     result.reserve(ep-sp+1); // Try to avoid reallocation.
 643
 644     // Report each occurrence
 645     for (; sp <= ep; ++sp)
 646     {
 647         TextPosition i = sp;
 648         TextPosition dist = 0;
 649         uchar c = alphabetrank->access(i);
 650         while (c != '\0' && !sampled->access(i))
 651         {
 652             i = C[c]+alphabetrank->rank(c,i)-1;
 653             c = alphabetrank->access(i);
 654             ++ dist;
 655         }
 656         if (c == '\0')
 657         {
 658             // Rank among the end-markers in BWT
 659             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 660             DocId docId = Doc->access(endmarkerRank);
 661             result.push_back(make_pair(docId, dist));
 662         }
 663         else
 664         {
 665             TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 666             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 667
 668             result.push_back(make_pair(docId, textPos));
 669         }
 670     }
 671
 672     return result;
 673 }
 674
 675 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 676 {
 677     TextPosition m = strlen((char *)pattern);
 678     if (m == 0)
 679         return full_result(); // FIXME Throw exception?
 680
 681     TextPosition sp = 0, ep = 0;
 682     // Search all occurrences
 683     Search(pattern, m, &sp, &ep);
 684
 685     full_result result;
 686     result.reserve(ep-sp+1); // Try to avoid reallocation.
 687
 688     // Report each occurrence
 689     for (; sp <= ep; ++sp)
 690     {
 691         TextPosition i = sp;
 692         TextPosition dist = 0;
 693         uchar c = alphabetrank->access(i);
 694         while (c != '\0' && !sampled->access(i))
 695         {
 696             i = C[c]+alphabetrank->rank(c,i)-1;
 697             c = alphabetrank->access(i);
 698             ++ dist;
 699         }
 700         if (c == '\0')
 701         {
 702             // Rank among the end-markers in BWT
 703             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 704
 705             // End-marker that we found belongs to the "preceeding" doc in collection:
 706             DocId docId = Doc->access(endmarkerRank);
 707             if (docId >= begin && docId <= end)
 708                 result.push_back(make_pair(docId, dist));
 709         }
 710         else
 711         {
 712             TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 713             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 714
 715             if (docId >= begin && docId <= end)
 716                 result.push_back(make_pair(docId, textPos));
 717         }
 718     }
 719
 720     return result;
 721 }
 722
 723
 724 /**
 725  * Save index to a file handle
 726  *
 727  * Throws a std::runtime_error exception on i/o error.
 728  * First byte that is saved represents the version number of the save file.
 729  * In version 2 files, the data fields are:
 730  *  uchar versionFlag;
 731     TextPosition n;
 732     unsigned samplerate;
 733     unsigned C[256];
 734     TextPosition bwtEndPos;
 735     static_sequence * alphabetrank;
 736     BSGAP *sampled;
 737     BlockArray *suffixes;
 738     BlockArray *suffixDocId;
 739     unsigned numberOfTexts;
 740     ulong maxTextLength;
 741     static_sequence *docId;
 742  */
 743 void TCImplementation::Save(FILE *file) const
 744 {
 745     // Saving version info:
 746     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 747         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 748
 749     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 750         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 751     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 752         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 753
 754     for(ulong i = 0; i < 256; ++i)
 755         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 756             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 757
 758     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 759         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 760
 761     alphabetrank->save(file);
 762     sampled->save(file);
 763     suffixes->Save(file);
 764     suffixDocId->Save(file);
 765
 766     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 767         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 768     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 769         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
 770
 771     Doc->save(file);
 772     fflush(file);
 773 }
 774
 775
 776 /**
 777  * Load index from a file handle
 778  *
 779  * Throws a std::runtime_error exception on i/o error.
 780  * For more info, see TCImplementation::Save().
 781  */
 782 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
 783     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 784       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 785 {
 786     uchar verFlag = 0;
 787     if (std::fread(&verFlag, 1, 1, file) != 1)
 788         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
 789     if (verFlag != TCImplementation::versionFlag)
 790         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
 791
 792     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 793         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
 794     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 795         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
 796 // FIXME samplerate can not be changed during load.
 797 //    if (this->samplerate == 0)
 798 //        this->samplerate = samplerate;
 799
 800     for(ulong i = 0; i < 256; ++i)
 801         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 802             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
 803
 804     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 805         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
 806
 807     alphabetrank = static_sequence::load(file);
 808     sampled = static_bitsequence::load(file);
 809     suffixes = new BlockArray(file);
 810     suffixDocId = new BlockArray(file);
 811
 812     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 813         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
 814     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 815         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
 816
 817     Doc = static_sequence::load(file);
 818
 819     // FIXME Construct data structures with new samplerate
 820     //maketables();
 821 }
 822
 823
 824
 825 /**
 826  * Rest of the functions follow...
 827  */
 828
 829
 830
 831 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 832 {
 833     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 834     int c = (int)pattern[m-1];
 835     TextPosition i=m-1;
 836     TextPosition sp = C[c];
 837     TextPosition ep = C[c+1]-1;
 838     while (sp<=ep && i>=1)
 839     {
 840 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 841         c = (int)pattern[--i];
 842         sp = C[c]+alphabetrank->rank(c,sp-1);
 843         ep = C[c]+alphabetrank->rank(c,ep)-1;
 844     }
 845     *spResult = sp;
 846     *epResult = ep;
 847     if (sp<=ep)
 848         return ep - sp + 1;
 849     else
 850         return 0;
 851 }
 852
 853 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 854 {
 855     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 856     int c = (int)pattern[m-1];
 857     assert(c == 0); // Start from endmarkers
 858     TextPosition i=m-1;
 859     TextPosition sp = begin;
 860     TextPosition ep = end;
 861     while (sp<=ep && i>=1)
 862     {
 863 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 864         c = (int)pattern[--i];
 865         sp = C[c]+alphabetrank->rank(c,sp-1);
 866         ep = C[c]+alphabetrank->rank(c,ep)-1;
 867     }
 868     *spResult = sp;
 869     *epResult = ep;
 870     if (sp<=ep)
 871         return ep - sp + 1;
 872     else
 873         return 0;
 874 }
 875
 876
 877 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 878 {
 879     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 880     uint c = (int)pattern[m-1];
 881     TextPosition i=m-1;
 882     TextPosition sp = 1;
 883     TextPosition ep = C[c+1]-1;
 884     while (sp<=ep && i>=1)
 885     {
 886 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 887         c = (int)pattern[--i];
 888         uint result = alphabetrank->rankLessThan(c,ep);
 889         if (result == ~0u)
 890             ep = 0;
 891         else
 892             ep = C[c]+result-1;
 893     }
 894     *spResult = sp;
 895     *epResult = ep;
 896     if (sp<=ep)
 897         return ep - sp + 1;
 898     else
 899         return 0;
 900 }
 901
 902
 903 TCImplementation::~TCImplementation() {
 904     delete alphabetrank;
 905     delete sampled;
 906     delete suffixes;
 907     delete suffixDocId;
 908     delete Doc;
 909 }
 910
 911 void TCImplementation::makewavelet(uchar *bwt)
 912 {
 913     ulong i, min = 0,
 914              max;
 915     for (i=0;i<256;i++)
 916         C[i]=0;
 917     for (i=0;i<n;++i)
 918         C[(int)bwt[i]]++;
 919     for (i=0;i<256;i++)
 920         if (C[i]>0) {min = i; break;}
 921     for (i=255;i>=min;--i)
 922         if (C[i]>0) {max = i; break;}
 923
 924     ulong prev=C[0], temp;
 925     C[0]=0;
 926     for (i=1;i<256;i++) {
 927         temp = C[i];
 928         C[i]=C[i-1]+prev;
 929         prev = temp;
 930     }
 931 //    this->codetable = node::makecodetable(bwt,n);
 932 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 933 //    delete [] bwt;
 934     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
 935 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 936
 937 #ifdef DEBUG_MEMUSAGE
 938     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 939     HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
 940 #endif
 941
 942     alphabet_mapper * am = new alphabet_mapper_none();
 943     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
 944     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
 945     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
 946     delete bmb;
 947     bwt = 0; // already deleted
 948
 949 #ifdef DEBUG_MEMUSAGE
 950     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 951     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 952 #endif
 953 }
 954
 955 void TCImplementation::maketables()
 956 {
 957     // Calculate BWT end-marker position (of last inserted text)
 958     {
 959         ulong i = 0;
 960         uint alphabetrank_i_tmp = 0;
 961         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
 962         while (c != '\0')
 963         {
 964             i = C[c]+alphabetrank_i_tmp-1;
 965             c = alphabetrank->access(i, alphabetrank_i_tmp);
 966         }
 967
 968         this->bwtEndPos = i;
 969     }
 970
 971     // Build up array for text starting positions
 972     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 973     (*textStartPos)[0] = 0;
 974
 975     // Mapping from end-markers to doc ID's:
 976     uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
 977
 978     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
 979     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
 980         sampledpositions[i] = 0;
 981
 982     ulong x,p=bwtEndPos;
 983     ulong sampleCount = 0;
 984     // Keeping track of text position of prev. end-marker seen
 985     ulong posOfSuccEndmarker = n;
 986     DocId textId = numberOfTexts;
 987     ulong ulongmax = 0;
 988     ulongmax--;
 989     uint alphabetrank_i_tmp =0;
 990
 991     /**
 992      * First pass: populate tables textStartPos and sampledpositions.
 993      */
 994     for (ulong i=n-1;i<ulongmax;i--) {
 995         // i substitutes SA->GetPos(i)
 996         x=(i==n-1)?0:i+1;
 997
 998         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
 999             set_field(sampledpositions,1,p,1);
1000             sampleCount ++;
1001         }
1002
1003         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1004         if (c == '\0')
1005         {
1006             --textId;
1007
1008             // Record the order of end-markers in BWT:
1009             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1010             endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1011
1012             // Store text length and text start position:
1013             if (textId < (DocId)numberOfTexts - 1)
1014             {
1015                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1016                 posOfSuccEndmarker = x;
1017             }
1018
1019             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1020             p = textId; // Correct LF-mapping to the last char of the previous text.
1021         }
1022         else // Now c != '\0', do LF-mapping:
1023             p = C[c]+alphabetrank_i_tmp-1;
1024     }
1025     assert(textId == 0);
1026
1027     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1028     delete [] sampledpositions;
1029     ulong sampleLength = sampled->rank1(n-1);
1030     assert(sampleCount == sampleLength);
1031
1032     // Suffixes store an offset from the text start position
1033     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1034     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1035
1036     p=bwtEndPos;
1037     textId = numberOfTexts;
1038
1039     /**
1040      * Second pass: populate tables suffixes and suffixDocId.
1041      */
1042     for (ulong i=n-1;i<ulongmax;i--) {
1043         x=(i==n-1)?0:i+1;
1044
1045         if (sampled->access(p)) {
1046             ulong j = sampled->rank1(p)-1;
1047
1048             (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1049
1050             // calculate offset from text start:
1051             (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1052         }
1053
1054         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1055         if (c == '\0')
1056         {
1057             --textId;
1058             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1059             p = textId; // Correct LF-mapping to the last char of the previous text.
1060         }
1061         else // Now c != '\0', do LF-mapping:
1062             p = C[c]+alphabetrank_i_tmp-1;
1063     }
1064     assert(textId == 0);
1065
1066     delete textStartPos;
1067
1068 #ifdef DEBUG_MEMUSAGE
1069     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1070     HeapProfiler::ResetMaxHeapConsumption();
1071 #endif
1072
1073     alphabet_mapper * am = new alphabet_mapper_none();
1074     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1075     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1076     delete bmb;
1077     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1078
1079 #ifdef DEBUG_MEMUSAGE
1080     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1081 #endif
1082 }
1083
1084
1085 /**
1086  * Finds document identifier for given text position
1087  *
1088  * Starting text position of the document is stored into second parameter.
1089  * Binary searching on text starting positions.
1090  */
1091 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1092 {
1093     assert(i < n);
1094
1095     DocId a = 0;
1096     DocId b = numberOfTexts - 1;
1097     while (a < b)
1098     {
1099         DocId c = a + (b - a)/2;
1100         if ((*textStartPos)[c] > i)
1101             b = c - 1;
1102         else if ((*textStartPos)[c+1] > i)
1103             return c;
1104         else
1105             a = c + 1;
1106     }
1107
1108     assert(a < (DocId)numberOfTexts);
1109     assert(i >= (*textStartPos)[a]);
1110     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1111     return a;
1112 }
1113
1114
1115 } // namespace SXSI
1116