FMIndex.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *   FMIndex implementation for the TextCollection interface                  *
   5  *                                                                            *
   6  *   This program is free software; you can redistribute it and/or modify     *
   7  *   it under the terms of the GNU Lesser General Public License as published *
   8  *   by the Free Software Foundation; either version 2 of the License, or     *
   9  *   (at your option) any later version.                                      *
  10  *                                                                            *
  11  *   This program is distributed in the hope that it will be useful,          *
  12  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  13  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  14  *   GNU Lesser General Public License for more details.                      *
  15  *                                                                            *
  16  *   You should have received a copy of the GNU Lesser General Public License *
  17  *   along with this program; if not, write to the                            *
  18  *   Free Software Foundation, Inc.,                                          *
  19  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  20  *****************************************************************************/
  21 #include "FMIndex.h"
  22
  23 //#define DEBUG_MEMUSAGE
  24 #ifdef DEBUG_MEMUSAGE
  25 #include "HeapProfiler.h" // FIXME remove
  26 #endif
  27
  28 #include <iostream>
  29 #include <map>
  30 #include <set>
  31 #include <vector>
  32 #include <utility>
  33 #include <stdexcept>
  34 #include <cassert>
  35 #include <cstring> // For strlen()
  36 using std::vector;
  37 using std::pair;
  38 using std::make_pair;
  39 using std::map;
  40 using std::string;
  41
  42 namespace SXSI
  43 {
  44
  45 // Save file version info
  46 const uchar FMIndex::versionFlag = 9;
  47
  48 /**
  49  * Constructor inits an empty dynamic FM-index.
  50  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  51  */
  52 FMIndex::FMIndex(uchar * bwt, ulong length, unsigned samplerate_,
  53                      unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_,
  54                      CSA::DeltaVector & notIndexed, const string & niText, char tsType)
  55     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  56       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  57 {
  58     makewavelet(bwt); // Deletes bwt!
  59     bwt = 0;
  60
  61     CSA::DeltaVector::Iterator notIndexedIt(notIndexed);
  62
  63     // Make sampling tables
  64     maketables(numberOfSamples_, tsType, notIndexedIt, niText);
  65 }
  66
  67 bool FMIndex::EmptyText(DocId k) const
  68 {
  69     assert(k < (DocId)numberOfTexts);
  70     return false; // Empty texts are not indexed
  71 }
  72
  73 uchar * FMIndex::GetText(DocId k) const
  74 {
  75     assert(k < (DocId)numberOfTexts);
  76
  77     return textStorage->GetText(k);
  78 /*    TextPosition i = k;
  79
  80     string result;
  81     // Reserve average string length to avoid reallocs
  82     result.reserve(n/numberOfTexts);
  83
  84     uchar c = alphabetrank->access(i);
  85     while (c != '\0')
  86     {
  87         result.push_back(c);
  88         i = C[c]+alphabetrank->rank(c,i)-1;
  89
  90         c = alphabetrank->access(i); // "next" char.
  91     }
  92
  93     // Convert to uchar (FIXME return string?)
  94     i = result.size();
  95     uchar* res = new uchar[i+1];
  96     res[i] = '\0';
  97     for (ulong j = 0; j < i; ++j)
  98         res[i-j-1] = result[j];
  99         return res;*/
 100 }
 101
 102 /*
 103  * Substring queries are supported via the pointer returned by TextStorage::GetText
 104 uchar* FMIndex::GetText(DocId k, TextPosition i, TextPosition j) const
 105 {
 106     assert(k < (DocId)numberOfTexts);
 107     assert(j < (*textLength)[k]);
 108     assert(i <= j);
 109
 110     ulong textRank = 0;
 111
 112     // Start position of k'th text
 113     ulong start = (*textStartPos)[k];
 114
 115     return Substring(i + start, j-i+1);
 116     }*/
 117
 118
 119
 120 /******************************************************************
 121  * Existential queries
 122  */
 123 bool FMIndex::IsPrefix(uchar const * pattern) const
 124 {
 125     TextPosition m = strlen((char *)pattern);
 126     if (m == 0)
 127         return true;
 128
 129     TextPosition sp = 0, ep = 0;
 130     Search(pattern, m, &sp, &ep);
 131
 132     // Check for end-marker(s) in result interval
 133     if (CountEndmarkers(sp, ep))
 134         return true;
 135     return false;
 136 }
 137
 138 bool FMIndex::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 139 {
 140     TextPosition m = strlen((char *)pattern);
 141     if (m == 0)
 142         return true;
 143
 144     TextPosition sp = 0, ep = 0;
 145     Search(pattern, m, &sp, &ep);
 146
 147     // Check for end-marker(s) in result interval
 148     if (CountEndmarkers(sp, ep, begin, end))
 149         return true;
 150     return false;
 151 }
 152
 153
 154 bool FMIndex::IsSuffix(uchar const *pattern) const
 155 {
 156     // Here counting is as fast as IsSuffix():
 157     if (CountSuffix(pattern) > 0)
 158         return true;
 159     return false;
 160 }
 161
 162 bool FMIndex::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 163 {
 164     // Here counting is as fast as IsSuffix():
 165     if (CountSuffix(pattern, begin, end) > 0)
 166         return true;
 167     return false;
 168 }
 169
 170 bool FMIndex::IsEqual(uchar const *pattern) const
 171 {
 172     TextPosition m = std::strlen((char *)pattern);
 173     if (m == 0)
 174         return false; // No empty texts exists
 175
 176     TextPosition sp = 0, ep = 0;
 177     // Match including end-marker
 178     Search(pattern, m+1, &sp, &ep);
 179
 180     // Check for end-marker(s) in result interval
 181     if (CountEndmarkers(sp, ep))
 182         return true;
 183     return false;
 184 }
 185
 186 bool FMIndex::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 187 {
 188     TextPosition m = std::strlen((char *)pattern);
 189     if (m == 0)
 190         return false; // No empty texts exists
 191
 192     TextPosition sp = 0, ep = 0;
 193     // Match including end-marker
 194     Search(pattern, m+1, &sp, &ep, begin, end);
 195
 196     // Check for end-marker(s) in result interval
 197     if (CountEndmarkers(sp, ep))
 198         return true;
 199     return false;
 200 }
 201
 202 bool FMIndex::IsContains(uchar const * pattern) const
 203 {
 204     TextPosition m = strlen((char *)pattern);
 205     if (m == 0)
 206         return true;
 207
 208     TextPosition sp = 0, ep = 0;
 209     // Just check if pattern exists somewhere
 210     ulong count = Search(pattern, m, &sp, &ep);
 211
 212     if (count > 0)
 213         return true;
 214     return false;
 215 }
 216
 217 bool FMIndex::IsContains(uchar const * pattern, DocId begin, DocId end) const
 218 {
 219     // Here counting is as fast as existential querying
 220     if (CountContains(pattern, begin, end) > 0)
 221         return true; // FIXME No need to filter result set
 222     return false;
 223 }
 224
 225 bool FMIndex::IsLessThan(uchar const * pattern) const
 226 {
 227     if (CountLessThan(pattern) > 0)
 228         return true;
 229     return false;
 230 }
 231
 232 bool FMIndex::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 233 {
 234     if (CountLessThan(pattern, begin, end) > 0)
 235         return true;
 236     return false;
 237 }
 238
 239 /******************************************************************
 240  * Counting queries
 241  */
 242 ulong FMIndex::Count(uchar const * pattern) const
 243 {
 244     TextPosition m = strlen((char *)pattern);
 245     if (m == 0)
 246         return 0;
 247
 248     TextPosition sp = 0, ep = 0;
 249     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 250     return count;
 251 }
 252
 253 unsigned FMIndex::CountPrefix(uchar const * pattern) const
 254 {
 255     TextPosition m = strlen((char *)pattern);
 256     if (m == 0)
 257         return numberOfTexts;
 258
 259     TextPosition sp = 0, ep = 0;
 260     Search(pattern, m, &sp, &ep);
 261
 262     // Count end-markers in result interval
 263     return CountEndmarkers(sp, ep);
 264 }
 265
 266 unsigned FMIndex::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 267 {
 268     TextPosition m = strlen((char *)pattern);
 269     if (m == 0)
 270         return numberOfTexts;
 271
 272     TextPosition sp = 0, ep = 0;
 273     Search(pattern, m, &sp, &ep);
 274
 275     // Count end-markers in result interval
 276     return CountEndmarkers(sp, ep, begin, end);
 277 }
 278
 279 unsigned FMIndex::CountSuffix(uchar const * pattern) const
 280 {
 281     TextPosition m = strlen((char *)pattern);
 282     if (m == 0)
 283         return numberOfTexts;
 284
 285     TextPosition sp = 0, ep = 0;
 286     // Search with end-marker
 287     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 288
 289     return count;
 290 }
 291
 292 unsigned FMIndex::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 293 {
 294     TextPosition m = strlen((char *)pattern);
 295     if (m == 0)
 296         return numberOfTexts;
 297
 298     TextPosition sp = 0, ep = 0;
 299     // Search with end-marker
 300     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 301
 302     return count;
 303 }
 304
 305 unsigned FMIndex::CountEqual(uchar const *pattern) const
 306 {
 307     TextPosition m = strlen((char const *)pattern);
 308     if (m == 0)
 309         return 0; // No empty texts.
 310
 311     TextPosition sp = 0, ep = 0;
 312     // Match including end-marker
 313     Search(pattern, m+1, &sp, &ep);
 314
 315     // Count end-markers in result interval
 316     return CountEndmarkers(sp, ep);
 317 }
 318
 319 unsigned FMIndex::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 320 {
 321     TextPosition m = strlen((char const *)pattern);
 322     if (m == 0)
 323         return 0; // No empty texts.
 324
 325     TextPosition sp = 0, ep = 0;
 326     // Match including end-marker
 327     Search(pattern, m+1, &sp, &ep, begin, end);
 328
 329     // Count end-markers in result interval
 330     return CountEndmarkers(sp, ep); // Already within [begin, end]
 331 }
 332
 333 unsigned FMIndex::CountContains(uchar const * pattern) const
 334 {
 335     TextPosition m = strlen((char const *)pattern);
 336     if (m == 0)
 337         return numberOfTexts; // Total number of texts.
 338
 339     // Here counting is as slow as fetching the result set
 340     // because we have to filter out occ's that fall within same document.
 341     TextCollection::document_result result = Contains(pattern);
 342     return result.size();
 343 }
 344
 345 unsigned FMIndex::CountContains(uchar const * pattern, DocId begin, DocId end) const
 346 {
 347     TextPosition m = strlen((char const *)pattern);
 348     if (m == 0)
 349         return numberOfTexts; // Total number of texts.
 350
 351     // Here counting is as slow as fetching the result set
 352     // because we have to filter out occ's that fall within same document.
 353     TextCollection::document_result result = Contains(pattern, begin, end);
 354     return result.size();
 355 }
 356
 357 // Less than or equal
 358 unsigned FMIndex::CountLessThan(uchar const * pattern) const
 359 {
 360     TextPosition m = strlen((char const *)pattern);
 361     if (m == 0)
 362         return 0; // No empty texts.
 363
 364     TextPosition sp = 0, ep = 0;
 365     SearchLessThan(pattern, m, &sp, &ep);
 366
 367     // Count end-markers in result interval
 368     return CountEndmarkers(sp, ep);
 369 }
 370
 371 unsigned FMIndex::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 372 {
 373     TextPosition m = strlen((char const *)pattern);
 374     if (m == 0)
 375         return 0; // No empty texts.
 376
 377     TextPosition sp = 0, ep = 0;
 378     SearchLessThan(pattern, m, &sp, &ep);
 379
 380     // Count end-markers in result interval
 381     return CountEndmarkers(sp, ep, begin, end);
 382 }
 383
 384 /**
 385  * Document reporting queries
 386  */
 387 TextCollection::document_result FMIndex::Prefix(uchar const * pattern) const
 388 {
 389     TextPosition m = strlen((char *)pattern);
 390     if (m == 0)
 391         return TextCollection::document_result(); // FIXME Should return all 1...k
 392
 393     TextPosition sp = 0, ep = 0;
 394     Search(pattern, m, &sp, &ep);
 395
 396     // Iterate through end-markers in [sp,ep]:
 397     return EnumerateEndmarkers(sp, ep);
 398 }
 399
 400 TextCollection::document_result FMIndex::Prefix(uchar const * pattern, DocId begin, DocId end) const
 401 {
 402     TextPosition m = strlen((char *)pattern);
 403     if (m == 0)
 404         return TextCollection::document_result(); // FIXME Should return all 1...k
 405
 406     TextPosition sp = 0, ep = 0;
 407     Search(pattern, m, &sp, &ep);
 408
 409     // Return end-markers in [sp,ep] and [begin, end]:
 410     return EnumerateEndmarkers(sp, ep, begin, end);
 411 }
 412
 413 TextCollection::document_result FMIndex::Suffix(uchar const * pattern) const
 414 {
 415     TextPosition m = strlen((char *)pattern);
 416     if (m == 0)
 417         return TextCollection::document_result(); // FIXME Should return all 1...k
 418
 419     TextPosition sp = 0, ep = 0;
 420     // Search with end-marker
 421     Search(pattern, m+1, &sp, &ep);
 422
 423     TextCollection::document_result result;
 424     result.reserve(ep-sp+1); // Try to avoid reallocation.
 425
 426     // Check each occurrence
 427     for (; sp <= ep; ++sp)
 428     {
 429         TextPosition i = sp;
 430
 431         uchar c = alphabetrank->access(i);
 432         while (c != '\0' && !sampled->access(i))
 433         {
 434             i = C[c]+alphabetrank->rank(c,i)-1;
 435             c = alphabetrank->access(i);
 436         }
 437         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 438
 439         if (c == '\0')
 440         {
 441             // Rank among the end-markers in BWT
 442             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 443             result.push_back(Doc->access(endmarkerRank));
 444         }
 445         else // Sampled position
 446         {
 447             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 448             result.push_back(docId);
 449         }
 450     }
 451
 452     return result;
 453 }
 454
 455 TextCollection::document_result FMIndex::Suffix(uchar const * pattern, DocId begin, DocId end) const
 456 {
 457     TextPosition m = strlen((char *)pattern);
 458     if (m == 0)
 459         return TextCollection::document_result(); // FIXME Should return all 1...k
 460
 461     TextPosition sp = 0, ep = 0;
 462     // Search with end-marker
 463     Search(pattern, m+1, &sp, &ep, begin, end);
 464
 465     TextCollection::document_result result;
 466     result.reserve(ep-sp+1); // Try to avoid reallocation.
 467
 468     // Check each occurrence, already within [begin, end]
 469     for (; sp <= ep; ++sp)
 470     {
 471         TextPosition i = sp;
 472
 473         uchar c = alphabetrank->access(i);
 474         while (c != '\0' && !sampled->access(i))
 475         {
 476             i = C[c]+alphabetrank->rank(c,i)-1;
 477             c = alphabetrank->access(i);
 478         }
 479         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 480
 481         if (c == '\0')
 482         {
 483             // Rank among the end-markers in BWT
 484             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 485             result.push_back(Doc->access(endmarkerRank));
 486         }
 487         else // Sampled position
 488         {
 489             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 490             result.push_back(docId);
 491         }
 492     }
 493
 494     return result;
 495 }
 496
 497
 498 TextCollection::document_result FMIndex::Equal(uchar const *pattern) const
 499 {
 500     TextPosition m = strlen((char const *)pattern);
 501     if (m == 0)
 502         return TextCollection::document_result(); // FIXME Should return all empty texts
 503
 504     TextPosition sp = 0, ep = 0;
 505     // Match including end-marker
 506     Search(pattern, m+1, &sp, &ep);
 507
 508     // Report end-markers in result interval
 509     return EnumerateEndmarkers(sp, ep);
 510 }
 511
 512 TextCollection::document_result FMIndex::Equal(uchar const *pattern, DocId begin, DocId end) const
 513 {
 514     TextPosition m = strlen((char const *)pattern);
 515     if (m == 0)
 516         return TextCollection::document_result(); // FIXME Should return all empty texts
 517
 518     TextPosition sp = 0, ep = 0;
 519     // Match including end-marker
 520     Search(pattern, m+1, &sp, &ep, begin, end);
 521
 522     // Report end-markers in result interval
 523     return EnumerateEndmarkers(sp, ep, begin, end);
 524 }
 525
 526
 527 TextCollection::document_result FMIndex::Contains(uchar const * pattern) const
 528 {
 529     TextPosition m = strlen((char *)pattern);
 530     if (m == 0)
 531         return TextCollection::document_result();
 532
 533     TextPosition sp = 0, ep = 0;
 534     // Search all occurrences
 535     Search(pattern, m, &sp, &ep);
 536
 537     // We want unique document indentifiers, using std::set to collect them
 538     std::set<DocId> resultSet;
 539     EnumerateDocuments(resultSet, sp, ep);
 540
 541     // Convert std::set to std::vector
 542     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 543     return result;
 544 }
 545
 546 TextCollection::document_result FMIndex::Contains(uchar const * pattern, DocId begin, DocId end) const
 547 {
 548     TextPosition m = strlen((char *)pattern);
 549     if (m == 0)
 550         return TextCollection::document_result();
 551
 552     TextPosition sp = 0, ep = 0;
 553     // Search all occurrences
 554     Search(pattern, m, &sp, &ep);
 555
 556     // We want unique document indentifiers, using std::set to collect them
 557     std::set<DocId> resultSet;
 558     EnumerateDocuments(resultSet, sp, ep, begin, end);
 559
 560     // Convert std::set to std::vector
 561     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 562     return result;
 563 }
 564
 565
 566 /**
 567  ***
 568 * * FIXME Lessthan or equal
 569  */
 570 TextCollection::document_result FMIndex::LessThan(uchar const * pattern) const
 571 {
 572     TextPosition m = strlen((char *)pattern);
 573     if (m == 0)
 574         return TextCollection::document_result(); // empty result set
 575
 576     TextPosition sp = 0, ep = 0;
 577     SearchLessThan(pattern, m, &sp, &ep);
 578
 579     // Report end-markers in result interval
 580     return EnumerateEndmarkers(sp, ep);
 581 }
 582
 583 TextCollection::document_result FMIndex::LessThan(uchar const * pattern, DocId begin, DocId end) const
 584 {
 585     TextPosition m = strlen((char *)pattern);
 586     if (m == 0)
 587         return TextCollection::document_result(); // empty result set
 588
 589     TextPosition sp = 0, ep = 0;
 590     SearchLessThan(pattern, m, &sp, &ep);
 591
 592     // Iterate through end-markers in [sp,ep] and [begin, end]:
 593     return EnumerateEndmarkers(sp, ep, begin, end);
 594 }
 595
 596
 597 TextCollection::document_result FMIndex::KMismaches(uchar const * pattern, unsigned k) const
 598 {
 599     TextPosition m = strlen((char *)pattern);
 600     if (m == 0)
 601         return TextCollection::document_result(); // empty result set
 602
 603     suffix_range_vector ranges;
 604     kmismatches(ranges, pattern, 0, n-1, m, k);
 605     std::set<DocId> resultSet;
 606
 607     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 608         // Iterate through docs in [sp,ep]:
 609         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 610
 611     // Convert std::set to std::vector
 612     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 613     return result;
 614 }
 615
 616 TextCollection::document_result FMIndex::KErrors(uchar const * pattern, unsigned k) const
 617 {
 618     TextPosition m = strlen((char *)pattern);
 619     if (m == 0)
 620         return TextCollection::document_result(); // empty result set
 621
 622     suffix_range_vector ranges;
 623     ulong *dd = new ulong[m+1];
 624     for (ulong i=0;i<m+1;i++)
 625         dd[i]=i;
 626     kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 627     delete [] dd;
 628
 629     std::set<DocId> resultSet;
 630     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 631         // Iterate through docs in [sp,ep]:
 632         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 633
 634     // Convert std::set to std::vector
 635     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 636     return result;
 637 }
 638
 639
 640 /**
 641  * Full result set queries
 642  */
 643 TextCollection::full_result FMIndex::FullContains(uchar const * pattern) const
 644 {
 645     TextPosition m = strlen((char *)pattern);
 646     if (m == 0)
 647         return full_result(); // FIXME Throw exception?
 648
 649     TextPosition sp = 0, ep = 0;
 650     // Search all occurrences
 651     Search(pattern, m, &sp, &ep);
 652
 653     full_result result;
 654     result.reserve(ep-sp+1); // Try to avoid reallocation.
 655     EnumeratePositions(result, sp, ep);
 656
 657     return result;
 658 }
 659
 660 TextCollection::full_result FMIndex::FullContains(uchar const * pattern, DocId begin, DocId end) const
 661 {
 662     TextPosition m = strlen((char *)pattern);
 663     if (m == 0)
 664         return full_result(); // FIXME Throw exception?
 665
 666     TextPosition sp = 0, ep = 0;
 667     // Search all occurrences
 668     Search(pattern, m, &sp, &ep);
 669
 670     full_result result;
 671     result.reserve(ep-sp+1); // Try to avoid reallocation.
 672     EnumeratePositions(result, sp, ep, begin, end);
 673
 674     return result;
 675 }
 676
 677 TextCollection::full_result FMIndex::FullKMismatches(uchar const * pattern, unsigned k) const
 678 {
 679     TextPosition m = strlen((char *)pattern);
 680     if (m == 0)
 681         return TextCollection::full_result(); // empty result set
 682
 683     suffix_range_vector ranges;
 684     ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
 685
 686     TextCollection::full_result result;
 687     result.reserve(count); // avoid reallocation.
 688     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 689         // Iterate through docs in [sp,ep]:
 690         EnumeratePositions(result, (*it).first, (*it).second);
 691     return result;
 692 }
 693
 694 TextCollection::full_result FMIndex::FullKErrors(uchar const * pattern, unsigned k) const
 695 {
 696     TextPosition m = strlen((char *)pattern);
 697     if (m == 0)
 698         return TextCollection::full_result(); // empty result set
 699
 700     suffix_range_vector ranges;
 701     ulong *dd = new ulong[m+1];
 702     for (unsigned i=0;i<m+1;i++)
 703         dd[i]=i;
 704     ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 705     delete [] dd;
 706
 707     TextCollection::full_result result;
 708     result.reserve(count); // avoid reallocation.
 709     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 710         // Iterate through docs in [sp,ep]:
 711         EnumeratePositions(result, (*it).first, (*it).second);
 712     return result;
 713 }
 714
 715
 716 /**
 717  * Save index to a file handle
 718  *
 719  * Throws a std::runtime_error exception on i/o error.
 720  * First byte that is saved represents the version number of the save file.
 721  * In this version files, the data fields are:
 722  *  uchar type;
 723     uchar versionFlag;
 724     TextPosition n;
 725     unsigned samplerate;
 726     unsigned C[256];
 727     TextPosition bwtEndPos;
 728     static_sequence * alphabetrank;
 729     BSGAP *sampled;
 730     BlockArray *suffixes;
 731     BlockArray *suffixDocId;
 732     unsigned numberOfTexts;
 733     ulong maxTextLength;
 734     static_sequence *docId;
 735  *
 736  * The second parameter (filename's prefix) is ignored.
 737  */
 738 void FMIndex::Save(FILE *file, char const *ignored) const
 739 {
 740     const char type = 'F';
 741     // Saving type info:
 742     if (std::fwrite(&type, 1, 1, file) != 1)
 743         throw std::runtime_error("FMIndex::Save(): file write error (type flag).");
 744
 745     // Saving version info:
 746     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 747         throw std::runtime_error("FMIndex::Save(): file write error (version flag).");
 748
 749     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 750         throw std::runtime_error("FMIndex::Save(): file write error (n).");
 751     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 752         throw std::runtime_error("FMIndex::Save(): file write error (samplerate).");
 753
 754     for(ulong i = 0; i < 256; ++i)
 755         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 756             throw std::runtime_error("FMIndex::Save(): file write error (C table).");
 757
 758     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 759         throw std::runtime_error("FMIndex::Save(): file write error (bwt end position).");
 760
 761     alphabetrank->save(file);
 762     sampled->save(file);
 763     suffixes->Save(file);
 764     suffixDocId->Save(file);
 765
 766     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 767         throw std::runtime_error("FMIndex::Save(): file write error (numberOfTexts).");
 768     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 769         throw std::runtime_error("FMIndex::Save(): file write error (maxTextLength).");
 770
 771     Doc->save(file);
 772     textStorage->Save(file);
 773     fflush(file);
 774 }
 775
 776
 777 /**
 778  * Load index from a file handle
 779  *
 780  * Throws a std::runtime_error exception on i/o error.
 781  * For more info, see FMIndex::Save().
 782  *
 783  * index_mode_t is defined in TextCollection.h and
 784  * defaults to both the index and "naive" text.
 785  *
 786  * Note: Samplerate can not be changed during load.
 787  */
 788 FMIndex::FMIndex(FILE *file, index_mode_t im, unsigned samplerate_)
 789     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 790       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 791 {
 792     // NB: Type byte has already been read from input
 793
 794     uchar verFlag = 0;
 795     if (std::fread(&verFlag, 1, 1, file) != 1)
 796         throw std::runtime_error("FMIndex::Load(): file read error (version flag).");
 797     if (verFlag != FMIndex::versionFlag)
 798         throw std::runtime_error("FMIndex::Load(): invalid save file version.");
 799
 800     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 801         throw std::runtime_error("FMIndex::Load(): file read error (n).");
 802     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 803         throw std::runtime_error("FMIndex::Load(): file read error (samplerate).");
 804 // FIXME samplerate can not be changed during load.
 805 //    if (this->samplerate == 0)
 806 //        this->samplerate = samplerate;
 807
 808     for(ulong i = 0; i < 256; ++i)
 809         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 810             throw std::runtime_error("FMIndex::Load(): file read error (C table).");
 811
 812     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 813         throw std::runtime_error("FMIndex::Load(): file read error (bwt end position).");
 814
 815     alphabetrank = static_sequence::load(file);
 816     if (im == index_mode_text_only) { delete alphabetrank; alphabetrank = 0; }
 817
 818     sampled = static_bitsequence::load(file);
 819     if (im == index_mode_text_only) { delete sampled; sampled = 0; }
 820     suffixes = new BlockArray(file);
 821     if (im == index_mode_text_only) { delete suffixes; suffixes = 0; }
 822     suffixDocId = new BlockArray(file);
 823     if (im == index_mode_text_only) { delete suffixDocId; suffixDocId = 0; }
 824
 825     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 826         throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts).");
 827     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 828         throw std::runtime_error("FMIndex::Load(): file read error (maxTextLength).");
 829
 830     Doc = new ArrayDoc(file); //static_sequence::load(file);
 831     if (im == index_mode_text_only) { delete Doc; Doc = 0; }
 832
 833     textStorage = TextStorage::Load(file);
 834
 835     // FIXME Construct data structures with new samplerate
 836     //maketables();
 837 }
 838
 839
 840
 841 /**
 842  * Rest of the functions follow...
 843  */
 844 ulong FMIndex::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
 845 {
 846     int c;
 847     while (*sp<=*ep && i>=1)
 848     {
 849         c = (int)pattern[--i];
 850         *sp = C[c]+alphabetrank->rank(c,*sp-1);
 851         *ep = C[c]+alphabetrank->rank(c,*ep)-1;
 852     }
 853     if (*sp<=*ep)
 854         return *ep - *sp + 1;
 855     else
 856         return 0;
 857 }
 858
 859
 860 ulong FMIndex::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
 861 {
 862     if (sp>ep) return 0;
 863     if (j == 0)
 864     {
 865         result.push_back(std::make_pair(sp,ep));
 866         return ep-sp+1;
 867     }
 868     int c;
 869     ulong spnew;
 870     ulong epnew;
 871     int knew;
 872     ulong sum=0;
 873     if (k==0)
 874     {
 875         sum = searchPrefix(pattern, j, &sp, &ep);
 876         if (sp<=ep)
 877             result.push_back(std::make_pair(sp, ep));
 878         return sum;
 879     }
 880     vector<int> chars = alphabetrank->accessAll(sp, ep);
 881     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 882     {
 883         if (*it == 0)
 884             continue; // skip '\0'
 885         c = *it;
 886         spnew = C[c]+alphabetrank->rank(c,sp-1);
 887         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 888         if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
 889         if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
 890     }
 891     return sum;
 892 }
 893
 894 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
 895 ulong FMIndex::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
 896 {
 897     ulong sum=0;
 898     if (d[m]<=k) // range of suffixes with at most k-errors found
 899     {
 900         if (sp<=ep)
 901             result.push_back(std::make_pair(sp, ep));
 902         sum += (sp<=ep)?ep-sp+1:0ul;
 903     }
 904     if (sp>ep || j==0)
 905         return sum;
 906     ulong *dnew = new ulong[m+1];
 907     int c;
 908     ulong spnew;
 909     ulong p,lowerbound;
 910     ulong epnew;
 911     vector<int> chars = alphabetrank->accessAll(sp, ep);
 912     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 913     {
 914         if (*it == 0)
 915             continue; // skip '\0'
 916         c = *it;
 917         spnew = C[c]+alphabetrank->rank(c,sp-1);
 918         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 919         if (spnew>epnew) continue;
 920         dnew[0]=m+k-j+1;
 921         lowerbound=k+1;
 922         for (p=1; p<=m; p++) {
 923             dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
 924             if (dnew[p]<lowerbound)
 925                 lowerbound = dnew[p];
 926         }
 927         if (lowerbound<=k)
 928             sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
 929     }
 930     delete [] dnew;
 931     return sum;
 932 }
 933
 934
 935 ulong FMIndex::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 936 {
 937     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 938     int c = (int)pattern[m-1];
 939     TextPosition i=m-1;
 940     TextPosition sp = C[c];
 941     TextPosition ep = C[c+1]-1;
 942     while (sp<=ep && i>=1)
 943     {
 944 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 945         c = (int)pattern[--i];
 946         sp = C[c]+alphabetrank->rank(c,sp-1);
 947         ep = C[c]+alphabetrank->rank(c,ep)-1;
 948     }
 949     *spResult = sp;
 950     *epResult = ep;
 951     if (sp<=ep)
 952         return ep - sp + 1;
 953     else
 954         return 0;
 955 }
 956
 957 ulong FMIndex::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 958 {
 959     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 960     int c = (int)pattern[m-1];
 961     assert(c == 0); // Start from endmarkers
 962     TextPosition i=m-1;
 963     TextPosition sp = begin;
 964     TextPosition ep = end;
 965     while (sp<=ep && i>=1)
 966     {
 967 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 968         c = (int)pattern[--i];
 969         sp = C[c]+alphabetrank->rank(c,sp-1);
 970         ep = C[c]+alphabetrank->rank(c,ep)-1;
 971     }
 972     *spResult = sp;
 973     *epResult = ep;
 974     if (sp<=ep)
 975         return ep - sp + 1;
 976     else
 977         return 0;
 978 }
 979
 980
 981 ulong FMIndex::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 982 {
 983     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 984     uint c = (int)pattern[m-1];
 985     TextPosition i=m-1;
 986     TextPosition sp = 1;
 987     TextPosition ep = C[c+1]-1;
 988     while (sp<=ep && i>=1)
 989     {
 990 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 991         c = (int)pattern[--i];
 992         uint result = alphabetrank->rankLessThan(c,ep);
 993         if (result == ~0u)
 994             ep = 0;
 995         else
 996             ep = C[c]+result-1;
 997     }
 998     *spResult = sp;
 999     *epResult = ep;
1000     if (sp<=ep)
1001         return ep - sp + 1;
1002     else
1003         return 0;
1004 }
1005
1006
1007 FMIndex::~FMIndex() {
1008     delete alphabetrank;
1009     delete sampled;
1010     delete suffixes;
1011     delete suffixDocId;
1012     delete Doc;
1013     delete textStorage;
1014 }
1015
1016 void FMIndex::makewavelet(uchar *bwt)
1017 {
1018     ulong i, min = 0,
1019              max;
1020     for (i=0;i<256;i++)
1021         C[i]=0;
1022     for (i=0;i<n;++i)
1023         C[(int)bwt[i]]++;
1024     for (i=0;i<256;i++)
1025         if (C[i]>0) {min = i; break;}
1026     for (i=255;i>=min;--i)
1027         if (C[i]>0) {max = i; break;}
1028
1029     ulong prev=C[0], temp;
1030     C[0]=0;
1031     for (i=1;i<256;i++) {
1032         temp = C[i];
1033         C[i]=C[i-1]+prev;
1034         prev = temp;
1035     }
1036 //    this->codetable = node::makecodetable(bwt,n);
1037 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1038 //    delete [] bwt;
1039     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1040 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1041
1042 #ifdef DEBUG_MEMUSAGE
1043     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1044     HeapProfiler::ResetMaxHeapConsumption();
1045 #endif
1046
1047     alphabet_mapper * am = new alphabet_mapper_none();
1048     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate?
1049     wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape
1050     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1051     delete bmb;
1052     bwt = 0; // already deleted
1053
1054 #ifdef DEBUG_MEMUSAGE
1055     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1056     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1057 #endif
1058 }
1059
1060 void FMIndex::maketables(ulong sampleLength, char tsType, CSA::DeltaVector::Iterator & notIndexedIt, const string & niText)
1061 {
1062     // Calculate BWT end-marker position (of last inserted text)
1063     {
1064         ulong i = 0;
1065         uint alphabetrank_i_tmp = 0;
1066         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1067         while (c != '\0')
1068         {
1069             i = C[c]+alphabetrank_i_tmp-1;
1070             c = alphabetrank->access(i, alphabetrank_i_tmp);
1071         }
1072
1073         this->bwtEndPos = i;
1074     }
1075
1076 #ifdef DEBUG_MEMUSAGE
1077     std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1078     HeapProfiler::ResetMaxHeapConsumption();
1079 #endif
1080
1081     // Build up array for text starting positions
1082 //    BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1083 //    (*textStartPos)[0] = 0;
1084
1085     // Mapping from end-markers to doc ID's:
1086     unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
1087 //    uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
1088     BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts);
1089
1090     BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
1091     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1092     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1093         sampledpositions[i] = 0;
1094
1095     ulong x,p=bwtEndPos;
1096     ulong sampleCount = 0;
1097     // Keeping track of text position of prev. end-marker seen
1098     ulong posOfSuccEndmarker = n-1;
1099     DocId textId = numberOfTexts;
1100     ulong ulongmax = 0;
1101     ulongmax--;
1102     uint alphabetrank_i_tmp =0;
1103
1104     // Text length = n + number of bytes not indexed.
1105     TextStorageBuilder tsbuilder(n + niText.length());
1106     ulong tsb_i = n + niText.length(); // Iterator from text length to 0.
1107     string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts
1108
1109     for (ulong i=n-1;i<ulongmax;i--) {
1110         // i substitutes SA->GetPos(i)
1111         x=(i==n-1)?0:i+1;
1112
1113         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1114
1115         tsbuilder[--tsb_i] = c; // Build TextStorage
1116
1117         if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
1118         {
1119             set_field(sampledpositions,1,p,1);
1120             (*positions)[sampleCount] = p;
1121             sampleCount ++;
1122         }
1123
1124         if (c == '\0')
1125         {
1126             unsigned prevTextId = textId; // Cache textId value.
1127             --textId;
1128             /**
1129              * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus,
1130              * we have to search for the first text that is actually *indexed*
1131              * to get correct prevTextId.
1132              */
1133             if (prevTextId == numberOfTexts)
1134             {
1135                 prevTextId = 0;
1136                 while (notIndexedIt.isSet(prevTextId))
1137                     ++ prevTextId;
1138                 // Now prevTextId points to the first indexed Doc ID.
1139             }
1140
1141             /**
1142              * Insert non-indexed texts
1143              */
1144             while (notIndexedIt.isSet(textId))
1145             {
1146                 do {
1147                     tsbuilder[tsb_i] = *nit_i;
1148                     -- tsb_i;
1149                     ++ nit_i;
1150                 } while (nit_i != niText.rend() && *nit_i != '\0');
1151
1152                 tsbuilder[tsb_i] = '\0';
1153
1154                 if (textId == 0)
1155                     break;
1156                 --textId;
1157             }
1158
1159             // Record the order of end-markers in BWT:
1160             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1161             //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
1162             (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts;
1163
1164             // Store text length and text start position:
1165             if (textId < (DocId)numberOfTexts - 1)
1166             {
1167 //                (*textStartPos)[textId + 1] = x;  // x-1 is text position of end-marker.
1168
1169                 posOfSuccEndmarker = i;
1170             }
1171
1172             // LF-mapping from '\0' does not work with this (pseudo) BWT.
1173             // Correct LF-mapping to the last char of the previous text:
1174             p = textId - notIndexedIt.rank(textId);
1175         }
1176         else // Now c != '\0', do LF-mapping:
1177             p = C[c]+alphabetrank_i_tmp-1;
1178     }
1179     while (textId > 0 && notIndexedIt.isSet(textId-1))
1180     {
1181         do {
1182             -- tsb_i;
1183             tsbuilder[tsb_i] = *nit_i;
1184             ++ nit_i;
1185         } while (nit_i != niText.rend() && *nit_i != '\0');
1186         --textId;
1187     }
1188     assert(textId == 0);
1189     assert(tsb_i == 0);
1190     assert(nit_i == niText.rend());
1191
1192 #ifdef DEBUG_MEMUSAGE
1193     std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1194     HeapProfiler::ResetMaxHeapConsumption();
1195 #endif
1196
1197     textStorage = tsbuilder.InitTextStorage(tsType);
1198
1199 #ifdef DEBUG_MEMUSAGE
1200     std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1201     HeapProfiler::ResetMaxHeapConsumption();
1202 #endif
1203
1204     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1205     delete [] sampledpositions;
1206     assert(sampleCount == sampleLength);
1207     assert(sampled->rank1(n-1) == sampleLength);
1208
1209 #ifdef DEBUG_MEMUSAGE
1210     std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1211     HeapProfiler::ResetMaxHeapConsumption();
1212 #endif
1213
1214     // Suffixes store an offset from the text start position
1215     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1216     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1217
1218     x = n + niText.length() - 2;
1219     textId = numberOfTexts - 1;
1220     posOfSuccEndmarker = x + 1;
1221     for(ulong i = 0; i < sampleLength; i ++) {
1222         // Find next sampled text position
1223         while ((posOfSuccEndmarker - x) % samplerate != 0
1224                || notIndexedIt.isSet(textId)) // Loop over non-indexed
1225         {
1226             --x;
1227             assert(x != ~0lu);
1228             if (textStorage->IsEndmarker(x))
1229             {
1230                 posOfSuccEndmarker = x--;
1231                 -- textId;
1232             }
1233         }
1234         assert((*positions)[i] < n);
1235         ulong j = sampled->rank1((*positions)[i]);
1236
1237         assert(j != 0); // if (j==0) j=sampleLength;
1238
1239         TextPosition textPos = (x==n-1)?0:x+1;
1240         (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos);
1241         assert(textStorage->DocIdAtTextPos(textPos) == textId);
1242
1243         assert((*suffixDocId)[j-1] < numberOfTexts);
1244         // calculate offset from text start:
1245         (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
1246         --x;
1247         if (x != ~0lu && textStorage->IsEndmarker(x))
1248         {
1249             posOfSuccEndmarker = x--;
1250             -- textId;
1251         }
1252     }
1253
1254     delete positions;
1255
1256 #ifdef DEBUG_MEMUSAGE
1257     std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1258     HeapProfiler::ResetMaxHeapConsumption();
1259 #endif
1260
1261 #ifdef DEBUG_MEMUSAGE
1262     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1263     HeapProfiler::ResetMaxHeapConsumption();
1264 #endif
1265
1266     /*alphabet_mapper * am = new alphabet_mapper_none();
1267     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
1268     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
1269     delete bmb;*/
1270     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1271
1272     Doc = new ArrayDoc(endmarkerDocId);
1273
1274 #ifdef DEBUG_MEMUSAGE
1275     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1276 #endif
1277 }
1278
1279
1280 /**
1281  * Finds document identifier for given text position
1282  *
1283  * Starting text position of the document is stored into second parameter.
1284  * Binary searching on text starting positions.
1285  */
1286 TextCollection::DocId FMIndex::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1287 {
1288     assert(i < n);
1289
1290     DocId a = 0;
1291     DocId b = numberOfTexts - 1;
1292     while (a < b)
1293     {
1294         DocId c = a + (b - a)/2;
1295         if ((*textStartPos)[c] > i)
1296             b = c - 1;
1297         else if ((*textStartPos)[c+1] > i)
1298             return c;
1299         else
1300             a = c + 1;
1301     }
1302
1303     assert(a < (DocId)numberOfTexts);
1304     assert(i >= (*textStartPos)[a]);
1305     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1306     return a;
1307 }
1308
1309
1310 } // namespace SXSI
1311