FMIndex.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *   FMIndex implementation for the TextCollection interface                  *
   5  *                                                                            *
   6  *   This program is free software; you can redistribute it and/or modify     *
   7  *   it under the terms of the GNU Lesser General Public License as published *
   8  *   by the Free Software Foundation; either version 2 of the License, or     *
   9  *   (at your option) any later version.                                      *
  10  *                                                                            *
  11  *   This program is distributed in the hope that it will be useful,          *
  12  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  13  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  14  *   GNU Lesser General Public License for more details.                      *
  15  *                                                                            *
  16  *   You should have received a copy of the GNU Lesser General Public License *
  17  *   along with this program; if not, write to the                            *
  18  *   Free Software Foundation, Inc.,                                          *
  19  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  20  *****************************************************************************/
  21 #include "FMIndex.h"
  22
  23 //#define DEBUG_MEMUSAGE
  24 #ifdef DEBUG_MEMUSAGE
  25 #include "HeapProfiler.h" // FIXME remove
  26 #endif
  27
  28 #include <iostream>
  29 #include <map>
  30 #include <set>
  31 #include <vector>
  32 #include <utility>
  33 #include <stdexcept>
  34 #include <cassert>
  35 #include <cstring> // For strlen()
  36 using std::vector;
  37 using std::pair;
  38 using std::make_pair;
  39 using std::map;
  40 using std::string;
  41
  42 namespace SXSI
  43 {
  44
  45 // Save file version info
  46 const uchar FMIndex::versionFlag = 9;
  47
  48 /**
  49  * Constructor inits an empty dynamic FM-index.
  50  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  51  */
  52 FMIndex::FMIndex(uchar * bwt, ulong length, unsigned samplerate_,
  53                      unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_,
  54                      CSA::DeltaVector & notIndexed, const string & niText, char tsType)
  55     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  56       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  57 {
  58     makewavelet(bwt); // Deletes bwt!
  59     bwt = 0;
  60
  61     // Make sampling tables
  62     maketables(numberOfSamples_, tsType, notIndexed, niText);
  63 }
  64
  65 bool FMIndex::EmptyText(DocId k) const
  66 {
  67     assert(k < (DocId)numberOfTexts);
  68     return false; // Empty texts are not indexed
  69 }
  70
  71 uchar * FMIndex::GetText(DocId k) const
  72 {
  73     assert(k < (DocId)numberOfTexts);
  74
  75     return textStorage->GetText(k);
  76 /*    TextPosition i = k;
  77
  78     string result;
  79     // Reserve average string length to avoid reallocs
  80     result.reserve(n/numberOfTexts);
  81
  82     uchar c = alphabetrank->access(i);
  83     while (c != '\0')
  84     {
  85         result.push_back(c);
  86         i = C[c]+alphabetrank->rank(c,i)-1;
  87
  88         c = alphabetrank->access(i); // "next" char.
  89     }
  90
  91     // Convert to uchar (FIXME return string?)
  92     i = result.size();
  93     uchar* res = new uchar[i+1];
  94     res[i] = '\0';
  95     for (ulong j = 0; j < i; ++j)
  96         res[i-j-1] = result[j];
  97         return res;*/
  98 }
  99
 100 /*
 101  * Substring queries are supported via the pointer returned by TextStorage::GetText
 102 uchar* FMIndex::GetText(DocId k, TextPosition i, TextPosition j) const
 103 {
 104     assert(k < (DocId)numberOfTexts);
 105     assert(j < (*textLength)[k]);
 106     assert(i <= j);
 107
 108     ulong textRank = 0;
 109
 110     // Start position of k'th text
 111     ulong start = (*textStartPos)[k];
 112
 113     return Substring(i + start, j-i+1);
 114     }*/
 115
 116
 117
 118 /******************************************************************
 119  * Existential queries
 120  */
 121 bool FMIndex::IsPrefix(uchar const * pattern) const
 122 {
 123     TextPosition m = strlen((char *)pattern);
 124     if (m == 0)
 125         return true;
 126
 127     TextPosition sp = 0, ep = 0;
 128     Search(pattern, m, &sp, &ep);
 129
 130     // Check for end-marker(s) in result interval
 131     if (CountEndmarkers(sp, ep))
 132         return true;
 133     return false;
 134 }
 135
 136 bool FMIndex::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 137 {
 138     TextPosition m = strlen((char *)pattern);
 139     if (m == 0)
 140         return true;
 141
 142     TextPosition sp = 0, ep = 0;
 143     Search(pattern, m, &sp, &ep);
 144
 145     // Check for end-marker(s) in result interval
 146     if (CountEndmarkers(sp, ep, begin, end))
 147         return true;
 148     return false;
 149 }
 150
 151
 152 bool FMIndex::IsSuffix(uchar const *pattern) const
 153 {
 154     // Here counting is as fast as IsSuffix():
 155     if (CountSuffix(pattern) > 0)
 156         return true;
 157     return false;
 158 }
 159
 160 bool FMIndex::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 161 {
 162     // Here counting is as fast as IsSuffix():
 163     if (CountSuffix(pattern, begin, end) > 0)
 164         return true;
 165     return false;
 166 }
 167
 168 bool FMIndex::IsEqual(uchar const *pattern) const
 169 {
 170     TextPosition m = std::strlen((char *)pattern);
 171     if (m == 0)
 172         return false; // No empty texts exists
 173
 174     TextPosition sp = 0, ep = 0;
 175     // Match including end-marker
 176     Search(pattern, m+1, &sp, &ep);
 177
 178     // Check for end-marker(s) in result interval
 179     if (CountEndmarkers(sp, ep))
 180         return true;
 181     return false;
 182 }
 183
 184 bool FMIndex::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 185 {
 186     TextPosition m = std::strlen((char *)pattern);
 187     if (m == 0)
 188         return false; // No empty texts exists
 189
 190     TextPosition sp = 0, ep = 0;
 191     // Match including end-marker
 192     Search(pattern, m+1, &sp, &ep, begin, end);
 193
 194     // Check for end-marker(s) in result interval
 195     if (CountEndmarkers(sp, ep))
 196         return true;
 197     return false;
 198 }
 199
 200 bool FMIndex::IsContains(uchar const * pattern) const
 201 {
 202     TextPosition m = strlen((char *)pattern);
 203     if (m == 0)
 204         return true;
 205
 206     TextPosition sp = 0, ep = 0;
 207     // Just check if pattern exists somewhere
 208     ulong count = Search(pattern, m, &sp, &ep);
 209
 210     if (count > 0)
 211         return true;
 212     return false;
 213 }
 214
 215 bool FMIndex::IsContains(uchar const * pattern, DocId begin, DocId end) const
 216 {
 217     // Here counting is as fast as existential querying
 218     if (CountContains(pattern, begin, end) > 0)
 219         return true; // FIXME No need to filter result set
 220     return false;
 221 }
 222
 223 bool FMIndex::IsLessThan(uchar const * pattern) const
 224 {
 225     if (CountLessThan(pattern) > 0)
 226         return true;
 227     return false;
 228 }
 229
 230 bool FMIndex::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 231 {
 232     if (CountLessThan(pattern, begin, end) > 0)
 233         return true;
 234     return false;
 235 }
 236
 237 /******************************************************************
 238  * Counting queries
 239  */
 240 ulong FMIndex::Count(uchar const * pattern) const
 241 {
 242     TextPosition m = strlen((char *)pattern);
 243     if (m == 0)
 244         return 0;
 245
 246     TextPosition sp = 0, ep = 0;
 247     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 248     return count;
 249 }
 250
 251 unsigned FMIndex::CountPrefix(uchar const * pattern) const
 252 {
 253     TextPosition m = strlen((char *)pattern);
 254     if (m == 0)
 255         return numberOfTexts;
 256
 257     TextPosition sp = 0, ep = 0;
 258     Search(pattern, m, &sp, &ep);
 259
 260     // Count end-markers in result interval
 261     return CountEndmarkers(sp, ep);
 262 }
 263
 264 unsigned FMIndex::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 265 {
 266     TextPosition m = strlen((char *)pattern);
 267     if (m == 0)
 268         return numberOfTexts;
 269
 270     TextPosition sp = 0, ep = 0;
 271     Search(pattern, m, &sp, &ep);
 272
 273     // Count end-markers in result interval
 274     return CountEndmarkers(sp, ep, begin, end);
 275 }
 276
 277 unsigned FMIndex::CountSuffix(uchar const * pattern) const
 278 {
 279     TextPosition m = strlen((char *)pattern);
 280     if (m == 0)
 281         return numberOfTexts;
 282
 283     TextPosition sp = 0, ep = 0;
 284     // Search with end-marker
 285     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 286
 287     return count;
 288 }
 289
 290 unsigned FMIndex::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 291 {
 292     TextPosition m = strlen((char *)pattern);
 293     if (m == 0)
 294         return numberOfTexts;
 295
 296     TextPosition sp = 0, ep = 0;
 297     // Search with end-marker
 298     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 299
 300     return count;
 301 }
 302
 303 unsigned FMIndex::CountEqual(uchar const *pattern) const
 304 {
 305     TextPosition m = strlen((char const *)pattern);
 306     if (m == 0)
 307         return 0; // No empty texts.
 308
 309     TextPosition sp = 0, ep = 0;
 310     // Match including end-marker
 311     Search(pattern, m+1, &sp, &ep);
 312
 313     // Count end-markers in result interval
 314     return CountEndmarkers(sp, ep);
 315 }
 316
 317 unsigned FMIndex::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 318 {
 319     TextPosition m = strlen((char const *)pattern);
 320     if (m == 0)
 321         return 0; // No empty texts.
 322
 323     TextPosition sp = 0, ep = 0;
 324     // Match including end-marker
 325     Search(pattern, m+1, &sp, &ep, begin, end);
 326
 327     // Count end-markers in result interval
 328     return CountEndmarkers(sp, ep); // Already within [begin, end]
 329 }
 330
 331 unsigned FMIndex::CountContains(uchar const * pattern) const
 332 {
 333     TextPosition m = strlen((char const *)pattern);
 334     if (m == 0)
 335         return numberOfTexts; // Total number of texts.
 336
 337     // Here counting is as slow as fetching the result set
 338     // because we have to filter out occ's that fall within same document.
 339     TextCollection::document_result result = Contains(pattern);
 340     return result.size();
 341 }
 342
 343 unsigned FMIndex::CountContains(uchar const * pattern, DocId begin, DocId end) const
 344 {
 345     TextPosition m = strlen((char const *)pattern);
 346     if (m == 0)
 347         return numberOfTexts; // Total number of texts.
 348
 349     // Here counting is as slow as fetching the result set
 350     // because we have to filter out occ's that fall within same document.
 351     TextCollection::document_result result = Contains(pattern, begin, end);
 352     return result.size();
 353 }
 354
 355 // Less than or equal
 356 unsigned FMIndex::CountLessThan(uchar const * pattern) const
 357 {
 358     TextPosition m = strlen((char const *)pattern);
 359     if (m == 0)
 360         return 0; // No empty texts.
 361
 362     TextPosition sp = 0, ep = 0;
 363     SearchLessThan(pattern, m, &sp, &ep);
 364
 365     // Count end-markers in result interval
 366     return CountEndmarkers(sp, ep);
 367 }
 368
 369 unsigned FMIndex::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 370 {
 371     TextPosition m = strlen((char const *)pattern);
 372     if (m == 0)
 373         return 0; // No empty texts.
 374
 375     TextPosition sp = 0, ep = 0;
 376     SearchLessThan(pattern, m, &sp, &ep);
 377
 378     // Count end-markers in result interval
 379     return CountEndmarkers(sp, ep, begin, end);
 380 }
 381
 382 /**
 383  * Document reporting queries
 384  */
 385 TextCollection::document_result FMIndex::Prefix(uchar const * pattern) const
 386 {
 387     TextPosition m = strlen((char *)pattern);
 388     if (m == 0)
 389         return TextCollection::document_result(); // FIXME Should return all 1...k
 390
 391     TextPosition sp = 0, ep = 0;
 392     Search(pattern, m, &sp, &ep);
 393
 394     // Iterate through end-markers in [sp,ep]:
 395     return EnumerateEndmarkers(sp, ep);
 396 }
 397
 398 TextCollection::document_result FMIndex::Prefix(uchar const * pattern, DocId begin, DocId end) const
 399 {
 400     TextPosition m = strlen((char *)pattern);
 401     if (m == 0)
 402         return TextCollection::document_result(); // FIXME Should return all 1...k
 403
 404     TextPosition sp = 0, ep = 0;
 405     Search(pattern, m, &sp, &ep);
 406
 407     // Return end-markers in [sp,ep] and [begin, end]:
 408     return EnumerateEndmarkers(sp, ep, begin, end);
 409 }
 410
 411 TextCollection::document_result FMIndex::Suffix(uchar const * pattern) const
 412 {
 413     TextPosition m = strlen((char *)pattern);
 414     if (m == 0)
 415         return TextCollection::document_result(); // FIXME Should return all 1...k
 416
 417     TextPosition sp = 0, ep = 0;
 418     // Search with end-marker
 419     Search(pattern, m+1, &sp, &ep);
 420
 421     TextCollection::document_result result;
 422     result.reserve(ep-sp+1); // Try to avoid reallocation.
 423
 424     // Check each occurrence
 425     for (; sp <= ep; ++sp)
 426     {
 427         TextPosition i = sp;
 428
 429         uchar c = alphabetrank->access(i);
 430         while (c != '\0' && !sampled->access(i))
 431         {
 432             i = C[c]+alphabetrank->rank(c,i)-1;
 433             c = alphabetrank->access(i);
 434         }
 435         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 436
 437         if (c == '\0')
 438         {
 439             // Rank among the end-markers in BWT
 440             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 441             result.push_back(Doc->access(endmarkerRank));
 442         }
 443         else // Sampled position
 444         {
 445             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 446             result.push_back(docId);
 447         }
 448     }
 449
 450     return result;
 451 }
 452
 453 TextCollection::document_result FMIndex::Suffix(uchar const * pattern, DocId begin, DocId end) const
 454 {
 455     TextPosition m = strlen((char *)pattern);
 456     if (m == 0)
 457         return TextCollection::document_result(); // FIXME Should return all 1...k
 458
 459     TextPosition sp = 0, ep = 0;
 460     // Search with end-marker
 461     Search(pattern, m+1, &sp, &ep, begin, end);
 462
 463     TextCollection::document_result result;
 464     result.reserve(ep-sp+1); // Try to avoid reallocation.
 465
 466     // Check each occurrence, already within [begin, end]
 467     for (; sp <= ep; ++sp)
 468     {
 469         TextPosition i = sp;
 470
 471         uchar c = alphabetrank->access(i);
 472         while (c != '\0' && !sampled->access(i))
 473         {
 474             i = C[c]+alphabetrank->rank(c,i)-1;
 475             c = alphabetrank->access(i);
 476         }
 477         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 478
 479         if (c == '\0')
 480         {
 481             // Rank among the end-markers in BWT
 482             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 483             result.push_back(Doc->access(endmarkerRank));
 484         }
 485         else // Sampled position
 486         {
 487             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 488             result.push_back(docId);
 489         }
 490     }
 491
 492     return result;
 493 }
 494
 495
 496 TextCollection::document_result FMIndex::Equal(uchar const *pattern) const
 497 {
 498     TextPosition m = strlen((char const *)pattern);
 499     if (m == 0)
 500         return TextCollection::document_result(); // FIXME Should return all empty texts
 501
 502     TextPosition sp = 0, ep = 0;
 503     // Match including end-marker
 504     Search(pattern, m+1, &sp, &ep);
 505
 506     // Report end-markers in result interval
 507     return EnumerateEndmarkers(sp, ep);
 508 }
 509
 510 TextCollection::document_result FMIndex::Equal(uchar const *pattern, DocId begin, DocId end) const
 511 {
 512     TextPosition m = strlen((char const *)pattern);
 513     if (m == 0)
 514         return TextCollection::document_result(); // FIXME Should return all empty texts
 515
 516     TextPosition sp = 0, ep = 0;
 517     // Match including end-marker
 518     Search(pattern, m+1, &sp, &ep, begin, end);
 519
 520     // Report end-markers in result interval
 521     return EnumerateEndmarkers(sp, ep, begin, end);
 522 }
 523
 524
 525 TextCollection::document_result FMIndex::Contains(uchar const * pattern) const
 526 {
 527     TextPosition m = strlen((char *)pattern);
 528     if (m == 0)
 529         return TextCollection::document_result();
 530
 531     TextPosition sp = 0, ep = 0;
 532     // Search all occurrences
 533     Search(pattern, m, &sp, &ep);
 534
 535     // We want unique document indentifiers, using std::set to collect them
 536     std::set<DocId> resultSet;
 537     EnumerateDocuments(resultSet, sp, ep);
 538
 539     // Convert std::set to std::vector
 540     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 541     return result;
 542 }
 543
 544 TextCollection::document_result FMIndex::Contains(uchar const * pattern, DocId begin, DocId end) const
 545 {
 546     TextPosition m = strlen((char *)pattern);
 547     if (m == 0)
 548         return TextCollection::document_result();
 549
 550     TextPosition sp = 0, ep = 0;
 551     // Search all occurrences
 552     Search(pattern, m, &sp, &ep);
 553
 554     // We want unique document indentifiers, using std::set to collect them
 555     std::set<DocId> resultSet;
 556     EnumerateDocuments(resultSet, sp, ep, begin, end);
 557
 558     // Convert std::set to std::vector
 559     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 560     return result;
 561 }
 562
 563
 564 /**
 565  ***
 566 * * FIXME Lessthan or equal
 567  */
 568 TextCollection::document_result FMIndex::LessThan(uchar const * pattern) const
 569 {
 570     TextPosition m = strlen((char *)pattern);
 571     if (m == 0)
 572         return TextCollection::document_result(); // empty result set
 573
 574     TextPosition sp = 0, ep = 0;
 575     SearchLessThan(pattern, m, &sp, &ep);
 576
 577     // Report end-markers in result interval
 578     return EnumerateEndmarkers(sp, ep);
 579 }
 580
 581 TextCollection::document_result FMIndex::LessThan(uchar const * pattern, DocId begin, DocId end) const
 582 {
 583     TextPosition m = strlen((char *)pattern);
 584     if (m == 0)
 585         return TextCollection::document_result(); // empty result set
 586
 587     TextPosition sp = 0, ep = 0;
 588     SearchLessThan(pattern, m, &sp, &ep);
 589
 590     // Iterate through end-markers in [sp,ep] and [begin, end]:
 591     return EnumerateEndmarkers(sp, ep, begin, end);
 592 }
 593
 594
 595 TextCollection::document_result FMIndex::KMismaches(uchar const * pattern, unsigned k) const
 596 {
 597     TextPosition m = strlen((char *)pattern);
 598     if (m == 0)
 599         return TextCollection::document_result(); // empty result set
 600
 601     suffix_range_vector ranges;
 602     kmismatches(ranges, pattern, 0, n-1, m, k);
 603     std::set<DocId> resultSet;
 604
 605     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 606         // Iterate through docs in [sp,ep]:
 607         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 608
 609     // Convert std::set to std::vector
 610     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 611     return result;
 612 }
 613
 614 TextCollection::document_result FMIndex::KErrors(uchar const * pattern, unsigned k) const
 615 {
 616     TextPosition m = strlen((char *)pattern);
 617     if (m == 0)
 618         return TextCollection::document_result(); // empty result set
 619
 620     suffix_range_vector ranges;
 621     ulong *dd = new ulong[m+1];
 622     for (ulong i=0;i<m+1;i++)
 623         dd[i]=i;
 624     kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 625     delete [] dd;
 626
 627     std::set<DocId> resultSet;
 628     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 629         // Iterate through docs in [sp,ep]:
 630         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 631
 632     // Convert std::set to std::vector
 633     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 634     return result;
 635 }
 636
 637
 638 /**
 639  * Full result set queries
 640  */
 641 TextCollection::full_result FMIndex::FullContains(uchar const * pattern) const
 642 {
 643     TextPosition m = strlen((char *)pattern);
 644     if (m == 0)
 645         return full_result(); // FIXME Throw exception?
 646
 647     TextPosition sp = 0, ep = 0;
 648     // Search all occurrences
 649     Search(pattern, m, &sp, &ep);
 650
 651     full_result result;
 652     result.reserve(ep-sp+1); // Try to avoid reallocation.
 653     EnumeratePositions(result, sp, ep);
 654
 655     return result;
 656 }
 657
 658 TextCollection::full_result FMIndex::FullContains(uchar const * pattern, DocId begin, DocId end) const
 659 {
 660     TextPosition m = strlen((char *)pattern);
 661     if (m == 0)
 662         return full_result(); // FIXME Throw exception?
 663
 664     TextPosition sp = 0, ep = 0;
 665     // Search all occurrences
 666     Search(pattern, m, &sp, &ep);
 667
 668     full_result result;
 669     result.reserve(ep-sp+1); // Try to avoid reallocation.
 670     EnumeratePositions(result, sp, ep, begin, end);
 671
 672     return result;
 673 }
 674
 675 TextCollection::full_result FMIndex::FullKMismatches(uchar const * pattern, unsigned k) const
 676 {
 677     TextPosition m = strlen((char *)pattern);
 678     if (m == 0)
 679         return TextCollection::full_result(); // empty result set
 680
 681     suffix_range_vector ranges;
 682     ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
 683
 684     TextCollection::full_result result;
 685     result.reserve(count); // avoid reallocation.
 686     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 687         // Iterate through docs in [sp,ep]:
 688         EnumeratePositions(result, (*it).first, (*it).second);
 689     return result;
 690 }
 691
 692 TextCollection::full_result FMIndex::FullKErrors(uchar const * pattern, unsigned k) const
 693 {
 694     TextPosition m = strlen((char *)pattern);
 695     if (m == 0)
 696         return TextCollection::full_result(); // empty result set
 697
 698     suffix_range_vector ranges;
 699     ulong *dd = new ulong[m+1];
 700     for (unsigned i=0;i<m+1;i++)
 701         dd[i]=i;
 702     ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 703     delete [] dd;
 704
 705     TextCollection::full_result result;
 706     result.reserve(count); // avoid reallocation.
 707     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 708         // Iterate through docs in [sp,ep]:
 709         EnumeratePositions(result, (*it).first, (*it).second);
 710     return result;
 711 }
 712
 713
 714 /**
 715  * Save index to a file handle
 716  *
 717  * Throws a std::runtime_error exception on i/o error.
 718  * First byte that is saved represents the version number of the save file.
 719  * In this version files, the data fields are:
 720  *  uchar type;
 721     uchar versionFlag;
 722     TextPosition n;
 723     unsigned samplerate;
 724     unsigned C[256];
 725     TextPosition bwtEndPos;
 726     static_sequence * alphabetrank;
 727     BSGAP *sampled;
 728     BlockArray *suffixes;
 729     BlockArray *suffixDocId;
 730     unsigned numberOfTexts;
 731     ulong maxTextLength;
 732     static_sequence *docId;
 733  *
 734  * The second parameter (filename's prefix) is ignored.
 735  */
 736 void FMIndex::Save(FILE *file, char const *ignored) const
 737 {
 738     const char type = 'F';
 739     // Saving type info:
 740     if (std::fwrite(&type, 1, 1, file) != 1)
 741         throw std::runtime_error("FMIndex::Save(): file write error (type flag).");
 742
 743     // Saving version info:
 744     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 745         throw std::runtime_error("FMIndex::Save(): file write error (version flag).");
 746
 747     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 748         throw std::runtime_error("FMIndex::Save(): file write error (n).");
 749     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 750         throw std::runtime_error("FMIndex::Save(): file write error (samplerate).");
 751
 752     for(ulong i = 0; i < 256; ++i)
 753         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 754             throw std::runtime_error("FMIndex::Save(): file write error (C table).");
 755
 756     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 757         throw std::runtime_error("FMIndex::Save(): file write error (bwt end position).");
 758
 759     alphabetrank->save(file);
 760     sampled->save(file);
 761     suffixes->Save(file);
 762     suffixDocId->Save(file);
 763
 764     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 765         throw std::runtime_error("FMIndex::Save(): file write error (numberOfTexts).");
 766     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 767         throw std::runtime_error("FMIndex::Save(): file write error (maxTextLength).");
 768
 769     Doc->save(file);
 770     textStorage->Save(file);
 771     fflush(file);
 772 }
 773
 774
 775 /**
 776  * Load index from a file handle
 777  *
 778  * Throws a std::runtime_error exception on i/o error.
 779  * For more info, see FMIndex::Save().
 780  *
 781  * index_mode_t is defined in TextCollection.h and
 782  * defaults to both the index and "naive" text.
 783  *
 784  * Note: Samplerate can not be changed during load.
 785  */
 786 FMIndex::FMIndex(FILE *file, index_mode_t im, unsigned samplerate_)
 787     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 788       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 789 {
 790     // NB: Type byte has already been read from input
 791
 792     uchar verFlag = 0;
 793     if (std::fread(&verFlag, 1, 1, file) != 1)
 794         throw std::runtime_error("FMIndex::Load(): file read error (version flag).");
 795     if (verFlag != FMIndex::versionFlag)
 796         throw std::runtime_error("FMIndex::Load(): invalid save file version.");
 797
 798     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 799         throw std::runtime_error("FMIndex::Load(): file read error (n).");
 800     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 801         throw std::runtime_error("FMIndex::Load(): file read error (samplerate).");
 802 // FIXME samplerate can not be changed during load.
 803 //    if (this->samplerate == 0)
 804 //        this->samplerate = samplerate;
 805
 806     for(ulong i = 0; i < 256; ++i)
 807         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 808             throw std::runtime_error("FMIndex::Load(): file read error (C table).");
 809
 810     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 811         throw std::runtime_error("FMIndex::Load(): file read error (bwt end position).");
 812
 813     alphabetrank = static_sequence::load(file);
 814     if (im == index_mode_text_only) { delete alphabetrank; alphabetrank = 0; }
 815
 816     sampled = static_bitsequence::load(file);
 817     if (im == index_mode_text_only) { delete sampled; sampled = 0; }
 818     suffixes = new BlockArray(file);
 819     if (im == index_mode_text_only) { delete suffixes; suffixes = 0; }
 820     suffixDocId = new BlockArray(file);
 821     if (im == index_mode_text_only) { delete suffixDocId; suffixDocId = 0; }
 822
 823     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 824         throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts).");
 825     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 826         throw std::runtime_error("FMIndex::Load(): file read error (maxTextLength).");
 827
 828     Doc = new ArrayDoc(file); //static_sequence::load(file);
 829     if (im == index_mode_text_only) { delete Doc; Doc = 0; }
 830
 831     textStorage = TextStorage::Load(file);
 832
 833     // FIXME Construct data structures with new samplerate
 834     //maketables();
 835 }
 836
 837
 838
 839 /**
 840  * Rest of the functions follow...
 841  */
 842 ulong FMIndex::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
 843 {
 844     int c;
 845     while (*sp<=*ep && i>=1)
 846     {
 847         c = (int)pattern[--i];
 848         *sp = C[c]+alphabetrank->rank(c,*sp-1);
 849         *ep = C[c]+alphabetrank->rank(c,*ep)-1;
 850     }
 851     if (*sp<=*ep)
 852         return *ep - *sp + 1;
 853     else
 854         return 0;
 855 }
 856
 857
 858 ulong FMIndex::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
 859 {
 860     if (sp>ep) return 0;
 861     if (j == 0)
 862     {
 863         result.push_back(std::make_pair(sp,ep));
 864         return ep-sp+1;
 865     }
 866     int c;
 867     ulong spnew;
 868     ulong epnew;
 869     int knew;
 870     ulong sum=0;
 871     if (k==0)
 872     {
 873         sum = searchPrefix(pattern, j, &sp, &ep);
 874         if (sp<=ep)
 875             result.push_back(std::make_pair(sp, ep));
 876         return sum;
 877     }
 878     vector<int> chars = alphabetrank->accessAll(sp, ep);
 879     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 880     {
 881         if (*it == 0)
 882             continue; // skip '\0'
 883         c = *it;
 884         spnew = C[c]+alphabetrank->rank(c,sp-1);
 885         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 886         if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
 887         if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
 888     }
 889     return sum;
 890 }
 891
 892 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
 893 ulong FMIndex::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
 894 {
 895     ulong sum=0;
 896     if (d[m]<=k) // range of suffixes with at most k-errors found
 897     {
 898         if (sp<=ep)
 899             result.push_back(std::make_pair(sp, ep));
 900         sum += (sp<=ep)?ep-sp+1:0ul;
 901     }
 902     if (sp>ep || j==0)
 903         return sum;
 904     ulong *dnew = new ulong[m+1];
 905     int c;
 906     ulong spnew;
 907     ulong p,lowerbound;
 908     ulong epnew;
 909     vector<int> chars = alphabetrank->accessAll(sp, ep);
 910     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 911     {
 912         if (*it == 0)
 913             continue; // skip '\0'
 914         c = *it;
 915         spnew = C[c]+alphabetrank->rank(c,sp-1);
 916         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 917         if (spnew>epnew) continue;
 918         dnew[0]=m+k-j+1;
 919         lowerbound=k+1;
 920         for (p=1; p<=m; p++) {
 921             dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
 922             if (dnew[p]<lowerbound)
 923                 lowerbound = dnew[p];
 924         }
 925         if (lowerbound<=k)
 926             sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
 927     }
 928     delete [] dnew;
 929     return sum;
 930 }
 931
 932
 933 ulong FMIndex::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 934 {
 935     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 936     int c = (int)pattern[m-1];
 937     TextPosition i=m-1;
 938     TextPosition sp = C[c];
 939     TextPosition ep = C[c+1]-1;
 940     while (sp<=ep && i>=1)
 941     {
 942 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 943         c = (int)pattern[--i];
 944         sp = C[c]+alphabetrank->rank(c,sp-1);
 945         ep = C[c]+alphabetrank->rank(c,ep)-1;
 946     }
 947     *spResult = sp;
 948     *epResult = ep;
 949     if (sp<=ep)
 950         return ep - sp + 1;
 951     else
 952         return 0;
 953 }
 954
 955 ulong FMIndex::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 956 {
 957     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 958     int c = (int)pattern[m-1];
 959     assert(c == 0); // Start from endmarkers
 960     TextPosition i=m-1;
 961     TextPosition sp = begin;
 962     TextPosition ep = end;
 963     while (sp<=ep && i>=1)
 964     {
 965 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 966         c = (int)pattern[--i];
 967         sp = C[c]+alphabetrank->rank(c,sp-1);
 968         ep = C[c]+alphabetrank->rank(c,ep)-1;
 969     }
 970     *spResult = sp;
 971     *epResult = ep;
 972     if (sp<=ep)
 973         return ep - sp + 1;
 974     else
 975         return 0;
 976 }
 977
 978
 979 ulong FMIndex::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 980 {
 981     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 982     uint c = (int)pattern[m-1];
 983     TextPosition i=m-1;
 984     TextPosition sp = 1;
 985     TextPosition ep = C[c+1]-1;
 986     while (sp<=ep && i>=1)
 987     {
 988 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 989         c = (int)pattern[--i];
 990         uint result = alphabetrank->rankLessThan(c,ep);
 991         if (result == ~0u)
 992             ep = 0;
 993         else
 994             ep = C[c]+result-1;
 995     }
 996     *spResult = sp;
 997     *epResult = ep;
 998     if (sp<=ep)
 999         return ep - sp + 1;
1000     else
1001         return 0;
1002 }
1003
1004
1005 FMIndex::~FMIndex() {
1006     delete alphabetrank;
1007     delete sampled;
1008     delete suffixes;
1009     delete suffixDocId;
1010     delete Doc;
1011     delete textStorage;
1012 }
1013
1014 void FMIndex::makewavelet(uchar *bwt)
1015 {
1016     ulong i, min = 0,
1017              max;
1018     for (i=0;i<256;i++)
1019         C[i]=0;
1020     for (i=0;i<n;++i)
1021         C[(int)bwt[i]]++;
1022     for (i=0;i<256;i++)
1023         if (C[i]>0) {min = i; break;}
1024     for (i=255;i>=min;--i)
1025         if (C[i]>0) {max = i; break;}
1026
1027     ulong prev=C[0], temp;
1028     C[0]=0;
1029     for (i=1;i<256;i++) {
1030         temp = C[i];
1031         C[i]=C[i-1]+prev;
1032         prev = temp;
1033     }
1034 //    this->codetable = node::makecodetable(bwt,n);
1035 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1036 //    delete [] bwt;
1037     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1038 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1039
1040 #ifdef DEBUG_MEMUSAGE
1041     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1042     HeapProfiler::ResetMaxHeapConsumption();
1043 #endif
1044
1045     alphabet_mapper * am = new alphabet_mapper_none();
1046     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate?
1047     wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape
1048     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1049     delete bmb;
1050     bwt = 0; // already deleted
1051
1052 #ifdef DEBUG_MEMUSAGE
1053     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1054     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1055 #endif
1056 }
1057
1058 void FMIndex::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText)
1059 {
1060     // Calculate BWT end-marker position (of last inserted text)
1061     {
1062         ulong i = 0;
1063         uint alphabetrank_i_tmp = 0;
1064         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1065         while (c != '\0')
1066         {
1067             i = C[c]+alphabetrank_i_tmp-1;
1068             c = alphabetrank->access(i, alphabetrank_i_tmp);
1069         }
1070
1071         this->bwtEndPos = i;
1072     }
1073
1074 #ifdef DEBUG_MEMUSAGE
1075     std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1076     HeapProfiler::ResetMaxHeapConsumption();
1077 #endif
1078
1079     // Build up array for text starting positions
1080 //    BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1081 //    (*textStartPos)[0] = 0;
1082
1083     // Mapping from end-markers to doc ID's:
1084     unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
1085 //    uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
1086     BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts);
1087
1088     BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
1089     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1090     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1091         sampledpositions[i] = 0;
1092
1093     ulong x,p=bwtEndPos;
1094     ulong sampleCount = 0;
1095     // Keeping track of text position of prev. end-marker seen
1096     ulong posOfSuccEndmarker = n-1;
1097     DocId textId = numberOfTexts;
1098     ulong ulongmax = 0;
1099     ulongmax--;
1100     uint alphabetrank_i_tmp =0;
1101
1102     // Text length = n + number of bytes not indexed.
1103     TextStorageBuilder tsbuilder(n + niText.length());
1104     ulong tsb_i = n + niText.length(); // Iterator from text length to 0.
1105     string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts
1106
1107     for (ulong i=n-1;i<ulongmax;i--) {
1108         // i substitutes SA->GetPos(i)
1109         x=(i==n-1)?0:i+1;
1110
1111         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1112
1113         tsbuilder[--tsb_i] = c; // Build TextStorage
1114
1115         if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
1116         {
1117             set_field(sampledpositions,1,p,1);
1118             (*positions)[sampleCount] = p;
1119             sampleCount ++;
1120         }
1121
1122         if (c == '\0')
1123         {
1124             unsigned prevTextId = textId; // Cache textId value.
1125             --textId;
1126             /**
1127              * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus,
1128              * we have to search for the first text that is actually *indexed*
1129              * to get correct prevTextId.
1130              */
1131             if (prevTextId == numberOfTexts)
1132             {
1133                 prevTextId = 0;
1134                 while (notIndexed.isSet(prevTextId))
1135                     ++ prevTextId;
1136                 // Now prevTextId points to the first indexed Doc ID.
1137             }
1138
1139             /**
1140              * Insert non-indexed texts
1141              */
1142             while (notIndexed.isSet(textId))
1143             {
1144                 do {
1145                     tsbuilder[tsb_i] = *nit_i;
1146                     -- tsb_i;
1147                     ++ nit_i;
1148                 } while (nit_i != niText.rend() && *nit_i != '\0');
1149
1150                 tsbuilder[tsb_i] = '\0';
1151
1152                 if (textId == 0)
1153                     break;
1154                 --textId;
1155             }
1156
1157             // Record the order of end-markers in BWT:
1158             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1159             //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
1160             (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts;
1161
1162             // Store text length and text start position:
1163             if (textId < (DocId)numberOfTexts - 1)
1164             {
1165 //                (*textStartPos)[textId + 1] = x;  // x-1 is text position of end-marker.
1166
1167                 posOfSuccEndmarker = i;
1168             }
1169
1170             // LF-mapping from '\0' does not work with this (pseudo) BWT.
1171             // Correct LF-mapping to the last char of the previous text:
1172             p = textId - notIndexed.rank(textId);
1173         }
1174         else // Now c != '\0', do LF-mapping:
1175             p = C[c]+alphabetrank_i_tmp-1;
1176     }
1177     while (textId > 0 && notIndexed.isSet(textId-1))
1178     {
1179         do {
1180             -- tsb_i;
1181             tsbuilder[tsb_i] = *nit_i;
1182             ++ nit_i;
1183         } while (nit_i != niText.rend() && *nit_i != '\0');
1184         --textId;
1185     }
1186     assert(textId == 0);
1187     assert(tsb_i == 0);
1188     assert(nit_i == niText.rend());
1189
1190 #ifdef DEBUG_MEMUSAGE
1191     std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1192     HeapProfiler::ResetMaxHeapConsumption();
1193 #endif
1194
1195     textStorage = tsbuilder.InitTextStorage(tsType);
1196
1197 #ifdef DEBUG_MEMUSAGE
1198     std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1199     HeapProfiler::ResetMaxHeapConsumption();
1200 #endif
1201
1202     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1203     delete [] sampledpositions;
1204     assert(sampleCount == sampleLength);
1205     assert(sampled->rank1(n-1) == sampleLength);
1206
1207 #ifdef DEBUG_MEMUSAGE
1208     std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1209     HeapProfiler::ResetMaxHeapConsumption();
1210 #endif
1211
1212     // Suffixes store an offset from the text start position
1213     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1214     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1215
1216     x = n + niText.length() - 2;
1217     textId = numberOfTexts - 1;
1218     posOfSuccEndmarker = x + 1;
1219     for(ulong i = 0; i < sampleLength; i ++) {
1220         // Find next sampled text position
1221         while ((posOfSuccEndmarker - x) % samplerate != 0
1222                || notIndexed.isSet(textId)) // Loop over non-indexed
1223         {
1224             --x;
1225             assert(x != ~0lu);
1226             if (textStorage->IsEndmarker(x))
1227             {
1228                 posOfSuccEndmarker = x--;
1229                 -- textId;
1230             }
1231         }
1232         assert((*positions)[i] < n);
1233         ulong j = sampled->rank1((*positions)[i]);
1234
1235         assert(j != 0); // if (j==0) j=sampleLength;
1236
1237         TextPosition textPos = (x==n-1)?0:x+1;
1238         (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos);
1239         assert(textStorage->DocIdAtTextPos(textPos) == textId);
1240
1241         assert((*suffixDocId)[j-1] < numberOfTexts);
1242         // calculate offset from text start:
1243         (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
1244         --x;
1245         if (x != ~0lu && textStorage->IsEndmarker(x))
1246         {
1247             posOfSuccEndmarker = x--;
1248             -- textId;
1249         }
1250     }
1251
1252     delete positions;
1253
1254 #ifdef DEBUG_MEMUSAGE
1255     std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1256     HeapProfiler::ResetMaxHeapConsumption();
1257 #endif
1258
1259 #ifdef DEBUG_MEMUSAGE
1260     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1261     HeapProfiler::ResetMaxHeapConsumption();
1262 #endif
1263
1264     /*alphabet_mapper * am = new alphabet_mapper_none();
1265     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
1266     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
1267     delete bmb;*/
1268     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1269
1270     Doc = new ArrayDoc(endmarkerDocId);
1271
1272 #ifdef DEBUG_MEMUSAGE
1273     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1274 #endif
1275 }
1276
1277
1278 /**
1279  * Finds document identifier for given text position
1280  *
1281  * Starting text position of the document is stored into second parameter.
1282  * Binary searching on text starting positions.
1283  */
1284 TextCollection::DocId FMIndex::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1285 {
1286     assert(i < n);
1287
1288     DocId a = 0;
1289     DocId b = numberOfTexts - 1;
1290     while (a < b)
1291     {
1292         DocId c = a + (b - a)/2;
1293         if ((*textStartPos)[c] > i)
1294             b = c - 1;
1295         else if ((*textStartPos)[c+1] > i)
1296             return c;
1297         else
1298             a = c + 1;
1299     }
1300
1301     assert(a < (DocId)numberOfTexts);
1302     assert(i >= (*textStartPos)[a]);
1303     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1304     return a;
1305 }
1306
1307
1308 } // namespace SXSI
1309