TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 //#define DEBUG_MEMUSAGE
  23 #ifdef DEBUG_MEMUSAGE
  24 #include "HeapProfiler.h" // FIXME remove
  25 #endif
  26
  27 #include <iostream>
  28 #include <map>
  29 #include <set>
  30 #include <vector>
  31 #include <utility>
  32 #include <stdexcept>
  33 #include <cassert>
  34 #include <cstring> // For strlen()
  35 using std::vector;
  36 using std::pair;
  37 using std::make_pair;
  38 using std::map;
  39
  40 namespace SXSI
  41 {
  42
  43 // Save file version info
  44 const uchar TCImplementation::versionFlag = 8;
  45
  46 /**
  47  * Constructor inits an empty dynamic FM-index.
  48  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  49  */
  50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_,
  51                      unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType)
  52     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  53       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  54 {
  55     makewavelet(bwt); // Deletes bwt!
  56     bwt = 0;
  57
  58     // Make sampling tables
  59     maketables(numberOfSamples_, tsType);
  60 }
  61
  62 bool TCImplementation::EmptyText(DocId k) const
  63 {
  64     assert(k < (DocId)numberOfTexts);
  65     return false; // Empty texts are not indexed
  66 }
  67
  68 uchar * TCImplementation::GetText(DocId k) const
  69 {
  70     assert(k < (DocId)numberOfTexts);
  71
  72     return textStorage->GetText(k);
  73 /*    TextPosition i = k;
  74
  75     string result;
  76     // Reserve average string length to avoid reallocs
  77     result.reserve(n/numberOfTexts);
  78
  79     uchar c = alphabetrank->access(i);
  80     while (c != '\0')
  81     {
  82         result.push_back(c);
  83         i = C[c]+alphabetrank->rank(c,i)-1;
  84
  85         c = alphabetrank->access(i); // "next" char.
  86     }
  87
  88     // Convert to uchar (FIXME return string?)
  89     i = result.size();
  90     uchar* res = new uchar[i+1];
  91     res[i] = '\0';
  92     for (ulong j = 0; j < i; ++j)
  93         res[i-j-1] = result[j];
  94         return res;*/
  95 }
  96 /*
  97  * Not supported
  98 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
  99 {
 100     assert(k < (DocId)numberOfTexts);
 101     assert(j < (*textLength)[k]);
 102     assert(i <= j);
 103
 104     ulong textRank = 0;
 105
 106     // Start position of k'th text
 107     ulong start = (*textStartPos)[k];
 108
 109     return Substring(i + start, j-i+1);
 110     }*/
 111
 112 /******************************************************************
 113  * Existential queries
 114  */
 115 bool TCImplementation::IsPrefix(uchar const * pattern) const
 116 {
 117     TextPosition m = strlen((char *)pattern);
 118     if (m == 0)
 119         return true;
 120
 121     TextPosition sp = 0, ep = 0;
 122     Search(pattern, m, &sp, &ep);
 123
 124     // Check for end-marker(s) in result interval
 125     if (CountEndmarkers(sp, ep))
 126         return true;
 127     return false;
 128 }
 129
 130 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 131 {
 132     TextPosition m = strlen((char *)pattern);
 133     if (m == 0)
 134         return true;
 135
 136     TextPosition sp = 0, ep = 0;
 137     Search(pattern, m, &sp, &ep);
 138
 139     // Check for end-marker(s) in result interval
 140     if (CountEndmarkers(sp, ep, begin, end))
 141         return true;
 142     return false;
 143 }
 144
 145
 146 bool TCImplementation::IsSuffix(uchar const *pattern) const
 147 {
 148     // Here counting is as fast as IsSuffix():
 149     if (CountSuffix(pattern) > 0)
 150         return true;
 151     return false;
 152 }
 153
 154 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 155 {
 156     // Here counting is as fast as IsSuffix():
 157     if (CountSuffix(pattern, begin, end) > 0)
 158         return true;
 159     return false;
 160 }
 161
 162 bool TCImplementation::IsEqual(uchar const *pattern) const
 163 {
 164     TextPosition m = std::strlen((char *)pattern);
 165     if (m == 0)
 166         return false; // No empty texts exists
 167
 168     TextPosition sp = 0, ep = 0;
 169     // Match including end-marker
 170     Search(pattern, m+1, &sp, &ep);
 171
 172     // Check for end-marker(s) in result interval
 173     if (CountEndmarkers(sp, ep))
 174         return true;
 175     return false;
 176 }
 177
 178 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 179 {
 180     TextPosition m = std::strlen((char *)pattern);
 181     if (m == 0)
 182         return false; // No empty texts exists
 183
 184     TextPosition sp = 0, ep = 0;
 185     // Match including end-marker
 186     Search(pattern, m+1, &sp, &ep, begin, end);
 187
 188     // Check for end-marker(s) in result interval
 189     if (CountEndmarkers(sp, ep))
 190         return true;
 191     return false;
 192 }
 193
 194 bool TCImplementation::IsContains(uchar const * pattern) const
 195 {
 196     TextPosition m = strlen((char *)pattern);
 197     if (m == 0)
 198         return true;
 199
 200     TextPosition sp = 0, ep = 0;
 201     // Just check if pattern exists somewhere
 202     ulong count = Search(pattern, m, &sp, &ep);
 203
 204     if (count > 0)
 205         return true;
 206     return false;
 207 }
 208
 209 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 210 {
 211     // Here counting is as fast as existential querying
 212     if (CountContains(pattern, begin, end) > 0)
 213         return true; // FIXME No need to filter result set
 214     return false;
 215 }
 216
 217 bool TCImplementation::IsLessThan(uchar const * pattern) const
 218 {
 219     if (CountLessThan(pattern) > 0)
 220         return true;
 221     return false;
 222 }
 223
 224 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 225 {
 226     if (CountLessThan(pattern, begin, end) > 0)
 227         return true;
 228     return false;
 229 }
 230
 231 /******************************************************************
 232  * Counting queries
 233  */
 234 ulong TCImplementation::Count(uchar const * pattern) const
 235 {
 236     TextPosition m = strlen((char *)pattern);
 237     if (m == 0)
 238         return 0;
 239
 240     TextPosition sp = 0, ep = 0;
 241     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 242     return count;
 243 }
 244
 245 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 246 {
 247     TextPosition m = strlen((char *)pattern);
 248     if (m == 0)
 249         return numberOfTexts;
 250
 251     TextPosition sp = 0, ep = 0;
 252     Search(pattern, m, &sp, &ep);
 253
 254     // Count end-markers in result interval
 255     return CountEndmarkers(sp, ep);
 256 }
 257
 258 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 259 {
 260     TextPosition m = strlen((char *)pattern);
 261     if (m == 0)
 262         return numberOfTexts;
 263
 264     TextPosition sp = 0, ep = 0;
 265     Search(pattern, m, &sp, &ep);
 266
 267     // Count end-markers in result interval
 268     return CountEndmarkers(sp, ep, begin, end);
 269 }
 270
 271 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 272 {
 273     TextPosition m = strlen((char *)pattern);
 274     if (m == 0)
 275         return numberOfTexts;
 276
 277     TextPosition sp = 0, ep = 0;
 278     // Search with end-marker
 279     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 280
 281     return count;
 282 }
 283
 284 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 285 {
 286     TextPosition m = strlen((char *)pattern);
 287     if (m == 0)
 288         return numberOfTexts;
 289
 290     TextPosition sp = 0, ep = 0;
 291     // Search with end-marker
 292     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 293
 294     return count;
 295 }
 296
 297 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 298 {
 299     TextPosition m = strlen((char const *)pattern);
 300     if (m == 0)
 301         return 0; // No empty texts.
 302
 303     TextPosition sp = 0, ep = 0;
 304     // Match including end-marker
 305     Search(pattern, m+1, &sp, &ep);
 306
 307     // Count end-markers in result interval
 308     return CountEndmarkers(sp, ep);
 309 }
 310
 311 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 312 {
 313     TextPosition m = strlen((char const *)pattern);
 314     if (m == 0)
 315         return 0; // No empty texts.
 316
 317     TextPosition sp = 0, ep = 0;
 318     // Match including end-marker
 319     Search(pattern, m+1, &sp, &ep, begin, end);
 320
 321     // Count end-markers in result interval
 322     return CountEndmarkers(sp, ep); // Already within [begin, end]
 323 }
 324
 325 unsigned TCImplementation::CountContains(uchar const * pattern) const
 326 {
 327     TextPosition m = strlen((char const *)pattern);
 328     if (m == 0)
 329         return numberOfTexts; // Total number of texts.
 330
 331     // Here counting is as slow as fetching the result set
 332     // because we have to filter out occ's that fall within same document.
 333     TextCollection::document_result result = Contains(pattern);
 334     return result.size();
 335 }
 336
 337 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 338 {
 339     TextPosition m = strlen((char const *)pattern);
 340     if (m == 0)
 341         return numberOfTexts; // Total number of texts.
 342
 343     // Here counting is as slow as fetching the result set
 344     // because we have to filter out occ's that fall within same document.
 345     TextCollection::document_result result = Contains(pattern, begin, end);
 346     return result.size();
 347 }
 348
 349 // Less than or equal
 350 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 351 {
 352     TextPosition m = strlen((char const *)pattern);
 353     if (m == 0)
 354         return 0; // No empty texts.
 355
 356     TextPosition sp = 0, ep = 0;
 357     SearchLessThan(pattern, m, &sp, &ep);
 358
 359     // Count end-markers in result interval
 360     return CountEndmarkers(sp, ep);
 361 }
 362
 363 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 364 {
 365     TextPosition m = strlen((char const *)pattern);
 366     if (m == 0)
 367         return 0; // No empty texts.
 368
 369     TextPosition sp = 0, ep = 0;
 370     SearchLessThan(pattern, m, &sp, &ep);
 371
 372     // Count end-markers in result interval
 373     return CountEndmarkers(sp, ep, begin, end);
 374 }
 375
 376 /**
 377  * Document reporting queries
 378  */
 379 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 380 {
 381     TextPosition m = strlen((char *)pattern);
 382     if (m == 0)
 383         return TextCollection::document_result(); // FIXME Should return all 1...k
 384
 385     TextPosition sp = 0, ep = 0;
 386     Search(pattern, m, &sp, &ep);
 387
 388     // Iterate through end-markers in [sp,ep]:
 389     return EnumerateEndmarkers(sp, ep);
 390 }
 391
 392 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 393 {
 394     TextPosition m = strlen((char *)pattern);
 395     if (m == 0)
 396         return TextCollection::document_result(); // FIXME Should return all 1...k
 397
 398     TextPosition sp = 0, ep = 0;
 399     Search(pattern, m, &sp, &ep);
 400
 401     // Return end-markers in [sp,ep] and [begin, end]:
 402     return EnumerateEndmarkers(sp, ep, begin, end);
 403 }
 404
 405 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 406 {
 407     TextPosition m = strlen((char *)pattern);
 408     if (m == 0)
 409         return TextCollection::document_result(); // FIXME Should return all 1...k
 410
 411     TextPosition sp = 0, ep = 0;
 412     // Search with end-marker
 413     Search(pattern, m+1, &sp, &ep);
 414
 415     TextCollection::document_result result;
 416     result.reserve(ep-sp+1); // Try to avoid reallocation.
 417
 418     // Check each occurrence
 419     for (; sp <= ep; ++sp)
 420     {
 421         TextPosition i = sp;
 422
 423         uchar c = alphabetrank->access(i);
 424         while (c != '\0' && !sampled->access(i))
 425         {
 426             i = C[c]+alphabetrank->rank(c,i)-1;
 427             c = alphabetrank->access(i);
 428         }
 429         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 430
 431         if (c == '\0')
 432         {
 433             // Rank among the end-markers in BWT
 434             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 435             result.push_back(Doc->access(endmarkerRank));
 436         }
 437         else // Sampled position
 438         {
 439             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 440             result.push_back(docId);
 441         }
 442     }
 443
 444     return result;
 445 }
 446
 447 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 448 {
 449     TextPosition m = strlen((char *)pattern);
 450     if (m == 0)
 451         return TextCollection::document_result(); // FIXME Should return all 1...k
 452
 453     TextPosition sp = 0, ep = 0;
 454     // Search with end-marker
 455     Search(pattern, m+1, &sp, &ep, begin, end);
 456
 457     TextCollection::document_result result;
 458     result.reserve(ep-sp+1); // Try to avoid reallocation.
 459
 460     // Check each occurrence, already within [begin, end]
 461     for (; sp <= ep; ++sp)
 462     {
 463         TextPosition i = sp;
 464
 465         uchar c = alphabetrank->access(i);
 466         while (c != '\0' && !sampled->access(i))
 467         {
 468             i = C[c]+alphabetrank->rank(c,i)-1;
 469             c = alphabetrank->access(i);
 470         }
 471         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 472
 473         if (c == '\0')
 474         {
 475             // Rank among the end-markers in BWT
 476             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 477             result.push_back(Doc->access(endmarkerRank));
 478         }
 479         else // Sampled position
 480         {
 481             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 482             result.push_back(docId);
 483         }
 484     }
 485
 486     return result;
 487 }
 488
 489
 490 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 491 {
 492     TextPosition m = strlen((char const *)pattern);
 493     if (m == 0)
 494         return TextCollection::document_result(); // FIXME Should return all empty texts
 495
 496     TextPosition sp = 0, ep = 0;
 497     // Match including end-marker
 498     Search(pattern, m+1, &sp, &ep);
 499
 500     // Report end-markers in result interval
 501     return EnumerateEndmarkers(sp, ep);
 502 }
 503
 504 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 505 {
 506     TextPosition m = strlen((char const *)pattern);
 507     if (m == 0)
 508         return TextCollection::document_result(); // FIXME Should return all empty texts
 509
 510     TextPosition sp = 0, ep = 0;
 511     // Match including end-marker
 512     Search(pattern, m+1, &sp, &ep, begin, end);
 513
 514     // Report end-markers in result interval
 515     return EnumerateEndmarkers(sp, ep, begin, end);
 516 }
 517
 518
 519 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 520 {
 521     TextPosition m = strlen((char *)pattern);
 522     if (m == 0)
 523         return TextCollection::document_result();
 524
 525     TextPosition sp = 0, ep = 0;
 526     // Search all occurrences
 527     Search(pattern, m, &sp, &ep);
 528
 529     // We want unique document indentifiers, using std::set to collect them
 530     std::set<DocId> resultSet;
 531     EnumerateDocuments(resultSet, sp, ep);
 532
 533     // Convert std::set to std::vector
 534     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 535     return result;
 536 }
 537
 538 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 539 {
 540     TextPosition m = strlen((char *)pattern);
 541     if (m == 0)
 542         return TextCollection::document_result();
 543
 544     TextPosition sp = 0, ep = 0;
 545     // Search all occurrences
 546     Search(pattern, m, &sp, &ep);
 547
 548     // We want unique document indentifiers, using std::set to collect them
 549     std::set<DocId> resultSet;
 550     EnumerateDocuments(resultSet, sp, ep, begin, end);
 551
 552     // Convert std::set to std::vector
 553     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 554     return result;
 555 }
 556
 557
 558 /**
 559  ***
 560 * * FIXME Lessthan or equal
 561  */
 562 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 563 {
 564     TextPosition m = strlen((char *)pattern);
 565     if (m == 0)
 566         return TextCollection::document_result(); // empty result set
 567
 568     TextPosition sp = 0, ep = 0;
 569     SearchLessThan(pattern, m, &sp, &ep);
 570
 571     // Report end-markers in result interval
 572     return EnumerateEndmarkers(sp, ep);
 573 }
 574
 575 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 576 {
 577     TextPosition m = strlen((char *)pattern);
 578     if (m == 0)
 579         return TextCollection::document_result(); // empty result set
 580
 581     TextPosition sp = 0, ep = 0;
 582     SearchLessThan(pattern, m, &sp, &ep);
 583
 584     // Iterate through end-markers in [sp,ep] and [begin, end]:
 585     return EnumerateEndmarkers(sp, ep, begin, end);
 586 }
 587
 588
 589 TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const
 590 {
 591     TextPosition m = strlen((char *)pattern);
 592     if (m == 0)
 593         return TextCollection::document_result(); // empty result set
 594
 595     suffix_range_vector ranges;
 596     kmismatches(ranges, pattern, 0, n-1, m, k);
 597     std::set<DocId> resultSet;
 598
 599     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 600         // Iterate through docs in [sp,ep]:
 601         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 602
 603     // Convert std::set to std::vector
 604     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 605     return result;
 606 }
 607
 608 TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const
 609 {
 610     TextPosition m = strlen((char *)pattern);
 611     if (m == 0)
 612         return TextCollection::document_result(); // empty result set
 613
 614     suffix_range_vector ranges;
 615     ulong *dd = new ulong[m+1];
 616     for (ulong i=0;i<m+1;i++)
 617         dd[i]=i;
 618     kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 619     delete [] dd;
 620
 621     std::set<DocId> resultSet;
 622     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 623         // Iterate through docs in [sp,ep]:
 624         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 625
 626     // Convert std::set to std::vector
 627     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 628     return result;
 629 }
 630
 631
 632 /**
 633  * Full result set queries
 634  */
 635 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 636 {
 637     TextPosition m = strlen((char *)pattern);
 638     if (m == 0)
 639         return full_result(); // FIXME Throw exception?
 640
 641     TextPosition sp = 0, ep = 0;
 642     // Search all occurrences
 643     Search(pattern, m, &sp, &ep);
 644
 645     full_result result;
 646     result.reserve(ep-sp+1); // Try to avoid reallocation.
 647     EnumeratePositions(result, sp, ep);
 648
 649     return result;
 650 }
 651
 652 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 653 {
 654     TextPosition m = strlen((char *)pattern);
 655     if (m == 0)
 656         return full_result(); // FIXME Throw exception?
 657
 658     TextPosition sp = 0, ep = 0;
 659     // Search all occurrences
 660     Search(pattern, m, &sp, &ep);
 661
 662     full_result result;
 663     result.reserve(ep-sp+1); // Try to avoid reallocation.
 664     EnumeratePositions(result, sp, ep, begin, end);
 665
 666     return result;
 667 }
 668
 669 TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const
 670 {
 671     TextPosition m = strlen((char *)pattern);
 672     if (m == 0)
 673         return TextCollection::full_result(); // empty result set
 674
 675     suffix_range_vector ranges;
 676     ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
 677
 678     TextCollection::full_result result;
 679     result.reserve(count); // avoid reallocation.
 680     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 681         // Iterate through docs in [sp,ep]:
 682         EnumeratePositions(result, (*it).first, (*it).second);
 683     return result;
 684 }
 685
 686 TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const
 687 {
 688     TextPosition m = strlen((char *)pattern);
 689     if (m == 0)
 690         return TextCollection::full_result(); // empty result set
 691
 692     suffix_range_vector ranges;
 693     ulong *dd = new ulong[m+1];
 694     for (unsigned i=0;i<m+1;i++)
 695         dd[i]=i;
 696     ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 697     delete [] dd;
 698
 699     TextCollection::full_result result;
 700     result.reserve(count); // avoid reallocation.
 701     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 702         // Iterate through docs in [sp,ep]:
 703         EnumeratePositions(result, (*it).first, (*it).second);
 704     return result;
 705 }
 706
 707
 708 /**
 709  * Save index to a file handle
 710  *
 711  * Throws a std::runtime_error exception on i/o error.
 712  * First byte that is saved represents the version number of the save file.
 713  * In version 2 files, the data fields are:
 714  *  uchar versionFlag;
 715     TextPosition n;
 716     unsigned samplerate;
 717     unsigned C[256];
 718     TextPosition bwtEndPos;
 719     static_sequence * alphabetrank;
 720     BSGAP *sampled;
 721     BlockArray *suffixes;
 722     BlockArray *suffixDocId;
 723     unsigned numberOfTexts;
 724     ulong maxTextLength;
 725     static_sequence *docId;
 726  */
 727 void TCImplementation::Save(FILE *file) const
 728 {
 729     // Saving version info:
 730     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 731         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 732
 733     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 734         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 735     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 736         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 737
 738     for(ulong i = 0; i < 256; ++i)
 739         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 740             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 741
 742     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 743         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 744
 745     alphabetrank->save(file);
 746     sampled->save(file);
 747     suffixes->Save(file);
 748     suffixDocId->Save(file);
 749
 750     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 751         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 752     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 753         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
 754
 755     Doc->save(file);
 756     textStorage->Save(file);
 757     fflush(file);
 758 }
 759
 760
 761 /**
 762  * Load index from a file handle
 763  *
 764  * Throws a std::runtime_error exception on i/o error.
 765  * For more info, see TCImplementation::Save().
 766  */
 767 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
 768     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 769       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 770 {
 771 //    Tools::StartTimer();
 772 //    std::cout << std::endl << "Loading..."<< std::endl;
 773
 774     uchar verFlag = 0;
 775     if (std::fread(&verFlag, 1, 1, file) != 1)
 776         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
 777     if (verFlag != TCImplementation::versionFlag)
 778         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
 779
 780     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 781         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
 782     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 783         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
 784 // FIXME samplerate can not be changed during load.
 785 //    if (this->samplerate == 0)
 786 //        this->samplerate = samplerate;
 787
 788     for(ulong i = 0; i < 256; ++i)
 789         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 790             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
 791
 792     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 793         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
 794
 795 //    std::cout << "Loading alphabet rank (" << Tools::GetTime() << " s)." << std::endl;
 796     alphabetrank = static_sequence::load(file);
 797 //    std::cout << "Loading samples (" << Tools::GetTime() << " s)." << std::endl;
 798     sampled = static_bitsequence::load(file);
 799     suffixes = new BlockArray(file);
 800     suffixDocId = new BlockArray(file);
 801
 802     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 803         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
 804     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 805         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
 806
 807 //    std::cout << "Loading Doc (" << Tools::GetTime() << " s)." << std::endl;
 808     Doc = new ArrayDoc(file); //static_sequence::load(file);
 809 //    std::cout << "Loading text storage (" << Tools::GetTime() << " s)." << std::endl;
 810     textStorage = TextStorage::Load(file);
 811
 812 //    std::cout << "Loading done(" << Tools::GetTime() << " s)." << std::endl;
 813
 814     // FIXME Construct data structures with new samplerate
 815     //maketables();
 816 }
 817
 818
 819
 820 /**
 821  * Rest of the functions follow...
 822  */
 823 ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
 824 {
 825     int c;
 826     while (*sp<=*ep && i>=1)
 827     {
 828         c = (int)pattern[--i];
 829         *sp = C[c]+alphabetrank->rank(c,*sp-1);
 830         *ep = C[c]+alphabetrank->rank(c,*ep)-1;
 831     }
 832     if (*sp<=*ep)
 833         return *ep - *sp + 1;
 834     else
 835         return 0;
 836 }
 837
 838
 839 ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
 840 {
 841     if (sp>ep) return 0;
 842     if (j == 0)
 843     {
 844         result.push_back(std::make_pair(sp,ep));
 845         return ep-sp+1;
 846     }
 847     int c;
 848     ulong spnew;
 849     ulong epnew;
 850     int knew;
 851     ulong sum=0;
 852     if (k==0)
 853     {
 854         sum = searchPrefix(pattern, j, &sp, &ep);
 855         if (sp<=ep)
 856             result.push_back(std::make_pair(sp, ep));
 857         return sum;
 858     }
 859     vector<int> chars = alphabetrank->accessAll(sp, ep);
 860     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 861     {
 862         if (*it == 0)
 863             continue; // skip '\0'
 864         c = *it;
 865         spnew = C[c]+alphabetrank->rank(c,sp-1);
 866         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 867         if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
 868         if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
 869     }
 870     return sum;
 871 }
 872
 873 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
 874 ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
 875 {
 876     ulong sum=0;
 877     if (d[m]<=k) // range of suffixes with at most k-errors found
 878     {
 879         if (sp<=ep)
 880             result.push_back(std::make_pair(sp, ep));
 881         sum += (sp<=ep)?ep-sp+1:0ul;
 882     }
 883     if (sp>ep || j==0)
 884         return sum;
 885     ulong *dnew = new ulong[m+1];
 886     int c;
 887     ulong spnew;
 888     ulong p,lowerbound;
 889     ulong epnew;
 890     vector<int> chars = alphabetrank->accessAll(sp, ep);
 891     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 892     {
 893         if (*it == 0)
 894             continue; // skip '\0'
 895         c = *it;
 896         spnew = C[c]+alphabetrank->rank(c,sp-1);
 897         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 898         if (spnew>epnew) continue;
 899         dnew[0]=m+k-j+1;
 900         lowerbound=k+1;
 901         for (p=1; p<=m; p++) {
 902             dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
 903             if (dnew[p]<lowerbound)
 904                 lowerbound = dnew[p];
 905         }
 906         if (lowerbound<=k)
 907             sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
 908     }
 909     delete [] dnew;
 910     return sum;
 911 }
 912
 913
 914 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 915 {
 916     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 917     int c = (int)pattern[m-1];
 918     TextPosition i=m-1;
 919     TextPosition sp = C[c];
 920     TextPosition ep = C[c+1]-1;
 921     while (sp<=ep && i>=1)
 922     {
 923 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 924         c = (int)pattern[--i];
 925         sp = C[c]+alphabetrank->rank(c,sp-1);
 926         ep = C[c]+alphabetrank->rank(c,ep)-1;
 927     }
 928     *spResult = sp;
 929     *epResult = ep;
 930     if (sp<=ep)
 931         return ep - sp + 1;
 932     else
 933         return 0;
 934 }
 935
 936 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 937 {
 938     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 939     int c = (int)pattern[m-1];
 940     assert(c == 0); // Start from endmarkers
 941     TextPosition i=m-1;
 942     TextPosition sp = begin;
 943     TextPosition ep = end;
 944     while (sp<=ep && i>=1)
 945     {
 946 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 947         c = (int)pattern[--i];
 948         sp = C[c]+alphabetrank->rank(c,sp-1);
 949         ep = C[c]+alphabetrank->rank(c,ep)-1;
 950     }
 951     *spResult = sp;
 952     *epResult = ep;
 953     if (sp<=ep)
 954         return ep - sp + 1;
 955     else
 956         return 0;
 957 }
 958
 959
 960 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 961 {
 962     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 963     uint c = (int)pattern[m-1];
 964     TextPosition i=m-1;
 965     TextPosition sp = 1;
 966     TextPosition ep = C[c+1]-1;
 967     while (sp<=ep && i>=1)
 968     {
 969 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 970         c = (int)pattern[--i];
 971         uint result = alphabetrank->rankLessThan(c,ep);
 972         if (result == ~0u)
 973             ep = 0;
 974         else
 975             ep = C[c]+result-1;
 976     }
 977     *spResult = sp;
 978     *epResult = ep;
 979     if (sp<=ep)
 980         return ep - sp + 1;
 981     else
 982         return 0;
 983 }
 984
 985
 986 TCImplementation::~TCImplementation() {
 987     delete alphabetrank;
 988     delete sampled;
 989     delete suffixes;
 990     delete suffixDocId;
 991     delete Doc;
 992     delete textStorage;
 993 }
 994
 995 void TCImplementation::makewavelet(uchar *bwt)
 996 {
 997     ulong i, min = 0,
 998              max;
 999     for (i=0;i<256;i++)
1000         C[i]=0;
1001     for (i=0;i<n;++i)
1002         C[(int)bwt[i]]++;
1003     for (i=0;i<256;i++)
1004         if (C[i]>0) {min = i; break;}
1005     for (i=255;i>=min;--i)
1006         if (C[i]>0) {max = i; break;}
1007
1008     ulong prev=C[0], temp;
1009     C[0]=0;
1010     for (i=1;i<256;i++) {
1011         temp = C[i];
1012         C[i]=C[i-1]+prev;
1013         prev = temp;
1014     }
1015 //    this->codetable = node::makecodetable(bwt,n);
1016 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1017 //    delete [] bwt;
1018     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1019 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1020
1021 #ifdef DEBUG_MEMUSAGE
1022     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1023     HeapProfiler::ResetMaxHeapConsumption();
1024 #endif
1025
1026     alphabet_mapper * am = new alphabet_mapper_none();
1027     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate?
1028     wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape
1029     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1030     delete bmb;
1031     bwt = 0; // already deleted
1032
1033 #ifdef DEBUG_MEMUSAGE
1034     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1035     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1036 #endif
1037 }
1038
1039 void TCImplementation::maketables(ulong sampleLength, char tsType)
1040 {
1041     // Calculate BWT end-marker position (of last inserted text)
1042     {
1043         ulong i = 0;
1044         uint alphabetrank_i_tmp = 0;
1045         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1046         while (c != '\0')
1047         {
1048             i = C[c]+alphabetrank_i_tmp-1;
1049             c = alphabetrank->access(i, alphabetrank_i_tmp);
1050         }
1051
1052         this->bwtEndPos = i;
1053     }
1054
1055 #ifdef DEBUG_MEMUSAGE
1056     std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1057     HeapProfiler::ResetMaxHeapConsumption();
1058 #endif
1059
1060     // Build up array for text starting positions
1061 //    BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1062 //    (*textStartPos)[0] = 0;
1063
1064     // Mapping from end-markers to doc ID's:
1065     unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
1066 //    uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
1067     BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts);
1068
1069     BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
1070     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1071     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1072         sampledpositions[i] = 0;
1073
1074     ulong x,p=bwtEndPos;
1075     ulong sampleCount = 0;
1076     // Keeping track of text position of prev. end-marker seen
1077     ulong posOfSuccEndmarker = n-1;
1078     DocId textId = numberOfTexts;
1079     ulong ulongmax = 0;
1080     ulongmax--;
1081     uint alphabetrank_i_tmp =0;
1082
1083     TextStorageBuilder tsbuilder(n);
1084     Tools::StartTimer();
1085
1086     for (ulong i=n-1;i<ulongmax;i--) {
1087         // i substitutes SA->GetPos(i)
1088         x=(i==n-1)?0:i+1;
1089
1090         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1091         tsbuilder[i] = c;
1092
1093         if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
1094         {
1095             set_field(sampledpositions,1,p,1);
1096             (*positions)[sampleCount] = p;
1097             sampleCount ++;
1098         }
1099
1100         if (c == '\0')
1101         {
1102             --textId;
1103
1104             // Record the order of end-markers in BWT:
1105             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1106             //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
1107             (*endmarkerDocId)[endmarkerRank] = (textId + 1) % numberOfTexts;
1108
1109             // Store text length and text start position:
1110             if (textId < (DocId)numberOfTexts - 1)
1111             {
1112 //                (*textStartPos)[textId + 1] = x;  // x-1 is text position of end-marker.
1113                 posOfSuccEndmarker = i;
1114             }
1115
1116             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1117             p = textId; // Correct LF-mapping to the last char of the previous text.
1118         }
1119         else // Now c != '\0', do LF-mapping:
1120             p = C[c]+alphabetrank_i_tmp-1;
1121     }
1122     assert(textId == 0);
1123
1124 #ifdef DEBUG_MEMUSAGE
1125     std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1126     HeapProfiler::ResetMaxHeapConsumption();
1127 #endif
1128
1129     textStorage = tsbuilder.InitTextStorage(tsType);
1130
1131 #ifdef DEBUG_MEMUSAGE
1132     std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1133     HeapProfiler::ResetMaxHeapConsumption();
1134 #endif
1135
1136     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1137     delete [] sampledpositions;
1138     assert(sampleCount == sampleLength);
1139     assert(sampled->rank1(n-1) == sampleLength);
1140
1141 #ifdef DEBUG_MEMUSAGE
1142     std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1143     HeapProfiler::ResetMaxHeapConsumption();
1144 #endif
1145
1146     // Suffixes store an offset from the text start position
1147     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1148     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1149
1150     x = n - 2;
1151     posOfSuccEndmarker = n-1;
1152     for(ulong i=0; i<sampleLength; i++) {
1153         // Find next sampled text position
1154         while ((posOfSuccEndmarker - x) % samplerate != 0)
1155         {
1156             --x;
1157             assert(x != ~0lu);
1158             if (textStorage->IsEndmarker(x))
1159                 posOfSuccEndmarker = x--;
1160         }
1161         assert((*positions)[i] < n);
1162         ulong j = sampled->rank1((*positions)[i]);
1163
1164         assert(j != 0); // if (j==0) j=sampleLength;
1165
1166         TextPosition textPos = (x==n-1)?0:x+1;
1167         (*suffixDocId)[j-1] = textStorage->DocIdAtTextPos(textPos);
1168
1169         assert((*suffixDocId)[j-1] < numberOfTexts);
1170         // calculate offset from text start:
1171         (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
1172         --x;
1173         if (x != ~0lu && textStorage->IsEndmarker(x))
1174             posOfSuccEndmarker = x--;
1175     }
1176
1177     delete positions;
1178
1179 #ifdef DEBUG_MEMUSAGE
1180     std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1181     HeapProfiler::ResetMaxHeapConsumption();
1182 #endif
1183
1184     /**
1185      * Second pass: check tables
1186      */
1187 /*    p=bwtEndPos;
1188     textId = numberOfTexts;
1189     for (ulong i=n-1;i<ulongmax;i--) {
1190         x=(i==n-1)?0:i+1;
1191
1192         if (sampled->access(p)) {
1193             ulong j = sampled->rank1(p)-1;
1194             assert((*suffixDocId)[j] == DocIdAtTextPos(textStartPos, x));
1195
1196             // calculate offset from text start:
1197             assert((*suffixes)[j] == x - (*textStartPos)[(*suffixDocId)[j]]);
1198         }
1199
1200         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1201
1202         if (c == '\0')
1203         {
1204             --textId;
1205             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1206             p = textId; // Correct LF-mapping to the last char of the previous text.
1207         }
1208         else // Now c != '\0', do LF-mapping:
1209             p = C[c]+alphabetrank_i_tmp-1;
1210     }
1211     assert(textId == 0);
1212     delete textStartPos
1213 */
1214
1215 #ifdef DEBUG_MEMUSAGE
1216     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1217     HeapProfiler::ResetMaxHeapConsumption();
1218 #endif
1219
1220     /*alphabet_mapper * am = new alphabet_mapper_none();
1221     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
1222     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
1223     delete bmb;*/
1224     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1225
1226     Doc = new ArrayDoc(endmarkerDocId);
1227
1228 #ifdef DEBUG_MEMUSAGE
1229     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1230 #endif
1231 }
1232
1233
1234 /**
1235  * Finds document identifier for given text position
1236  *
1237  * Starting text position of the document is stored into second parameter.
1238  * Binary searching on text starting positions.
1239  */
1240 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1241 {
1242     assert(i < n);
1243
1244     DocId a = 0;
1245     DocId b = numberOfTexts - 1;
1246     while (a < b)
1247     {
1248         DocId c = a + (b - a)/2;
1249         if ((*textStartPos)[c] > i)
1250             b = c - 1;
1251         else if ((*textStartPos)[c+1] > i)
1252             return c;
1253         else
1254             a = c + 1;
1255     }
1256
1257     assert(a < (DocId)numberOfTexts);
1258     assert(i >= (*textStartPos)[a]);
1259     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1260     return a;
1261 }
1262
1263
1264 } // namespace SXSI
1265