TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 //#define DEBUG_MEMUSAGE
  23 #ifdef DEBUG_MEMUSAGE
  24 #include "HeapProfiler.h" // FIXME remove
  25 #endif
  26
  27 #include <iostream>
  28 #include <map>
  29 #include <set>
  30 #include <vector>
  31 #include <utility>
  32 #include <stdexcept>
  33 #include <cassert>
  34 #include <cstring> // For strlen()
  35 using std::vector;
  36 using std::pair;
  37 using std::make_pair;
  38 using std::map;
  39
  40 namespace SXSI
  41 {
  42
  43 // Save file version info
  44 const uchar TCImplementation::versionFlag = 6;
  45
  46 /**
  47  * Constructor inits an empty dynamic FM-index.
  48  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  49  */
  50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_,
  51                      unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType)
  52     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  53       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  54 {
  55     makewavelet(bwt); // Deletes bwt!
  56     bwt = 0;
  57
  58     // Make sampling tables
  59     maketables(numberOfSamples_, tsType);
  60 }
  61
  62 bool TCImplementation::EmptyText(DocId k) const
  63 {
  64     assert(k < (DocId)numberOfTexts);
  65     return false; // Empty texts are not indexed
  66 }
  67
  68 uchar * TCImplementation::GetText(DocId k) const
  69 {
  70     assert(k < (DocId)numberOfTexts);
  71
  72     return textStorage->GetText(k);
  73 /*    TextPosition i = k;
  74
  75     string result;
  76     // Reserve average string length to avoid reallocs
  77     result.reserve(n/numberOfTexts);
  78
  79     uchar c = alphabetrank->access(i);
  80     while (c != '\0')
  81     {
  82         result.push_back(c);
  83         i = C[c]+alphabetrank->rank(c,i)-1;
  84
  85         c = alphabetrank->access(i); // "next" char.
  86     }
  87
  88     // Convert to uchar (FIXME return string?)
  89     i = result.size();
  90     uchar* res = new uchar[i+1];
  91     res[i] = '\0';
  92     for (ulong j = 0; j < i; ++j)
  93         res[i-j-1] = result[j];
  94         return res;*/
  95 }
  96 /*
  97  * Not supported
  98 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
  99 {
 100     assert(k < (DocId)numberOfTexts);
 101     assert(j < (*textLength)[k]);
 102     assert(i <= j);
 103
 104     ulong textRank = 0;
 105
 106     // Start position of k'th text
 107     ulong start = (*textStartPos)[k];
 108
 109     return Substring(i + start, j-i+1);
 110     }*/
 111
 112 /******************************************************************
 113  * Existential queries
 114  */
 115 bool TCImplementation::IsPrefix(uchar const * pattern) const
 116 {
 117     TextPosition m = strlen((char *)pattern);
 118     if (m == 0)
 119         return true;
 120
 121     TextPosition sp = 0, ep = 0;
 122     Search(pattern, m, &sp, &ep);
 123
 124     // Check for end-marker(s) in result interval
 125     if (CountEndmarkers(sp, ep))
 126         return true;
 127     return false;
 128 }
 129
 130 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 131 {
 132     TextPosition m = strlen((char *)pattern);
 133     if (m == 0)
 134         return true;
 135
 136     TextPosition sp = 0, ep = 0;
 137     Search(pattern, m, &sp, &ep);
 138
 139     // Check for end-marker(s) in result interval
 140     if (CountEndmarkers(sp, ep, begin, end))
 141         return true;
 142     return false;
 143 }
 144
 145
 146 bool TCImplementation::IsSuffix(uchar const *pattern) const
 147 {
 148     // Here counting is as fast as IsSuffix():
 149     if (CountSuffix(pattern) > 0)
 150         return true;
 151     return false;
 152 }
 153
 154 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 155 {
 156     // Here counting is as fast as IsSuffix():
 157     if (CountSuffix(pattern, begin, end) > 0)
 158         return true;
 159     return false;
 160 }
 161
 162 bool TCImplementation::IsEqual(uchar const *pattern) const
 163 {
 164     TextPosition m = std::strlen((char *)pattern);
 165     if (m == 0)
 166         return false; // No empty texts exists
 167
 168     TextPosition sp = 0, ep = 0;
 169     // Match including end-marker
 170     Search(pattern, m+1, &sp, &ep);
 171
 172     // Check for end-marker(s) in result interval
 173     if (CountEndmarkers(sp, ep))
 174         return true;
 175     return false;
 176 }
 177
 178 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 179 {
 180     TextPosition m = std::strlen((char *)pattern);
 181     if (m == 0)
 182         return false; // No empty texts exists
 183
 184     TextPosition sp = 0, ep = 0;
 185     // Match including end-marker
 186     Search(pattern, m+1, &sp, &ep, begin, end);
 187
 188     // Check for end-marker(s) in result interval
 189     if (CountEndmarkers(sp, ep))
 190         return true;
 191     return false;
 192 }
 193
 194 bool TCImplementation::IsContains(uchar const * pattern) const
 195 {
 196     TextPosition m = strlen((char *)pattern);
 197     if (m == 0)
 198         return true;
 199
 200     TextPosition sp = 0, ep = 0;
 201     // Just check if pattern exists somewhere
 202     ulong count = Search(pattern, m, &sp, &ep);
 203
 204     if (count > 0)
 205         return true;
 206     return false;
 207 }
 208
 209 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 210 {
 211     // Here counting is as fast as existential querying
 212     if (CountContains(pattern, begin, end) > 0)
 213         return true; // FIXME No need to filter result set
 214     return false;
 215 }
 216
 217 bool TCImplementation::IsLessThan(uchar const * pattern) const
 218 {
 219     if (CountLessThan(pattern) > 0)
 220         return true;
 221     return false;
 222 }
 223
 224 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 225 {
 226     if (CountLessThan(pattern, begin, end) > 0)
 227         return true;
 228     return false;
 229 }
 230
 231 /******************************************************************
 232  * Counting queries
 233  */
 234 ulong TCImplementation::Count(uchar const * pattern) const
 235 {
 236     TextPosition m = strlen((char *)pattern);
 237     if (m == 0)
 238         return 0;
 239
 240     TextPosition sp = 0, ep = 0;
 241     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 242     return count;
 243 }
 244
 245 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 246 {
 247     TextPosition m = strlen((char *)pattern);
 248     if (m == 0)
 249         return numberOfTexts;
 250
 251     TextPosition sp = 0, ep = 0;
 252     Search(pattern, m, &sp, &ep);
 253
 254     // Count end-markers in result interval
 255     return CountEndmarkers(sp, ep);
 256 }
 257
 258 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 259 {
 260     TextPosition m = strlen((char *)pattern);
 261     if (m == 0)
 262         return numberOfTexts;
 263
 264     TextPosition sp = 0, ep = 0;
 265     Search(pattern, m, &sp, &ep);
 266
 267     // Count end-markers in result interval
 268     return CountEndmarkers(sp, ep, begin, end);
 269 }
 270
 271 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 272 {
 273     TextPosition m = strlen((char *)pattern);
 274     if (m == 0)
 275         return numberOfTexts;
 276
 277     TextPosition sp = 0, ep = 0;
 278     // Search with end-marker
 279     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 280
 281     return count;
 282 }
 283
 284 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 285 {
 286     TextPosition m = strlen((char *)pattern);
 287     if (m == 0)
 288         return numberOfTexts;
 289
 290     TextPosition sp = 0, ep = 0;
 291     // Search with end-marker
 292     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 293
 294     return count;
 295 }
 296
 297 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 298 {
 299     TextPosition m = strlen((char const *)pattern);
 300     if (m == 0)
 301         return 0; // No empty texts.
 302
 303     TextPosition sp = 0, ep = 0;
 304     // Match including end-marker
 305     Search(pattern, m+1, &sp, &ep);
 306
 307     // Count end-markers in result interval
 308     return CountEndmarkers(sp, ep);
 309 }
 310
 311 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 312 {
 313     TextPosition m = strlen((char const *)pattern);
 314     if (m == 0)
 315         return 0; // No empty texts.
 316
 317     TextPosition sp = 0, ep = 0;
 318     // Match including end-marker
 319     Search(pattern, m+1, &sp, &ep, begin, end);
 320
 321     // Count end-markers in result interval
 322     return CountEndmarkers(sp, ep); // Already within [begin, end]
 323 }
 324
 325 unsigned TCImplementation::CountContains(uchar const * pattern) const
 326 {
 327     TextPosition m = strlen((char const *)pattern);
 328     if (m == 0)
 329         return numberOfTexts; // Total number of texts.
 330
 331     // Here counting is as slow as fetching the result set
 332     // because we have to filter out occ's that fall within same document.
 333     TextCollection::document_result result = Contains(pattern);
 334     return result.size();
 335 }
 336
 337 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 338 {
 339     TextPosition m = strlen((char const *)pattern);
 340     if (m == 0)
 341         return numberOfTexts; // Total number of texts.
 342
 343     // Here counting is as slow as fetching the result set
 344     // because we have to filter out occ's that fall within same document.
 345     TextCollection::document_result result = Contains(pattern, begin, end);
 346     return result.size();
 347 }
 348
 349 // Less than or equal
 350 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 351 {
 352     TextPosition m = strlen((char const *)pattern);
 353     if (m == 0)
 354         return 0; // No empty texts.
 355
 356     TextPosition sp = 0, ep = 0;
 357     SearchLessThan(pattern, m, &sp, &ep);
 358
 359     // Count end-markers in result interval
 360     return CountEndmarkers(sp, ep);
 361 }
 362
 363 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 364 {
 365     TextPosition m = strlen((char const *)pattern);
 366     if (m == 0)
 367         return 0; // No empty texts.
 368
 369     TextPosition sp = 0, ep = 0;
 370     SearchLessThan(pattern, m, &sp, &ep);
 371
 372     // Count end-markers in result interval
 373     return CountEndmarkers(sp, ep, begin, end);
 374 }
 375
 376 /**
 377  * Document reporting queries
 378  */
 379 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 380 {
 381     TextPosition m = strlen((char *)pattern);
 382     if (m == 0)
 383         return TextCollection::document_result(); // FIXME Should return all 1...k
 384
 385     TextPosition sp = 0, ep = 0;
 386     Search(pattern, m, &sp, &ep);
 387
 388     // Iterate through end-markers in [sp,ep]:
 389     return EnumerateEndmarkers(sp, ep);
 390 }
 391
 392 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 393 {
 394     TextPosition m = strlen((char *)pattern);
 395     if (m == 0)
 396         return TextCollection::document_result(); // FIXME Should return all 1...k
 397
 398     TextPosition sp = 0, ep = 0;
 399     Search(pattern, m, &sp, &ep);
 400
 401     // Return end-markers in [sp,ep] and [begin, end]:
 402     return EnumerateEndmarkers(sp, ep, begin, end);
 403 }
 404
 405 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 406 {
 407     TextPosition m = strlen((char *)pattern);
 408     if (m == 0)
 409         return TextCollection::document_result(); // FIXME Should return all 1...k
 410
 411     TextPosition sp = 0, ep = 0;
 412     // Search with end-marker
 413     Search(pattern, m+1, &sp, &ep);
 414
 415     TextCollection::document_result result;
 416     result.reserve(ep-sp+1); // Try to avoid reallocation.
 417
 418     // Check each occurrence
 419     for (; sp <= ep; ++sp)
 420     {
 421         TextPosition i = sp;
 422
 423         uchar c = alphabetrank->access(i);
 424         while (c != '\0' && !sampled->access(i))
 425         {
 426             i = C[c]+alphabetrank->rank(c,i)-1;
 427             c = alphabetrank->access(i);
 428         }
 429         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 430
 431         if (c == '\0')
 432         {
 433             // Rank among the end-markers in BWT
 434             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 435             result.push_back(Doc->access(endmarkerRank));
 436         }
 437         else // Sampled position
 438         {
 439             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 440             result.push_back(docId);
 441         }
 442     }
 443
 444     return result;
 445 }
 446
 447 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 448 {
 449     TextPosition m = strlen((char *)pattern);
 450     if (m == 0)
 451         return TextCollection::document_result(); // FIXME Should return all 1...k
 452
 453     TextPosition sp = 0, ep = 0;
 454     // Search with end-marker
 455     Search(pattern, m+1, &sp, &ep, begin, end);
 456
 457     TextCollection::document_result result;
 458     result.reserve(ep-sp+1); // Try to avoid reallocation.
 459
 460     // Check each occurrence, already within [begin, end]
 461     for (; sp <= ep; ++sp)
 462     {
 463         TextPosition i = sp;
 464
 465         uchar c = alphabetrank->access(i);
 466         while (c != '\0' && !sampled->access(i))
 467         {
 468             i = C[c]+alphabetrank->rank(c,i)-1;
 469             c = alphabetrank->access(i);
 470         }
 471         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 472
 473         if (c == '\0')
 474         {
 475             // Rank among the end-markers in BWT
 476             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 477             result.push_back(Doc->access(endmarkerRank));
 478         }
 479         else // Sampled position
 480         {
 481             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 482             result.push_back(docId);
 483         }
 484     }
 485
 486     return result;
 487 }
 488
 489
 490 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 491 {
 492     TextPosition m = strlen((char const *)pattern);
 493     if (m == 0)
 494         return TextCollection::document_result(); // FIXME Should return all empty texts
 495
 496     TextPosition sp = 0, ep = 0;
 497     // Match including end-marker
 498     Search(pattern, m+1, &sp, &ep);
 499
 500     // Report end-markers in result interval
 501     return EnumerateEndmarkers(sp, ep);
 502 }
 503
 504 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 505 {
 506     TextPosition m = strlen((char const *)pattern);
 507     if (m == 0)
 508         return TextCollection::document_result(); // FIXME Should return all empty texts
 509
 510     TextPosition sp = 0, ep = 0;
 511     // Match including end-marker
 512     Search(pattern, m+1, &sp, &ep, begin, end);
 513
 514     // Report end-markers in result interval
 515     return EnumerateEndmarkers(sp, ep, begin, end);
 516 }
 517
 518
 519 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 520 {
 521     TextPosition m = strlen((char *)pattern);
 522     if (m == 0)
 523         return TextCollection::document_result();
 524
 525     TextPosition sp = 0, ep = 0;
 526     // Search all occurrences
 527     Search(pattern, m, &sp, &ep);
 528
 529     // We want unique document indentifiers, using std::set to collect them
 530     std::set<DocId> resultSet;
 531     EnumerateDocuments(resultSet, sp, ep);
 532
 533     // Convert std::set to std::vector
 534     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 535     return result;
 536 }
 537
 538 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 539 {
 540     TextPosition m = strlen((char *)pattern);
 541     if (m == 0)
 542         return TextCollection::document_result();
 543
 544     TextPosition sp = 0, ep = 0;
 545     // Search all occurrences
 546     Search(pattern, m, &sp, &ep);
 547
 548     // We want unique document indentifiers, using std::set to collect them
 549     std::set<DocId> resultSet;
 550     EnumerateDocuments(resultSet, sp, ep, begin, end);
 551
 552     // Convert std::set to std::vector
 553     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 554     return result;
 555 }
 556
 557 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 558 {
 559     TextPosition m = strlen((char *)pattern);
 560     if (m == 0)
 561         return TextCollection::document_result(); // empty result set
 562
 563     TextPosition sp = 0, ep = 0;
 564     SearchLessThan(pattern, m, &sp, &ep);
 565
 566     // Report end-markers in result interval
 567     return EnumerateEndmarkers(sp, ep);
 568 }
 569
 570 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 571 {
 572     TextPosition m = strlen((char *)pattern);
 573     if (m == 0)
 574         return TextCollection::document_result(); // empty result set
 575
 576     TextPosition sp = 0, ep = 0;
 577     SearchLessThan(pattern, m, &sp, &ep);
 578
 579     // Iterate through end-markers in [sp,ep] and [begin, end]:
 580     return EnumerateEndmarkers(sp, ep, begin, end);
 581 }
 582
 583
 584 TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const
 585 {
 586     TextPosition m = strlen((char *)pattern);
 587     if (m == 0)
 588         return TextCollection::document_result(); // empty result set
 589
 590     suffix_range_vector ranges;
 591     kmismatches(ranges, pattern, 0, n-1, m, k);
 592     std::set<DocId> resultSet;
 593
 594     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 595         // Iterate through docs in [sp,ep]:
 596         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 597
 598     // Convert std::set to std::vector
 599     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 600     return result;
 601 }
 602
 603 TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const
 604 {
 605     TextPosition m = strlen((char *)pattern);
 606     if (m == 0)
 607         return TextCollection::document_result(); // empty result set
 608
 609     suffix_range_vector ranges;
 610     ulong *dd = new ulong[m+1];
 611     for (ulong i=0;i<m+1;i++)
 612         dd[i]=i;
 613     kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 614     delete [] dd;
 615
 616     std::set<DocId> resultSet;
 617     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 618         // Iterate through docs in [sp,ep]:
 619         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 620
 621     // Convert std::set to std::vector
 622     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 623     return result;
 624 }
 625
 626
 627 /**
 628  * Full result set queries
 629  */
 630 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 631 {
 632     TextPosition m = strlen((char *)pattern);
 633     if (m == 0)
 634         return full_result(); // FIXME Throw exception?
 635
 636     TextPosition sp = 0, ep = 0;
 637     // Search all occurrences
 638     Search(pattern, m, &sp, &ep);
 639
 640     full_result result;
 641     result.reserve(ep-sp+1); // Try to avoid reallocation.
 642     EnumeratePositions(result, sp, ep);
 643
 644     return result;
 645 }
 646
 647 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 648 {
 649     TextPosition m = strlen((char *)pattern);
 650     if (m == 0)
 651         return full_result(); // FIXME Throw exception?
 652
 653     TextPosition sp = 0, ep = 0;
 654     // Search all occurrences
 655     Search(pattern, m, &sp, &ep);
 656
 657     full_result result;
 658     result.reserve(ep-sp+1); // Try to avoid reallocation.
 659     EnumeratePositions(result, sp, ep, begin, end);
 660
 661     return result;
 662 }
 663
 664 TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const
 665 {
 666     TextPosition m = strlen((char *)pattern);
 667     if (m == 0)
 668         return TextCollection::full_result(); // empty result set
 669
 670     suffix_range_vector ranges;
 671     ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
 672
 673     TextCollection::full_result result;
 674     result.reserve(count); // avoid reallocation.
 675     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 676         // Iterate through docs in [sp,ep]:
 677         EnumeratePositions(result, (*it).first, (*it).second);
 678     return result;
 679 }
 680
 681 TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const
 682 {
 683     TextPosition m = strlen((char *)pattern);
 684     if (m == 0)
 685         return TextCollection::full_result(); // empty result set
 686
 687     suffix_range_vector ranges;
 688     ulong *dd = new ulong[m+1];
 689     for (unsigned i=0;i<m+1;i++)
 690         dd[i]=i;
 691     ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 692     delete [] dd;
 693
 694     TextCollection::full_result result;
 695     result.reserve(count); // avoid reallocation.
 696     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 697         // Iterate through docs in [sp,ep]:
 698         EnumeratePositions(result, (*it).first, (*it).second);
 699     return result;
 700 }
 701
 702
 703 /**
 704  * Save index to a file handle
 705  *
 706  * Throws a std::runtime_error exception on i/o error.
 707  * First byte that is saved represents the version number of the save file.
 708  * In version 2 files, the data fields are:
 709  *  uchar versionFlag;
 710     TextPosition n;
 711     unsigned samplerate;
 712     unsigned C[256];
 713     TextPosition bwtEndPos;
 714     static_sequence * alphabetrank;
 715     BSGAP *sampled;
 716     BlockArray *suffixes;
 717     BlockArray *suffixDocId;
 718     unsigned numberOfTexts;
 719     ulong maxTextLength;
 720     static_sequence *docId;
 721  */
 722 void TCImplementation::Save(FILE *file) const
 723 {
 724     // Saving version info:
 725     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 726         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 727
 728     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 729         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 730     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 731         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 732
 733     for(ulong i = 0; i < 256; ++i)
 734         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 735             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 736
 737     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 738         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 739
 740     alphabetrank->save(file);
 741     sampled->save(file);
 742     suffixes->Save(file);
 743     suffixDocId->Save(file);
 744
 745     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 746         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 747     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 748         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
 749
 750     Doc->save(file);
 751     textStorage->Save(file);
 752     fflush(file);
 753 }
 754
 755
 756 /**
 757  * Load index from a file handle
 758  *
 759  * Throws a std::runtime_error exception on i/o error.
 760  * For more info, see TCImplementation::Save().
 761  */
 762 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
 763     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 764       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 765 {
 766     uchar verFlag = 0;
 767     if (std::fread(&verFlag, 1, 1, file) != 1)
 768         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
 769     if (verFlag != TCImplementation::versionFlag)
 770         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
 771
 772     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 773         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
 774     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 775         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
 776 // FIXME samplerate can not be changed during load.
 777 //    if (this->samplerate == 0)
 778 //        this->samplerate = samplerate;
 779
 780     for(ulong i = 0; i < 256; ++i)
 781         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 782             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
 783
 784     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 785         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
 786
 787     alphabetrank = static_sequence::load(file);
 788     sampled = static_bitsequence::load(file);
 789     suffixes = new BlockArray(file);
 790     suffixDocId = new BlockArray(file);
 791
 792     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 793         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
 794     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 795         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
 796
 797     Doc = static_sequence::load(file);
 798     textStorage = TextStorage::Load(file);
 799
 800     // FIXME Construct data structures with new samplerate
 801     //maketables();
 802 }
 803
 804
 805
 806 /**
 807  * Rest of the functions follow...
 808  */
 809 ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
 810 {
 811     int c;
 812     while (*sp<=*ep && i>=1)
 813     {
 814         c = (int)pattern[--i];
 815         *sp = C[c]+alphabetrank->rank(c,*sp-1);
 816         *ep = C[c]+alphabetrank->rank(c,*ep)-1;
 817     }
 818     if (*sp<=*ep)
 819         return *ep - *sp + 1;
 820     else
 821         return 0;
 822 }
 823
 824
 825 ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
 826 {
 827     if (sp>ep) return 0;
 828     if (j == 0)
 829     {
 830         result.push_back(std::make_pair(sp,ep));
 831         return ep-sp+1;
 832     }
 833     int c;
 834     ulong spnew;
 835     ulong epnew;
 836     int knew;
 837     ulong sum=0;
 838     if (k==0)
 839     {
 840         sum = searchPrefix(pattern, j, &sp, &ep);
 841         if (sp<=ep)
 842             result.push_back(std::make_pair(sp, ep));
 843         return sum;
 844     }
 845     vector<int> chars = alphabetrank->accessAll(sp, ep);
 846     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 847     {
 848         if (*it == 0)
 849             continue; // skip '\0'
 850         c = *it;
 851         spnew = C[c]+alphabetrank->rank(c,sp-1);
 852         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 853         if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
 854         if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
 855     }
 856     return sum;
 857 }
 858
 859 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
 860 ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
 861 {
 862     ulong sum=0;
 863     if (d[m]<=k) // range of suffixes with at most k-errors found
 864     {
 865         if (sp<=ep)
 866             result.push_back(std::make_pair(sp, ep));
 867         sum += (sp<=ep)?ep-sp+1:0ul;
 868     }
 869     if (sp>ep || j==0)
 870         return sum;
 871     ulong *dnew = new ulong[m+1];
 872     int c;
 873     ulong spnew;
 874     ulong p,lowerbound;
 875     ulong epnew;
 876     vector<int> chars = alphabetrank->accessAll(sp, ep);
 877     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 878     {
 879         if (*it == 0)
 880             continue; // skip '\0'
 881         c = *it;
 882         spnew = C[c]+alphabetrank->rank(c,sp-1);
 883         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 884         if (spnew>epnew) continue;
 885         dnew[0]=m+k-j+1;
 886         lowerbound=k+1;
 887         for (p=1; p<=m; p++) {
 888             dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
 889             if (dnew[p]<lowerbound)
 890                 lowerbound = dnew[p];
 891         }
 892         if (lowerbound<=k)
 893             sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
 894     }
 895     delete [] dnew;
 896     return sum;
 897 }
 898
 899
 900 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 901 {
 902     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 903     int c = (int)pattern[m-1];
 904     TextPosition i=m-1;
 905     TextPosition sp = C[c];
 906     TextPosition ep = C[c+1]-1;
 907     while (sp<=ep && i>=1)
 908     {
 909 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 910         c = (int)pattern[--i];
 911         sp = C[c]+alphabetrank->rank(c,sp-1);
 912         ep = C[c]+alphabetrank->rank(c,ep)-1;
 913     }
 914     *spResult = sp;
 915     *epResult = ep;
 916     if (sp<=ep)
 917         return ep - sp + 1;
 918     else
 919         return 0;
 920 }
 921
 922 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 923 {
 924     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 925     int c = (int)pattern[m-1];
 926     assert(c == 0); // Start from endmarkers
 927     TextPosition i=m-1;
 928     TextPosition sp = begin;
 929     TextPosition ep = end;
 930     while (sp<=ep && i>=1)
 931     {
 932 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 933         c = (int)pattern[--i];
 934         sp = C[c]+alphabetrank->rank(c,sp-1);
 935         ep = C[c]+alphabetrank->rank(c,ep)-1;
 936     }
 937     *spResult = sp;
 938     *epResult = ep;
 939     if (sp<=ep)
 940         return ep - sp + 1;
 941     else
 942         return 0;
 943 }
 944
 945
 946 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 947 {
 948     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 949     uint c = (int)pattern[m-1];
 950     TextPosition i=m-1;
 951     TextPosition sp = 1;
 952     TextPosition ep = C[c+1]-1;
 953     while (sp<=ep && i>=1)
 954     {
 955 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 956         c = (int)pattern[--i];
 957         uint result = alphabetrank->rankLessThan(c,ep);
 958         if (result == ~0u)
 959             ep = 0;
 960         else
 961             ep = C[c]+result-1;
 962     }
 963     *spResult = sp;
 964     *epResult = ep;
 965     if (sp<=ep)
 966         return ep - sp + 1;
 967     else
 968         return 0;
 969 }
 970
 971
 972 TCImplementation::~TCImplementation() {
 973     delete alphabetrank;
 974     delete sampled;
 975     delete suffixes;
 976     delete suffixDocId;
 977     delete Doc;
 978     delete textStorage;
 979 }
 980
 981 void TCImplementation::makewavelet(uchar *bwt)
 982 {
 983     ulong i, min = 0,
 984              max;
 985     for (i=0;i<256;i++)
 986         C[i]=0;
 987     for (i=0;i<n;++i)
 988         C[(int)bwt[i]]++;
 989     for (i=0;i<256;i++)
 990         if (C[i]>0) {min = i; break;}
 991     for (i=255;i>=min;--i)
 992         if (C[i]>0) {max = i; break;}
 993
 994     ulong prev=C[0], temp;
 995     C[0]=0;
 996     for (i=1;i<256;i++) {
 997         temp = C[i];
 998         C[i]=C[i-1]+prev;
 999         prev = temp;
1000     }
1001 //    this->codetable = node::makecodetable(bwt,n);
1002 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1003 //    delete [] bwt;
1004     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1005 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1006
1007 #ifdef DEBUG_MEMUSAGE
1008     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1009     HeapProfiler::ResetMaxHeapConsumption();
1010 #endif
1011
1012     alphabet_mapper * am = new alphabet_mapper_none();
1013     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1014     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1015     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1016     delete bmb;
1017     bwt = 0; // already deleted
1018
1019 #ifdef DEBUG_MEMUSAGE
1020     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1021     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1022 #endif
1023 }
1024
1025 void TCImplementation::maketables(ulong sampleLength, char tsType)
1026 {
1027     // Calculate BWT end-marker position (of last inserted text)
1028     {
1029         ulong i = 0;
1030         uint alphabetrank_i_tmp = 0;
1031         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1032         while (c != '\0')
1033         {
1034             i = C[c]+alphabetrank_i_tmp-1;
1035             c = alphabetrank->access(i, alphabetrank_i_tmp);
1036         }
1037
1038         this->bwtEndPos = i;
1039     }
1040
1041 #ifdef DEBUG_MEMUSAGE
1042     std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1043     HeapProfiler::ResetMaxHeapConsumption();
1044 #endif
1045
1046     // Build up array for text starting positions
1047 //    BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1048 //    (*textStartPos)[0] = 0;
1049
1050     // Mapping from end-markers to doc ID's:
1051     unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
1052     uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
1053
1054     BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
1055     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1056     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1057         sampledpositions[i] = 0;
1058
1059     ulong x,p=bwtEndPos;
1060     ulong sampleCount = 0;
1061     // Keeping track of text position of prev. end-marker seen
1062     ulong posOfSuccEndmarker = n-1;
1063     DocId textId = numberOfTexts;
1064     ulong ulongmax = 0;
1065     ulongmax--;
1066     uint alphabetrank_i_tmp =0;
1067
1068     TextStorageBuilder tsbuilder(n);
1069     Tools::StartTimer();
1070
1071     for (ulong i=n-1;i<ulongmax;i--) {
1072         // i substitutes SA->GetPos(i)
1073         x=(i==n-1)?0:i+1;
1074
1075         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1076         tsbuilder[i] = c;
1077
1078         if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
1079         {
1080             set_field(sampledpositions,1,p,1);
1081             (*positions)[sampleCount] = p;
1082             sampleCount ++;
1083         }
1084
1085         if (c == '\0')
1086         {
1087             --textId;
1088
1089             // Record the order of end-markers in BWT:
1090             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1091             set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
1092
1093             // Store text length and text start position:
1094             if (textId < (DocId)numberOfTexts - 1)
1095             {
1096 //                (*textStartPos)[textId + 1] = x;  // x-1 is text position of end-marker.
1097                 posOfSuccEndmarker = i;
1098             }
1099
1100             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1101             p = textId; // Correct LF-mapping to the last char of the previous text.
1102         }
1103         else // Now c != '\0', do LF-mapping:
1104             p = C[c]+alphabetrank_i_tmp-1;
1105     }
1106     assert(textId == 0);
1107
1108 #ifdef DEBUG_MEMUSAGE
1109     std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1110     HeapProfiler::ResetMaxHeapConsumption();
1111 #endif
1112
1113     textStorage = tsbuilder.InitTextStorage(tsType);
1114
1115 #ifdef DEBUG_MEMUSAGE
1116     std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1117     HeapProfiler::ResetMaxHeapConsumption();
1118 #endif
1119
1120     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1121     delete [] sampledpositions;
1122     assert(sampleCount == sampleLength);
1123     assert(sampled->rank1(n-1) == sampleLength);
1124
1125 #ifdef DEBUG_MEMUSAGE
1126     std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1127     HeapProfiler::ResetMaxHeapConsumption();
1128 #endif
1129
1130     // Suffixes store an offset from the text start position
1131     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1132     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1133
1134     x = n - 2;
1135     posOfSuccEndmarker = n-1;
1136     for(ulong i=0; i<sampleLength; i++) {
1137         // Find next sampled text position
1138         while ((posOfSuccEndmarker - x) % samplerate != 0)
1139         {
1140             --x;
1141             assert(x != ~0lu);
1142             if (textStorage->IsEndmarker(x))
1143                 posOfSuccEndmarker = x--;
1144         }
1145         assert((*positions)[i] < n);
1146         ulong j = sampled->rank1((*positions)[i]);
1147
1148         assert(j != 0); // if (j==0) j=sampleLength;
1149
1150         TextPosition textPos = (x==n-1)?0:x+1;
1151         (*suffixDocId)[j-1] = textStorage->DocIdAtTextPos(textPos);
1152
1153         assert((*suffixDocId)[j-1] < numberOfTexts);
1154         // calculate offset from text start:
1155         (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
1156         --x;
1157         if (x != ~0lu && textStorage->IsEndmarker(x))
1158             posOfSuccEndmarker = x--;
1159     }
1160
1161     delete positions;
1162
1163 #ifdef DEBUG_MEMUSAGE
1164     std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " <<  HeapProfiler::GetHeapConsumption() << " / " <<  HeapProfiler::GetMaxHeapConsumption() << std::endl;
1165     HeapProfiler::ResetMaxHeapConsumption();
1166 #endif
1167
1168     /**
1169      * Second pass: check tables
1170      */
1171 /*    p=bwtEndPos;
1172     textId = numberOfTexts;
1173     for (ulong i=n-1;i<ulongmax;i--) {
1174         x=(i==n-1)?0:i+1;
1175
1176         if (sampled->access(p)) {
1177             ulong j = sampled->rank1(p)-1;
1178             assert((*suffixDocId)[j] == DocIdAtTextPos(textStartPos, x));
1179
1180             // calculate offset from text start:
1181             assert((*suffixes)[j] == x - (*textStartPos)[(*suffixDocId)[j]]);
1182         }
1183
1184         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1185
1186         if (c == '\0')
1187         {
1188             --textId;
1189             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1190             p = textId; // Correct LF-mapping to the last char of the previous text.
1191         }
1192         else // Now c != '\0', do LF-mapping:
1193             p = C[c]+alphabetrank_i_tmp-1;
1194     }
1195     assert(textId == 0);
1196     delete textStartPos
1197 */
1198
1199 #ifdef DEBUG_MEMUSAGE
1200     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1201     HeapProfiler::ResetMaxHeapConsumption();
1202 #endif
1203
1204     alphabet_mapper * am = new alphabet_mapper_none();
1205     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
1206     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
1207     delete bmb;
1208     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1209
1210 #ifdef DEBUG_MEMUSAGE
1211     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1212 #endif
1213 }
1214
1215
1216 /**
1217  * Finds document identifier for given text position
1218  *
1219  * Starting text position of the document is stored into second parameter.
1220  * Binary searching on text starting positions.
1221  */
1222 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1223 {
1224     assert(i < n);
1225
1226     DocId a = 0;
1227     DocId b = numberOfTexts - 1;
1228     while (a < b)
1229     {
1230         DocId c = a + (b - a)/2;
1231         if ((*textStartPos)[c] > i)
1232             b = c - 1;
1233         else if ((*textStartPos)[c+1] > i)
1234             return c;
1235         else
1236             a = c + 1;
1237     }
1238
1239     assert(a < (DocId)numberOfTexts);
1240     assert(i >= (*textStartPos)[a]);
1241     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1242     return a;
1243 }
1244
1245
1246 } // namespace SXSI
1247