TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 //#define DEBUG_MEMUSAGE
  23 #ifdef DEBUG_MEMUSAGE
  24 #include "HeapProfiler.h" // FIXME remove
  25 #endif
  26
  27 #include <iostream>
  28 #include <map>
  29 #include <set>
  30 #include <vector>
  31 #include <utility>
  32 #include <stdexcept>
  33 #include <cassert>
  34 #include <cstring> // For strlen()
  35 using std::vector;
  36 using std::pair;
  37 using std::make_pair;
  38 using std::map;
  39
  40 namespace SXSI
  41 {
  42
  43 // Save file version info
  44 const uchar TCImplementation::versionFlag = 6;
  45
  46 /**
  47  * Constructor inits an empty dynamic FM-index.
  48  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  49  */
  50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
  51     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  52       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  53 {
  54     makewavelet(bwt); // Deletes bwt!
  55     bwt = 0;
  56
  57     // Make sampling tables
  58     maketables();
  59 }
  60
  61 bool TCImplementation::EmptyText(DocId k) const
  62 {
  63     assert(k < (DocId)numberOfTexts);
  64     return false; // Empty texts are not indexed
  65 }
  66
  67 uchar * TCImplementation::GetText(DocId k) const
  68 {
  69     assert(k < (DocId)numberOfTexts);
  70
  71     return textStorage->GetText(k);
  72 /*    TextPosition i = k;
  73
  74     string result;
  75     // Reserve average string length to avoid reallocs
  76     result.reserve(n/numberOfTexts);
  77
  78     uchar c = alphabetrank->access(i);
  79     while (c != '\0')
  80     {
  81         result.push_back(c);
  82         i = C[c]+alphabetrank->rank(c,i)-1;
  83
  84         c = alphabetrank->access(i); // "next" char.
  85     }
  86
  87     // Convert to uchar (FIXME return string?)
  88     i = result.size();
  89     uchar* res = new uchar[i+1];
  90     res[i] = '\0';
  91     for (ulong j = 0; j < i; ++j)
  92         res[i-j-1] = result[j];
  93         return res;*/
  94 }
  95 /*
  96  * Not supported
  97 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
  98 {
  99     assert(k < (DocId)numberOfTexts);
 100     assert(j < (*textLength)[k]);
 101     assert(i <= j);
 102
 103     ulong textRank = 0;
 104
 105     // Start position of k'th text
 106     ulong start = (*textStartPos)[k];
 107
 108     return Substring(i + start, j-i+1);
 109     }*/
 110
 111 /******************************************************************
 112  * Existential queries
 113  */
 114 bool TCImplementation::IsPrefix(uchar const * pattern) const
 115 {
 116     TextPosition m = strlen((char *)pattern);
 117     if (m == 0)
 118         return true;
 119
 120     TextPosition sp = 0, ep = 0;
 121     Search(pattern, m, &sp, &ep);
 122
 123     // Check for end-marker(s) in result interval
 124     if (CountEndmarkers(sp, ep))
 125         return true;
 126     return false;
 127 }
 128
 129 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 130 {
 131     TextPosition m = strlen((char *)pattern);
 132     if (m == 0)
 133         return true;
 134
 135     TextPosition sp = 0, ep = 0;
 136     Search(pattern, m, &sp, &ep);
 137
 138     // Check for end-marker(s) in result interval
 139     if (CountEndmarkers(sp, ep, begin, end))
 140         return true;
 141     return false;
 142 }
 143
 144
 145 bool TCImplementation::IsSuffix(uchar const *pattern) const
 146 {
 147     // Here counting is as fast as IsSuffix():
 148     if (CountSuffix(pattern) > 0)
 149         return true;
 150     return false;
 151 }
 152
 153 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 154 {
 155     // Here counting is as fast as IsSuffix():
 156     if (CountSuffix(pattern, begin, end) > 0)
 157         return true;
 158     return false;
 159 }
 160
 161 bool TCImplementation::IsEqual(uchar const *pattern) const
 162 {
 163     TextPosition m = std::strlen((char *)pattern);
 164     if (m == 0)
 165         return false; // No empty texts exists
 166
 167     TextPosition sp = 0, ep = 0;
 168     // Match including end-marker
 169     Search(pattern, m+1, &sp, &ep);
 170
 171     // Check for end-marker(s) in result interval
 172     if (CountEndmarkers(sp, ep))
 173         return true;
 174     return false;
 175 }
 176
 177 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 178 {
 179     TextPosition m = std::strlen((char *)pattern);
 180     if (m == 0)
 181         return false; // No empty texts exists
 182
 183     TextPosition sp = 0, ep = 0;
 184     // Match including end-marker
 185     Search(pattern, m+1, &sp, &ep, begin, end);
 186
 187     // Check for end-marker(s) in result interval
 188     if (CountEndmarkers(sp, ep))
 189         return true;
 190     return false;
 191 }
 192
 193 bool TCImplementation::IsContains(uchar const * pattern) const
 194 {
 195     TextPosition m = strlen((char *)pattern);
 196     if (m == 0)
 197         return true;
 198
 199     TextPosition sp = 0, ep = 0;
 200     // Just check if pattern exists somewhere
 201     ulong count = Search(pattern, m, &sp, &ep);
 202
 203     if (count > 0)
 204         return true;
 205     return false;
 206 }
 207
 208 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 209 {
 210     // Here counting is as fast as existential querying
 211     if (CountContains(pattern, begin, end) > 0)
 212         return true; // FIXME No need to filter result set
 213     return false;
 214 }
 215
 216 bool TCImplementation::IsLessThan(uchar const * pattern) const
 217 {
 218     if (CountLessThan(pattern) > 0)
 219         return true;
 220     return false;
 221 }
 222
 223 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 224 {
 225     if (CountLessThan(pattern, begin, end) > 0)
 226         return true;
 227     return false;
 228 }
 229
 230 /******************************************************************
 231  * Counting queries
 232  */
 233 ulong TCImplementation::Count(uchar const * pattern) const
 234 {
 235     TextPosition m = strlen((char *)pattern);
 236     if (m == 0)
 237         return 0;
 238
 239     TextPosition sp = 0, ep = 0;
 240     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 241     return count;
 242 }
 243
 244 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 245 {
 246     TextPosition m = strlen((char *)pattern);
 247     if (m == 0)
 248         return numberOfTexts;
 249
 250     TextPosition sp = 0, ep = 0;
 251     Search(pattern, m, &sp, &ep);
 252
 253     // Count end-markers in result interval
 254     return CountEndmarkers(sp, ep);
 255 }
 256
 257 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 258 {
 259     TextPosition m = strlen((char *)pattern);
 260     if (m == 0)
 261         return numberOfTexts;
 262
 263     TextPosition sp = 0, ep = 0;
 264     Search(pattern, m, &sp, &ep);
 265
 266     // Count end-markers in result interval
 267     return CountEndmarkers(sp, ep, begin, end);
 268 }
 269
 270 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 271 {
 272     TextPosition m = strlen((char *)pattern);
 273     if (m == 0)
 274         return numberOfTexts;
 275
 276     TextPosition sp = 0, ep = 0;
 277     // Search with end-marker
 278     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 279
 280     return count;
 281 }
 282
 283 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 284 {
 285     TextPosition m = strlen((char *)pattern);
 286     if (m == 0)
 287         return numberOfTexts;
 288
 289     TextPosition sp = 0, ep = 0;
 290     // Search with end-marker
 291     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 292
 293     return count;
 294 }
 295
 296 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 297 {
 298     TextPosition m = strlen((char const *)pattern);
 299     if (m == 0)
 300         return 0; // No empty texts.
 301
 302     TextPosition sp = 0, ep = 0;
 303     // Match including end-marker
 304     Search(pattern, m+1, &sp, &ep);
 305
 306     // Count end-markers in result interval
 307     return CountEndmarkers(sp, ep);
 308 }
 309
 310 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 311 {
 312     TextPosition m = strlen((char const *)pattern);
 313     if (m == 0)
 314         return 0; // No empty texts.
 315
 316     TextPosition sp = 0, ep = 0;
 317     // Match including end-marker
 318     Search(pattern, m+1, &sp, &ep, begin, end);
 319
 320     // Count end-markers in result interval
 321     return CountEndmarkers(sp, ep); // Already within [begin, end]
 322 }
 323
 324 unsigned TCImplementation::CountContains(uchar const * pattern) const
 325 {
 326     TextPosition m = strlen((char const *)pattern);
 327     if (m == 0)
 328         return numberOfTexts; // Total number of texts.
 329
 330     // Here counting is as slow as fetching the result set
 331     // because we have to filter out occ's that fall within same document.
 332     TextCollection::document_result result = Contains(pattern);
 333     return result.size();
 334 }
 335
 336 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 337 {
 338     TextPosition m = strlen((char const *)pattern);
 339     if (m == 0)
 340         return numberOfTexts; // Total number of texts.
 341
 342     // Here counting is as slow as fetching the result set
 343     // because we have to filter out occ's that fall within same document.
 344     TextCollection::document_result result = Contains(pattern, begin, end);
 345     return result.size();
 346 }
 347
 348 // Less than or equal
 349 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 350 {
 351     TextPosition m = strlen((char const *)pattern);
 352     if (m == 0)
 353         return 0; // No empty texts.
 354
 355     TextPosition sp = 0, ep = 0;
 356     SearchLessThan(pattern, m, &sp, &ep);
 357
 358     // Count end-markers in result interval
 359     return CountEndmarkers(sp, ep);
 360 }
 361
 362 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 363 {
 364     TextPosition m = strlen((char const *)pattern);
 365     if (m == 0)
 366         return 0; // No empty texts.
 367
 368     TextPosition sp = 0, ep = 0;
 369     SearchLessThan(pattern, m, &sp, &ep);
 370
 371     // Count end-markers in result interval
 372     return CountEndmarkers(sp, ep, begin, end);
 373 }
 374
 375 /**
 376  * Document reporting queries
 377  */
 378 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 379 {
 380     TextPosition m = strlen((char *)pattern);
 381     if (m == 0)
 382         return TextCollection::document_result(); // FIXME Should return all 1...k
 383
 384     TextPosition sp = 0, ep = 0;
 385     Search(pattern, m, &sp, &ep);
 386
 387     // Iterate through end-markers in [sp,ep]:
 388     return EnumerateEndmarkers(sp, ep);
 389 }
 390
 391 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 392 {
 393     TextPosition m = strlen((char *)pattern);
 394     if (m == 0)
 395         return TextCollection::document_result(); // FIXME Should return all 1...k
 396
 397     TextPosition sp = 0, ep = 0;
 398     Search(pattern, m, &sp, &ep);
 399
 400     // Return end-markers in [sp,ep] and [begin, end]:
 401     return EnumerateEndmarkers(sp, ep, begin, end);
 402 }
 403
 404 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 405 {
 406     TextPosition m = strlen((char *)pattern);
 407     if (m == 0)
 408         return TextCollection::document_result(); // FIXME Should return all 1...k
 409
 410     TextPosition sp = 0, ep = 0;
 411     // Search with end-marker
 412     Search(pattern, m+1, &sp, &ep);
 413
 414     TextCollection::document_result result;
 415     result.reserve(ep-sp+1); // Try to avoid reallocation.
 416
 417     // Check each occurrence
 418     for (; sp <= ep; ++sp)
 419     {
 420         TextPosition i = sp;
 421
 422         uchar c = alphabetrank->access(i);
 423         while (c != '\0' && !sampled->access(i))
 424         {
 425             i = C[c]+alphabetrank->rank(c,i)-1;
 426             c = alphabetrank->access(i);
 427         }
 428         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 429
 430         if (c == '\0')
 431         {
 432             // Rank among the end-markers in BWT
 433             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 434             result.push_back(Doc->access(endmarkerRank));
 435         }
 436         else // Sampled position
 437         {
 438             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 439             result.push_back(docId);
 440         }
 441     }
 442
 443     return result;
 444 }
 445
 446 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 447 {
 448     TextPosition m = strlen((char *)pattern);
 449     if (m == 0)
 450         return TextCollection::document_result(); // FIXME Should return all 1...k
 451
 452     TextPosition sp = 0, ep = 0;
 453     // Search with end-marker
 454     Search(pattern, m+1, &sp, &ep, begin, end);
 455
 456     TextCollection::document_result result;
 457     result.reserve(ep-sp+1); // Try to avoid reallocation.
 458
 459     // Check each occurrence, already within [begin, end]
 460     for (; sp <= ep; ++sp)
 461     {
 462         TextPosition i = sp;
 463
 464         uchar c = alphabetrank->access(i);
 465         while (c != '\0' && !sampled->access(i))
 466         {
 467             i = C[c]+alphabetrank->rank(c,i)-1;
 468             c = alphabetrank->access(i);
 469         }
 470         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 471
 472         if (c == '\0')
 473         {
 474             // Rank among the end-markers in BWT
 475             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 476             result.push_back(Doc->access(endmarkerRank));
 477         }
 478         else // Sampled position
 479         {
 480             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 481             result.push_back(docId);
 482         }
 483     }
 484
 485     return result;
 486 }
 487
 488
 489 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 490 {
 491     TextPosition m = strlen((char const *)pattern);
 492     if (m == 0)
 493         return TextCollection::document_result(); // FIXME Should return all empty texts
 494
 495     TextPosition sp = 0, ep = 0;
 496     // Match including end-marker
 497     Search(pattern, m+1, &sp, &ep);
 498
 499     // Report end-markers in result interval
 500     return EnumerateEndmarkers(sp, ep);
 501 }
 502
 503 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 504 {
 505     TextPosition m = strlen((char const *)pattern);
 506     if (m == 0)
 507         return TextCollection::document_result(); // FIXME Should return all empty texts
 508
 509     TextPosition sp = 0, ep = 0;
 510     // Match including end-marker
 511     Search(pattern, m+1, &sp, &ep, begin, end);
 512
 513     // Report end-markers in result interval
 514     return EnumerateEndmarkers(sp, ep, begin, end);
 515 }
 516
 517
 518 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 519 {
 520     TextPosition m = strlen((char *)pattern);
 521     if (m == 0)
 522         return TextCollection::document_result();
 523
 524     TextPosition sp = 0, ep = 0;
 525     // Search all occurrences
 526     Search(pattern, m, &sp, &ep);
 527
 528     // We want unique document indentifiers, using std::set to collect them
 529     std::set<DocId> resultSet;
 530     EnumerateDocuments(resultSet, sp, ep);
 531
 532     // Convert std::set to std::vector
 533     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 534     return result;
 535 }
 536
 537 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 538 {
 539     TextPosition m = strlen((char *)pattern);
 540     if (m == 0)
 541         return TextCollection::document_result();
 542
 543     TextPosition sp = 0, ep = 0;
 544     // Search all occurrences
 545     Search(pattern, m, &sp, &ep);
 546
 547     // We want unique document indentifiers, using std::set to collect them
 548     std::set<DocId> resultSet;
 549     EnumerateDocuments(resultSet, sp, ep, begin, end);
 550
 551     // Convert std::set to std::vector
 552     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 553     return result;
 554 }
 555
 556 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 557 {
 558     TextPosition m = strlen((char *)pattern);
 559     if (m == 0)
 560         return TextCollection::document_result(); // empty result set
 561
 562     TextPosition sp = 0, ep = 0;
 563     SearchLessThan(pattern, m, &sp, &ep);
 564
 565     // Report end-markers in result interval
 566     return EnumerateEndmarkers(sp, ep);
 567 }
 568
 569 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 570 {
 571     TextPosition m = strlen((char *)pattern);
 572     if (m == 0)
 573         return TextCollection::document_result(); // empty result set
 574
 575     TextPosition sp = 0, ep = 0;
 576     SearchLessThan(pattern, m, &sp, &ep);
 577
 578     // Iterate through end-markers in [sp,ep] and [begin, end]:
 579     return EnumerateEndmarkers(sp, ep, begin, end);
 580 }
 581
 582
 583 TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const
 584 {
 585     TextPosition m = strlen((char *)pattern);
 586     if (m == 0)
 587         return TextCollection::document_result(); // empty result set
 588
 589     suffix_range_vector ranges;
 590     kmismatches(ranges, pattern, 0, n-1, m, k);
 591     std::set<DocId> resultSet;
 592
 593     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 594         // Iterate through docs in [sp,ep]:
 595         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 596
 597     // Convert std::set to std::vector
 598     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 599     return result;
 600 }
 601
 602 TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const
 603 {
 604     TextPosition m = strlen((char *)pattern);
 605     if (m == 0)
 606         return TextCollection::document_result(); // empty result set
 607
 608     suffix_range_vector ranges;
 609     ulong *dd = new ulong[m+1];
 610     for (ulong i=0;i<m+1;i++)
 611         dd[i]=i;
 612     kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 613     delete [] dd;
 614
 615     std::set<DocId> resultSet;
 616     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 617         // Iterate through docs in [sp,ep]:
 618         EnumerateDocuments(resultSet, (*it).first, (*it).second);
 619
 620     // Convert std::set to std::vector
 621     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 622     return result;
 623 }
 624
 625
 626 /**
 627  * Full result set queries
 628  */
 629 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 630 {
 631     TextPosition m = strlen((char *)pattern);
 632     if (m == 0)
 633         return full_result(); // FIXME Throw exception?
 634
 635     TextPosition sp = 0, ep = 0;
 636     // Search all occurrences
 637     Search(pattern, m, &sp, &ep);
 638
 639     full_result result;
 640     result.reserve(ep-sp+1); // Try to avoid reallocation.
 641     EnumeratePositions(result, sp, ep);
 642
 643     return result;
 644 }
 645
 646 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 647 {
 648     TextPosition m = strlen((char *)pattern);
 649     if (m == 0)
 650         return full_result(); // FIXME Throw exception?
 651
 652     TextPosition sp = 0, ep = 0;
 653     // Search all occurrences
 654     Search(pattern, m, &sp, &ep);
 655
 656     full_result result;
 657     result.reserve(ep-sp+1); // Try to avoid reallocation.
 658     EnumeratePositions(result, sp, ep, begin, end);
 659
 660     return result;
 661 }
 662
 663 TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const
 664 {
 665     TextPosition m = strlen((char *)pattern);
 666     if (m == 0)
 667         return TextCollection::full_result(); // empty result set
 668
 669     suffix_range_vector ranges;
 670     ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
 671
 672     TextCollection::full_result result;
 673     result.reserve(count); // avoid reallocation.
 674     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 675         // Iterate through docs in [sp,ep]:
 676         EnumeratePositions(result, (*it).first, (*it).second);
 677     return result;
 678 }
 679
 680 TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const
 681 {
 682     TextPosition m = strlen((char *)pattern);
 683     if (m == 0)
 684         return TextCollection::full_result(); // empty result set
 685
 686     suffix_range_vector ranges;
 687     ulong *dd = new ulong[m+1];
 688     for (unsigned i=0;i<m+1;i++)
 689         dd[i]=i;
 690     ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
 691     delete [] dd;
 692
 693     TextCollection::full_result result;
 694     result.reserve(count); // avoid reallocation.
 695     for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
 696         // Iterate through docs in [sp,ep]:
 697         EnumeratePositions(result, (*it).first, (*it).second);
 698     return result;
 699 }
 700
 701
 702 /**
 703  * Save index to a file handle
 704  *
 705  * Throws a std::runtime_error exception on i/o error.
 706  * First byte that is saved represents the version number of the save file.
 707  * In version 2 files, the data fields are:
 708  *  uchar versionFlag;
 709     TextPosition n;
 710     unsigned samplerate;
 711     unsigned C[256];
 712     TextPosition bwtEndPos;
 713     static_sequence * alphabetrank;
 714     BSGAP *sampled;
 715     BlockArray *suffixes;
 716     BlockArray *suffixDocId;
 717     unsigned numberOfTexts;
 718     ulong maxTextLength;
 719     static_sequence *docId;
 720  */
 721 void TCImplementation::Save(FILE *file) const
 722 {
 723     // Saving version info:
 724     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 725         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 726
 727     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 728         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 729     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 730         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 731
 732     for(ulong i = 0; i < 256; ++i)
 733         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 734             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 735
 736     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 737         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 738
 739     alphabetrank->save(file);
 740     sampled->save(file);
 741     suffixes->Save(file);
 742     suffixDocId->Save(file);
 743
 744     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 745         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 746     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 747         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
 748
 749     Doc->save(file);
 750     textStorage->Save(file);
 751     fflush(file);
 752 }
 753
 754
 755 /**
 756  * Load index from a file handle
 757  *
 758  * Throws a std::runtime_error exception on i/o error.
 759  * For more info, see TCImplementation::Save().
 760  */
 761 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
 762     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 763       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 764 {
 765     uchar verFlag = 0;
 766     if (std::fread(&verFlag, 1, 1, file) != 1)
 767         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
 768     if (verFlag != TCImplementation::versionFlag)
 769         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
 770
 771     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 772         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
 773     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 774         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
 775 // FIXME samplerate can not be changed during load.
 776 //    if (this->samplerate == 0)
 777 //        this->samplerate = samplerate;
 778
 779     for(ulong i = 0; i < 256; ++i)
 780         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 781             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
 782
 783     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 784         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
 785
 786     alphabetrank = static_sequence::load(file);
 787     sampled = static_bitsequence::load(file);
 788     suffixes = new BlockArray(file);
 789     suffixDocId = new BlockArray(file);
 790
 791     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 792         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
 793     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 794         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
 795
 796     Doc = static_sequence::load(file);
 797     textStorage = new TextStorage(file);
 798
 799     // FIXME Construct data structures with new samplerate
 800     //maketables();
 801 }
 802
 803
 804
 805 /**
 806  * Rest of the functions follow...
 807  */
 808 ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
 809 {
 810     int c;
 811     while (*sp<=*ep && i>=1)
 812     {
 813         c = (int)pattern[--i];
 814         *sp = C[c]+alphabetrank->rank(c,*sp-1);
 815         *ep = C[c]+alphabetrank->rank(c,*ep)-1;
 816     }
 817     if (*sp<=*ep)
 818         return *ep - *sp + 1;
 819     else
 820         return 0;
 821 }
 822
 823
 824 ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
 825 {
 826     if (sp>ep) return 0;
 827     if (j == 0)
 828     {
 829         result.push_back(std::make_pair(sp,ep));
 830         return ep-sp+1;
 831     }
 832     int c;
 833     ulong spnew;
 834     ulong epnew;
 835     int knew;
 836     ulong sum=0;
 837     if (k==0)
 838     {
 839         sum = searchPrefix(pattern, j, &sp, &ep);
 840         if (sp<=ep)
 841             result.push_back(std::make_pair(sp, ep));
 842         return sum;
 843     }
 844     vector<int> chars = alphabetrank->accessAll(sp, ep);
 845     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 846     {
 847         if (*it == 0)
 848             continue; // skip '\0'
 849         c = *it;
 850         spnew = C[c]+alphabetrank->rank(c,sp-1);
 851         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 852         if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
 853         if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
 854     }
 855     return sum;
 856 }
 857
 858 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
 859 ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
 860 {
 861     cout << "j = " << j << ", k = " << k << ", d:";
 862     for (unsigned i = 0; i < m+1; ++i)
 863         cout << " " << d[i];
 864     cout << endl;
 865
 866     if (d[m]<=k) // range of suffixes with at most k-errors found
 867     {
 868         if (sp<=ep)
 869             result.push_back(std::make_pair(sp, ep));
 870         return (sp<=ep)?ep-sp+1:0ul;
 871     }
 872     if (sp>ep || j==0)
 873         return 0;
 874     ulong *dnew = new ulong[m+1];
 875     int c;
 876     ulong spnew;
 877     ulong p,lowerbound;
 878     ulong epnew;
 879     ulong sum=0;
 880     vector<int> chars = alphabetrank->accessAll(sp, ep);
 881     for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
 882     {
 883         if (*it == 0)
 884             continue; // skip '\0'
 885         c = *it;
 886         spnew = C[c]+alphabetrank->rank(c,sp-1);
 887         epnew = C[c]+alphabetrank->rank(c,ep)-1;
 888         if (spnew>epnew) continue;
 889         dnew[0]=m+k-j+1;
 890         lowerbound=k+1;
 891         for (p=1; p<=m; p++) {
 892             dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
 893             if (dnew[p]<lowerbound)
 894                 lowerbound = dnew[p];
 895         }
 896         if (lowerbound<=k)
 897             sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
 898     }
 899     delete [] dnew;
 900     return sum;
 901 }
 902
 903
 904 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 905 {
 906     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 907     int c = (int)pattern[m-1];
 908     TextPosition i=m-1;
 909     TextPosition sp = C[c];
 910     TextPosition ep = C[c+1]-1;
 911     while (sp<=ep && i>=1)
 912     {
 913 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 914         c = (int)pattern[--i];
 915         sp = C[c]+alphabetrank->rank(c,sp-1);
 916         ep = C[c]+alphabetrank->rank(c,ep)-1;
 917     }
 918     *spResult = sp;
 919     *epResult = ep;
 920     if (sp<=ep)
 921         return ep - sp + 1;
 922     else
 923         return 0;
 924 }
 925
 926 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 927 {
 928     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 929     int c = (int)pattern[m-1];
 930     assert(c == 0); // Start from endmarkers
 931     TextPosition i=m-1;
 932     TextPosition sp = begin;
 933     TextPosition ep = end;
 934     while (sp<=ep && i>=1)
 935     {
 936 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 937         c = (int)pattern[--i];
 938         sp = C[c]+alphabetrank->rank(c,sp-1);
 939         ep = C[c]+alphabetrank->rank(c,ep)-1;
 940     }
 941     *spResult = sp;
 942     *epResult = ep;
 943     if (sp<=ep)
 944         return ep - sp + 1;
 945     else
 946         return 0;
 947 }
 948
 949
 950 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 951 {
 952     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 953     uint c = (int)pattern[m-1];
 954     TextPosition i=m-1;
 955     TextPosition sp = 1;
 956     TextPosition ep = C[c+1]-1;
 957     while (sp<=ep && i>=1)
 958     {
 959 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 960         c = (int)pattern[--i];
 961         uint result = alphabetrank->rankLessThan(c,ep);
 962         if (result == ~0u)
 963             ep = 0;
 964         else
 965             ep = C[c]+result-1;
 966     }
 967     *spResult = sp;
 968     *epResult = ep;
 969     if (sp<=ep)
 970         return ep - sp + 1;
 971     else
 972         return 0;
 973 }
 974
 975
 976 TCImplementation::~TCImplementation() {
 977     delete alphabetrank;
 978     delete sampled;
 979     delete suffixes;
 980     delete suffixDocId;
 981     delete Doc;
 982     delete textStorage;
 983 }
 984
 985 void TCImplementation::makewavelet(uchar *bwt)
 986 {
 987     ulong i, min = 0,
 988              max;
 989     for (i=0;i<256;i++)
 990         C[i]=0;
 991     for (i=0;i<n;++i)
 992         C[(int)bwt[i]]++;
 993     for (i=0;i<256;i++)
 994         if (C[i]>0) {min = i; break;}
 995     for (i=255;i>=min;--i)
 996         if (C[i]>0) {max = i; break;}
 997
 998     ulong prev=C[0], temp;
 999     C[0]=0;
1000     for (i=1;i<256;i++) {
1001         temp = C[i];
1002         C[i]=C[i-1]+prev;
1003         prev = temp;
1004     }
1005 //    this->codetable = node::makecodetable(bwt,n);
1006 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1007 //    delete [] bwt;
1008     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1009 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1010
1011 #ifdef DEBUG_MEMUSAGE
1012     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1013     HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1014 #endif
1015
1016     alphabet_mapper * am = new alphabet_mapper_none();
1017     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1018     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1019     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1020     delete bmb;
1021     bwt = 0; // already deleted
1022
1023 #ifdef DEBUG_MEMUSAGE
1024     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1025     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1026 #endif
1027 }
1028
1029 void TCImplementation::maketables()
1030 {
1031     // Calculate BWT end-marker position (of last inserted text)
1032     {
1033         ulong i = 0;
1034         uint alphabetrank_i_tmp = 0;
1035         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1036         while (c != '\0')
1037         {
1038             i = C[c]+alphabetrank_i_tmp-1;
1039             c = alphabetrank->access(i, alphabetrank_i_tmp);
1040         }
1041
1042         this->bwtEndPos = i;
1043     }
1044
1045     // Build up array for text starting positions
1046     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1047     (*textStartPos)[0] = 0;
1048
1049     // Mapping from end-markers to doc ID's:
1050     uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
1051
1052     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1053     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1054         sampledpositions[i] = 0;
1055
1056     ulong x,p=bwtEndPos;
1057     ulong sampleCount = 0;
1058     // Keeping track of text position of prev. end-marker seen
1059     ulong posOfSuccEndmarker = n;
1060     DocId textId = numberOfTexts;
1061     ulong ulongmax = 0;
1062     ulongmax--;
1063     uint alphabetrank_i_tmp =0;
1064
1065     /**
1066      * First pass: populate tables textStartPos and sampledpositions.
1067      */
1068     for (ulong i=n-1;i<ulongmax;i--) {
1069         // i substitutes SA->GetPos(i)
1070         x=(i==n-1)?0:i+1;
1071
1072         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1073             set_field(sampledpositions,1,p,1);
1074             sampleCount ++;
1075         }
1076
1077         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1078         if (c == '\0')
1079         {
1080             --textId;
1081
1082             // Record the order of end-markers in BWT:
1083             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1084             endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1085
1086             // Store text length and text start position:
1087             if (textId < (DocId)numberOfTexts - 1)
1088             {
1089                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1090                 posOfSuccEndmarker = x;
1091             }
1092
1093             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1094             p = textId; // Correct LF-mapping to the last char of the previous text.
1095         }
1096         else // Now c != '\0', do LF-mapping:
1097             p = C[c]+alphabetrank_i_tmp-1;
1098     }
1099     assert(textId == 0);
1100
1101     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1102     delete [] sampledpositions;
1103     ulong sampleLength = sampled->rank1(n-1);
1104     assert(sampleCount == sampleLength);
1105
1106     // Suffixes store an offset from the text start position
1107     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1108     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1109
1110     p=bwtEndPos;
1111     textId = numberOfTexts;
1112
1113     TextStorageBuilder tsbuilder(n);
1114
1115     /**
1116      * Second pass: populate tables suffixes and suffixDocId.
1117      */
1118     for (ulong i=n-1;i<ulongmax;i--) {
1119         x=(i==n-1)?0:i+1;
1120
1121         if (sampled->access(p)) {
1122             ulong j = sampled->rank1(p)-1;
1123
1124             (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1125
1126             // calculate offset from text start:
1127             (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1128         }
1129
1130         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1131         tsbuilder[i] = c;
1132
1133         if (c == '\0')
1134         {
1135             --textId;
1136             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1137             p = textId; // Correct LF-mapping to the last char of the previous text.
1138         }
1139         else // Now c != '\0', do LF-mapping:
1140             p = C[c]+alphabetrank_i_tmp-1;
1141     }
1142     assert(textId == 0);
1143     delete textStartPos;
1144
1145     textStorage = tsbuilder.InitTextStorage();
1146
1147 #ifdef DEBUG_MEMUSAGE
1148     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1149     HeapProfiler::ResetMaxHeapConsumption();
1150 #endif
1151
1152     alphabet_mapper * am = new alphabet_mapper_none();
1153     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1154     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1155     delete bmb;
1156     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1157
1158 #ifdef DEBUG_MEMUSAGE
1159     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1160 #endif
1161 }
1162
1163
1164 /**
1165  * Finds document identifier for given text position
1166  *
1167  * Starting text position of the document is stored into second parameter.
1168  * Binary searching on text starting positions.
1169  */
1170 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1171 {
1172     assert(i < n);
1173
1174     DocId a = 0;
1175     DocId b = numberOfTexts - 1;
1176     while (a < b)
1177     {
1178         DocId c = a + (b - a)/2;
1179         if ((*textStartPos)[c] > i)
1180             b = c - 1;
1181         else if ((*textStartPos)[c+1] > i)
1182             return c;
1183         else
1184             a = c + 1;
1185     }
1186
1187     assert(a < (DocId)numberOfTexts);
1188     assert(i >= (*textStartPos)[a]);
1189     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1190     return a;
1191 }
1192
1193
1194 } // namespace SXSI
1195