TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 //#define DEBUG_MEMUSAGE
  23 #ifdef DEBUG_MEMUSAGE
  24 #include "HeapProfiler.h" // FIXME remove
  25 #endif
  26
  27 #include <iostream>
  28 #include <map>
  29 #include <set>
  30 #include <vector>
  31 #include <utility>
  32 #include <stdexcept>
  33 #include <cassert>
  34 #include <cstring> // For strlen()
  35 using std::vector;
  36 using std::pair;
  37 using std::make_pair;
  38 using std::map;
  39
  40 namespace SXSI
  41 {
  42
  43 // Save file version info
  44 const uchar TCImplementation::versionFlag = 6;
  45
  46 /**
  47  * Constructor inits an empty dynamic FM-index.
  48  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  49  */
  50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
  51     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  52       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
  53 {
  54     makewavelet(bwt); // Deletes bwt!
  55     bwt = 0;
  56
  57     // Make sampling tables
  58     maketables();
  59 }
  60
  61 bool TCImplementation::EmptyText(DocId k) const
  62 {
  63     assert(k < (DocId)numberOfTexts);
  64     return false; // Empty texts are not indexed
  65 }
  66
  67 uchar * TCImplementation::GetText(DocId k) const
  68 {
  69     assert(k < (DocId)numberOfTexts);
  70
  71     return textStorage->GetText(k);
  72 /*    TextPosition i = k;
  73
  74     string result;
  75     // Reserve average string length to avoid reallocs
  76     result.reserve(n/numberOfTexts);
  77
  78     uchar c = alphabetrank->access(i);
  79     while (c != '\0')
  80     {
  81         result.push_back(c);
  82         i = C[c]+alphabetrank->rank(c,i)-1;
  83
  84         c = alphabetrank->access(i); // "next" char.
  85     }
  86
  87     // Convert to uchar (FIXME return string?)
  88     i = result.size();
  89     uchar* res = new uchar[i+1];
  90     res[i] = '\0';
  91     for (ulong j = 0; j < i; ++j)
  92         res[i-j-1] = result[j];
  93         return res;*/
  94 }
  95 /*
  96  * Not supported
  97 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
  98 {
  99     assert(k < (DocId)numberOfTexts);
 100     assert(j < (*textLength)[k]);
 101     assert(i <= j);
 102
 103     ulong textRank = 0;
 104
 105     // Start position of k'th text
 106     ulong start = (*textStartPos)[k];
 107
 108     return Substring(i + start, j-i+1);
 109     }*/
 110
 111 /******************************************************************
 112  * Existential queries
 113  */
 114 bool TCImplementation::IsPrefix(uchar const * pattern) const
 115 {
 116     TextPosition m = strlen((char *)pattern);
 117     if (m == 0)
 118         return true;
 119
 120     TextPosition sp = 0, ep = 0;
 121     Search(pattern, m, &sp, &ep);
 122
 123     // Check for end-marker(s) in result interval
 124     if (CountEndmarkers(sp, ep))
 125         return true;
 126     return false;
 127 }
 128
 129 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 130 {
 131     TextPosition m = strlen((char *)pattern);
 132     if (m == 0)
 133         return true;
 134
 135     TextPosition sp = 0, ep = 0;
 136     Search(pattern, m, &sp, &ep);
 137
 138     // Check for end-marker(s) in result interval
 139     if (CountEndmarkers(sp, ep, begin, end))
 140         return true;
 141     return false;
 142 }
 143
 144
 145 bool TCImplementation::IsSuffix(uchar const *pattern) const
 146 {
 147     // Here counting is as fast as IsSuffix():
 148     if (CountSuffix(pattern) > 0)
 149         return true;
 150     return false;
 151 }
 152
 153 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 154 {
 155     // Here counting is as fast as IsSuffix():
 156     if (CountSuffix(pattern, begin, end) > 0)
 157         return true;
 158     return false;
 159 }
 160
 161 bool TCImplementation::IsEqual(uchar const *pattern) const
 162 {
 163     TextPosition m = std::strlen((char *)pattern);
 164     if (m == 0)
 165         return false; // No empty texts exists
 166
 167     TextPosition sp = 0, ep = 0;
 168     // Match including end-marker
 169     Search(pattern, m+1, &sp, &ep);
 170
 171     // Check for end-marker(s) in result interval
 172     if (CountEndmarkers(sp, ep))
 173         return true;
 174     return false;
 175 }
 176
 177 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 178 {
 179     TextPosition m = std::strlen((char *)pattern);
 180     if (m == 0)
 181         return false; // No empty texts exists
 182
 183     TextPosition sp = 0, ep = 0;
 184     // Match including end-marker
 185     Search(pattern, m+1, &sp, &ep, begin, end);
 186
 187     // Check for end-marker(s) in result interval
 188     if (CountEndmarkers(sp, ep))
 189         return true;
 190     return false;
 191 }
 192
 193 bool TCImplementation::IsContains(uchar const * pattern) const
 194 {
 195     TextPosition m = strlen((char *)pattern);
 196     if (m == 0)
 197         return true;
 198
 199     TextPosition sp = 0, ep = 0;
 200     // Just check if pattern exists somewhere
 201     ulong count = Search(pattern, m, &sp, &ep);
 202
 203     if (count > 0)
 204         return true;
 205     return false;
 206 }
 207
 208 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 209 {
 210     // Here counting is as fast as existential querying
 211     if (CountContains(pattern, begin, end) > 0)
 212         return true; // FIXME No need to filter result set
 213     return false;
 214 }
 215
 216 bool TCImplementation::IsLessThan(uchar const * pattern) const
 217 {
 218     if (CountLessThan(pattern) > 0)
 219         return true;
 220     return false;
 221 }
 222
 223 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 224 {
 225     if (CountLessThan(pattern, begin, end) > 0)
 226         return true;
 227     return false;
 228 }
 229
 230 /******************************************************************
 231  * Counting queries
 232  */
 233 ulong TCImplementation::Count(uchar const * pattern) const
 234 {
 235     TextPosition m = strlen((char *)pattern);
 236     if (m == 0)
 237         return 0;
 238
 239     TextPosition sp = 0, ep = 0;
 240     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 241     return count;
 242 }
 243
 244 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 245 {
 246     TextPosition m = strlen((char *)pattern);
 247     if (m == 0)
 248         return numberOfTexts;
 249
 250     TextPosition sp = 0, ep = 0;
 251     Search(pattern, m, &sp, &ep);
 252
 253     // Count end-markers in result interval
 254     return CountEndmarkers(sp, ep);
 255 }
 256
 257 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 258 {
 259     TextPosition m = strlen((char *)pattern);
 260     if (m == 0)
 261         return numberOfTexts;
 262
 263     TextPosition sp = 0, ep = 0;
 264     Search(pattern, m, &sp, &ep);
 265
 266     // Count end-markers in result interval
 267     return CountEndmarkers(sp, ep, begin, end);
 268 }
 269
 270 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 271 {
 272     TextPosition m = strlen((char *)pattern);
 273     if (m == 0)
 274         return numberOfTexts;
 275
 276     TextPosition sp = 0, ep = 0;
 277     // Search with end-marker
 278     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 279
 280     return count;
 281 }
 282
 283 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 284 {
 285     TextPosition m = strlen((char *)pattern);
 286     if (m == 0)
 287         return numberOfTexts;
 288
 289     TextPosition sp = 0, ep = 0;
 290     // Search with end-marker
 291     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 292
 293     return count;
 294 }
 295
 296 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 297 {
 298     TextPosition m = strlen((char const *)pattern);
 299     if (m == 0)
 300         return 0; // No empty texts.
 301
 302     TextPosition sp = 0, ep = 0;
 303     // Match including end-marker
 304     Search(pattern, m+1, &sp, &ep);
 305
 306     // Count end-markers in result interval
 307     return CountEndmarkers(sp, ep);
 308 }
 309
 310 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 311 {
 312     TextPosition m = strlen((char const *)pattern);
 313     if (m == 0)
 314         return 0; // No empty texts.
 315
 316     TextPosition sp = 0, ep = 0;
 317     // Match including end-marker
 318     Search(pattern, m+1, &sp, &ep, begin, end);
 319
 320     // Count end-markers in result interval
 321     return CountEndmarkers(sp, ep); // Already within [begin, end]
 322 }
 323
 324 unsigned TCImplementation::CountContains(uchar const * pattern) const
 325 {
 326     TextPosition m = strlen((char const *)pattern);
 327     if (m == 0)
 328         return numberOfTexts; // Total number of texts.
 329
 330     // Here counting is as slow as fetching the result set
 331     // because we have to filter out occ's that fall within same document.
 332     TextCollection::document_result result = Contains(pattern);
 333     return result.size();
 334 }
 335
 336 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 337 {
 338     TextPosition m = strlen((char const *)pattern);
 339     if (m == 0)
 340         return numberOfTexts; // Total number of texts.
 341
 342     // Here counting is as slow as fetching the result set
 343     // because we have to filter out occ's that fall within same document.
 344     TextCollection::document_result result = Contains(pattern, begin, end);
 345     return result.size();
 346 }
 347
 348 // Less than or equal
 349 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 350 {
 351     TextPosition m = strlen((char const *)pattern);
 352     if (m == 0)
 353         return 0; // No empty texts.
 354
 355     TextPosition sp = 0, ep = 0;
 356     SearchLessThan(pattern, m, &sp, &ep);
 357
 358     // Count end-markers in result interval
 359     return CountEndmarkers(sp, ep);
 360 }
 361
 362 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 363 {
 364     TextPosition m = strlen((char const *)pattern);
 365     if (m == 0)
 366         return 0; // No empty texts.
 367
 368     TextPosition sp = 0, ep = 0;
 369     SearchLessThan(pattern, m, &sp, &ep);
 370
 371     // Count end-markers in result interval
 372     return CountEndmarkers(sp, ep, begin, end);
 373 }
 374
 375 /**
 376  * Document reporting queries
 377  */
 378 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 379 {
 380     TextPosition m = strlen((char *)pattern);
 381     if (m == 0)
 382         return TextCollection::document_result(); // FIXME Should return all 1...k
 383
 384     TextPosition sp = 0, ep = 0;
 385     Search(pattern, m, &sp, &ep);
 386
 387     // Iterate through end-markers in [sp,ep]:
 388     return EnumerateEndmarkers(sp, ep);
 389 }
 390
 391 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 392 {
 393     TextPosition m = strlen((char *)pattern);
 394     if (m == 0)
 395         return TextCollection::document_result(); // FIXME Should return all 1...k
 396
 397     TextPosition sp = 0, ep = 0;
 398     Search(pattern, m, &sp, &ep);
 399
 400     // Return end-markers in [sp,ep] and [begin, end]:
 401     return EnumerateEndmarkers(sp, ep, begin, end);
 402 }
 403
 404 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 405 {
 406     TextPosition m = strlen((char *)pattern);
 407     if (m == 0)
 408         return TextCollection::document_result(); // FIXME Should return all 1...k
 409
 410     TextPosition sp = 0, ep = 0;
 411     // Search with end-marker
 412     Search(pattern, m+1, &sp, &ep);
 413
 414     TextCollection::document_result result;
 415     result.reserve(ep-sp+1); // Try to avoid reallocation.
 416
 417     // Check each occurrence
 418     for (; sp <= ep; ++sp)
 419     {
 420         TextPosition i = sp;
 421
 422         uchar c = alphabetrank->access(i);
 423         while (c != '\0' && !sampled->access(i))
 424         {
 425             i = C[c]+alphabetrank->rank(c,i)-1;
 426             c = alphabetrank->access(i);
 427         }
 428         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 429
 430         if (c == '\0')
 431         {
 432             // Rank among the end-markers in BWT
 433             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 434             result.push_back(Doc->access(endmarkerRank));
 435         }
 436         else // Sampled position
 437         {
 438             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 439             result.push_back(docId);
 440         }
 441     }
 442
 443     return result;
 444 }
 445
 446 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 447 {
 448     TextPosition m = strlen((char *)pattern);
 449     if (m == 0)
 450         return TextCollection::document_result(); // FIXME Should return all 1...k
 451
 452     TextPosition sp = 0, ep = 0;
 453     // Search with end-marker
 454     Search(pattern, m+1, &sp, &ep, begin, end);
 455
 456     TextCollection::document_result result;
 457     result.reserve(ep-sp+1); // Try to avoid reallocation.
 458
 459     // Check each occurrence, already within [begin, end]
 460     for (; sp <= ep; ++sp)
 461     {
 462         TextPosition i = sp;
 463
 464         uchar c = alphabetrank->access(i);
 465         while (c != '\0' && !sampled->access(i))
 466         {
 467             i = C[c]+alphabetrank->rank(c,i)-1;
 468             c = alphabetrank->access(i);
 469         }
 470         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 471
 472         if (c == '\0')
 473         {
 474             // Rank among the end-markers in BWT
 475             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 476             result.push_back(Doc->access(endmarkerRank));
 477         }
 478         else // Sampled position
 479         {
 480             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 481             result.push_back(docId);
 482         }
 483     }
 484
 485     return result;
 486 }
 487
 488
 489 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 490 {
 491     TextPosition m = strlen((char const *)pattern);
 492     if (m == 0)
 493         return TextCollection::document_result(); // FIXME Should return all empty texts
 494
 495     TextPosition sp = 0, ep = 0;
 496     // Match including end-marker
 497     Search(pattern, m+1, &sp, &ep);
 498
 499     // Report end-markers in result interval
 500     return EnumerateEndmarkers(sp, ep);
 501 }
 502
 503 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 504 {
 505     TextPosition m = strlen((char const *)pattern);
 506     if (m == 0)
 507         return TextCollection::document_result(); // FIXME Should return all empty texts
 508
 509     TextPosition sp = 0, ep = 0;
 510     // Match including end-marker
 511     Search(pattern, m+1, &sp, &ep, begin, end);
 512
 513     // Report end-markers in result interval
 514     return EnumerateEndmarkers(sp, ep, begin, end);
 515 }
 516
 517
 518 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 519 {
 520     TextPosition m = strlen((char *)pattern);
 521     if (m == 0)
 522         return TextCollection::document_result();
 523
 524     TextPosition sp = 0, ep = 0;
 525     // Search all occurrences
 526     Search(pattern, m, &sp, &ep);
 527
 528     // We want unique document indentifiers, using std::set to collect them
 529     std::set<DocId> resultSet;
 530
 531     // Check each occurrence
 532     for (; sp <= ep; ++sp)
 533     {
 534         TextPosition i = sp;
 535         uchar c = alphabetrank->access(i);
 536         while (c != '\0' && !sampled->access(i))
 537         {
 538             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 539             c = alphabetrank->access(i);
 540         }
 541         if (c == '\0')
 542         {
 543             // Rank among the end-markers in BWT
 544             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 545             resultSet.insert(Doc->access(endmarkerRank));
 546         }
 547         else
 548         {
 549             DocId di = (*suffixDocId)[sampled->rank1(i)-1];
 550             assert((unsigned)di < numberOfTexts);
 551             resultSet.insert(di);
 552         }
 553     }
 554
 555     // Convert std::set to std::vector
 556     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 557     return result;
 558 }
 559
 560 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 561 {
 562     TextPosition m = strlen((char *)pattern);
 563     if (m == 0)
 564         return TextCollection::document_result();
 565
 566     TextPosition sp = 0, ep = 0;
 567     // Search all occurrences
 568     Search(pattern, m, &sp, &ep);
 569
 570     // We want unique document indentifiers, using std::set to collect them
 571     std::set<DocId> resultSet;
 572
 573     // Check each occurrence
 574     for (; sp <= ep; ++sp)
 575     {
 576         TextPosition i = sp;
 577         uchar c = alphabetrank->access(i);
 578         while (c != '\0' && !sampled->access(i))
 579         {
 580             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 581             c = alphabetrank->access(i);
 582         }
 583         if (c == '\0')
 584         {
 585             // Rank among the end-markers in BWT
 586             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 587             DocId docId = Doc->access(endmarkerRank);
 588             if (docId >= begin && docId <= end)
 589                 resultSet.insert(docId);
 590         }
 591         else
 592         {
 593             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 594             assert((unsigned)docId < numberOfTexts);
 595             if (docId >= begin && docId <= end)
 596                 resultSet.insert(docId);
 597         }
 598     }
 599
 600     // Convert std::set to std::vector
 601     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 602     return result;
 603 }
 604
 605 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 606 {
 607     TextPosition m = strlen((char *)pattern);
 608     if (m == 0)
 609         return TextCollection::document_result(); // empty result set
 610
 611     TextPosition sp = 0, ep = 0;
 612     SearchLessThan(pattern, m, &sp, &ep);
 613
 614     // Report end-markers in result interval
 615     return EnumerateEndmarkers(sp, ep);
 616 }
 617
 618 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 619 {
 620     TextPosition m = strlen((char *)pattern);
 621     if (m == 0)
 622         return TextCollection::document_result(); // empty result set
 623
 624     TextPosition sp = 0, ep = 0;
 625     SearchLessThan(pattern, m, &sp, &ep);
 626
 627     // Iterate through end-markers in [sp,ep] and [begin, end]:
 628     return EnumerateEndmarkers(sp, ep, begin, end);
 629 }
 630
 631 /**
 632  * Full result set queries
 633  */
 634 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 635 {
 636     TextPosition m = strlen((char *)pattern);
 637     if (m == 0)
 638         return full_result(); // FIXME Throw exception?
 639
 640     TextPosition sp = 0, ep = 0;
 641     // Search all occurrences
 642     Search(pattern, m, &sp, &ep);
 643
 644     full_result result;
 645     result.reserve(ep-sp+1); // Try to avoid reallocation.
 646
 647     // Report each occurrence
 648     for (; sp <= ep; ++sp)
 649     {
 650         TextPosition i = sp;
 651         TextPosition dist = 0;
 652         uchar c = alphabetrank->access(i);
 653         while (c != '\0' && !sampled->access(i))
 654         {
 655             i = C[c]+alphabetrank->rank(c,i)-1;
 656             c = alphabetrank->access(i);
 657             ++ dist;
 658         }
 659         if (c == '\0')
 660         {
 661             // Rank among the end-markers in BWT
 662             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 663             DocId docId = Doc->access(endmarkerRank);
 664             result.push_back(make_pair(docId, dist));
 665         }
 666         else
 667         {
 668             TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 669             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 670
 671             result.push_back(make_pair(docId, textPos));
 672         }
 673     }
 674
 675     return result;
 676 }
 677
 678 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 679 {
 680     TextPosition m = strlen((char *)pattern);
 681     if (m == 0)
 682         return full_result(); // FIXME Throw exception?
 683
 684     TextPosition sp = 0, ep = 0;
 685     // Search all occurrences
 686     Search(pattern, m, &sp, &ep);
 687
 688     full_result result;
 689     result.reserve(ep-sp+1); // Try to avoid reallocation.
 690
 691     // Report each occurrence
 692     for (; sp <= ep; ++sp)
 693     {
 694         TextPosition i = sp;
 695         TextPosition dist = 0;
 696         uchar c = alphabetrank->access(i);
 697         while (c != '\0' && !sampled->access(i))
 698         {
 699             i = C[c]+alphabetrank->rank(c,i)-1;
 700             c = alphabetrank->access(i);
 701             ++ dist;
 702         }
 703         if (c == '\0')
 704         {
 705             // Rank among the end-markers in BWT
 706             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 707
 708             // End-marker that we found belongs to the "preceeding" doc in collection:
 709             DocId docId = Doc->access(endmarkerRank);
 710             if (docId >= begin && docId <= end)
 711                 result.push_back(make_pair(docId, dist));
 712         }
 713         else
 714         {
 715             TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
 716             DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
 717
 718             if (docId >= begin && docId <= end)
 719                 result.push_back(make_pair(docId, textPos));
 720         }
 721     }
 722
 723     return result;
 724 }
 725
 726
 727 /**
 728  * Save index to a file handle
 729  *
 730  * Throws a std::runtime_error exception on i/o error.
 731  * First byte that is saved represents the version number of the save file.
 732  * In version 2 files, the data fields are:
 733  *  uchar versionFlag;
 734     TextPosition n;
 735     unsigned samplerate;
 736     unsigned C[256];
 737     TextPosition bwtEndPos;
 738     static_sequence * alphabetrank;
 739     BSGAP *sampled;
 740     BlockArray *suffixes;
 741     BlockArray *suffixDocId;
 742     unsigned numberOfTexts;
 743     ulong maxTextLength;
 744     static_sequence *docId;
 745  */
 746 void TCImplementation::Save(FILE *file) const
 747 {
 748     // Saving version info:
 749     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 750         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 751
 752     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 753         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 754     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 755         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 756
 757     for(ulong i = 0; i < 256; ++i)
 758         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 759             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 760
 761     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 762         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 763
 764     alphabetrank->save(file);
 765     sampled->save(file);
 766     suffixes->Save(file);
 767     suffixDocId->Save(file);
 768
 769     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 770         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 771     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 772         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
 773
 774     Doc->save(file);
 775     textStorage->Save(file);
 776     fflush(file);
 777 }
 778
 779
 780 /**
 781  * Load index from a file handle
 782  *
 783  * Throws a std::runtime_error exception on i/o error.
 784  * For more info, see TCImplementation::Save().
 785  */
 786 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
 787     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
 788       suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
 789 {
 790     uchar verFlag = 0;
 791     if (std::fread(&verFlag, 1, 1, file) != 1)
 792         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
 793     if (verFlag != TCImplementation::versionFlag)
 794         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
 795
 796     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 797         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
 798     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 799         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
 800 // FIXME samplerate can not be changed during load.
 801 //    if (this->samplerate == 0)
 802 //        this->samplerate = samplerate;
 803
 804     for(ulong i = 0; i < 256; ++i)
 805         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 806             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
 807
 808     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 809         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
 810
 811     alphabetrank = static_sequence::load(file);
 812     sampled = static_bitsequence::load(file);
 813     suffixes = new BlockArray(file);
 814     suffixDocId = new BlockArray(file);
 815
 816     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 817         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
 818     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 819         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
 820
 821     Doc = static_sequence::load(file);
 822     textStorage = new TextStorage(file);
 823
 824     // FIXME Construct data structures with new samplerate
 825     //maketables();
 826 }
 827
 828
 829
 830 /**
 831  * Rest of the functions follow...
 832  */
 833
 834
 835
 836 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 837 {
 838     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 839     int c = (int)pattern[m-1];
 840     TextPosition i=m-1;
 841     TextPosition sp = C[c];
 842     TextPosition ep = C[c+1]-1;
 843     while (sp<=ep && i>=1)
 844     {
 845 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 846         c = (int)pattern[--i];
 847         sp = C[c]+alphabetrank->rank(c,sp-1);
 848         ep = C[c]+alphabetrank->rank(c,ep)-1;
 849     }
 850     *spResult = sp;
 851     *epResult = ep;
 852     if (sp<=ep)
 853         return ep - sp + 1;
 854     else
 855         return 0;
 856 }
 857
 858 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
 859 {
 860     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 861     int c = (int)pattern[m-1];
 862     assert(c == 0); // Start from endmarkers
 863     TextPosition i=m-1;
 864     TextPosition sp = begin;
 865     TextPosition ep = end;
 866     while (sp<=ep && i>=1)
 867     {
 868 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 869         c = (int)pattern[--i];
 870         sp = C[c]+alphabetrank->rank(c,sp-1);
 871         ep = C[c]+alphabetrank->rank(c,ep)-1;
 872     }
 873     *spResult = sp;
 874     *epResult = ep;
 875     if (sp<=ep)
 876         return ep - sp + 1;
 877     else
 878         return 0;
 879 }
 880
 881
 882 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 883 {
 884     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 885     uint c = (int)pattern[m-1];
 886     TextPosition i=m-1;
 887     TextPosition sp = 1;
 888     TextPosition ep = C[c+1]-1;
 889     while (sp<=ep && i>=1)
 890     {
 891 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 892         c = (int)pattern[--i];
 893         uint result = alphabetrank->rankLessThan(c,ep);
 894         if (result == ~0u)
 895             ep = 0;
 896         else
 897             ep = C[c]+result-1;
 898     }
 899     *spResult = sp;
 900     *epResult = ep;
 901     if (sp<=ep)
 902         return ep - sp + 1;
 903     else
 904         return 0;
 905 }
 906
 907
 908 TCImplementation::~TCImplementation() {
 909     delete alphabetrank;
 910     delete sampled;
 911     delete suffixes;
 912     delete suffixDocId;
 913     delete Doc;
 914     delete textStorage;
 915 }
 916
 917 void TCImplementation::makewavelet(uchar *bwt)
 918 {
 919     ulong i, min = 0,
 920              max;
 921     for (i=0;i<256;i++)
 922         C[i]=0;
 923     for (i=0;i<n;++i)
 924         C[(int)bwt[i]]++;
 925     for (i=0;i<256;i++)
 926         if (C[i]>0) {min = i; break;}
 927     for (i=255;i>=min;--i)
 928         if (C[i]>0) {max = i; break;}
 929
 930     ulong prev=C[0], temp;
 931     C[0]=0;
 932     for (i=1;i<256;i++) {
 933         temp = C[i];
 934         C[i]=C[i-1]+prev;
 935         prev = temp;
 936     }
 937 //    this->codetable = node::makecodetable(bwt,n);
 938 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 939 //    delete [] bwt;
 940     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
 941 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 942
 943 #ifdef DEBUG_MEMUSAGE
 944     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 945     HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
 946 #endif
 947
 948     alphabet_mapper * am = new alphabet_mapper_none();
 949     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
 950     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
 951     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
 952     delete bmb;
 953     bwt = 0; // already deleted
 954
 955 #ifdef DEBUG_MEMUSAGE
 956     std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 957     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
 958 #endif
 959 }
 960
 961 void TCImplementation::maketables()
 962 {
 963     // Calculate BWT end-marker position (of last inserted text)
 964     {
 965         ulong i = 0;
 966         uint alphabetrank_i_tmp = 0;
 967         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
 968         while (c != '\0')
 969         {
 970             i = C[c]+alphabetrank_i_tmp-1;
 971             c = alphabetrank->access(i, alphabetrank_i_tmp);
 972         }
 973
 974         this->bwtEndPos = i;
 975     }
 976
 977     // Build up array for text starting positions
 978     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 979     (*textStartPos)[0] = 0;
 980
 981     // Mapping from end-markers to doc ID's:
 982     uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
 983
 984     uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
 985     for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
 986         sampledpositions[i] = 0;
 987
 988     ulong x,p=bwtEndPos;
 989     ulong sampleCount = 0;
 990     // Keeping track of text position of prev. end-marker seen
 991     ulong posOfSuccEndmarker = n;
 992     DocId textId = numberOfTexts;
 993     ulong ulongmax = 0;
 994     ulongmax--;
 995     uint alphabetrank_i_tmp =0;
 996
 997     /**
 998      * First pass: populate tables textStartPos and sampledpositions.
 999      */
1000     for (ulong i=n-1;i<ulongmax;i--) {
1001         // i substitutes SA->GetPos(i)
1002         x=(i==n-1)?0:i+1;
1003
1004         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1005             set_field(sampledpositions,1,p,1);
1006             sampleCount ++;
1007         }
1008
1009         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1010         if (c == '\0')
1011         {
1012             --textId;
1013
1014             // Record the order of end-markers in BWT:
1015             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1016             endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1017
1018             // Store text length and text start position:
1019             if (textId < (DocId)numberOfTexts - 1)
1020             {
1021                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1022                 posOfSuccEndmarker = x;
1023             }
1024
1025             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1026             p = textId; // Correct LF-mapping to the last char of the previous text.
1027         }
1028         else // Now c != '\0', do LF-mapping:
1029             p = C[c]+alphabetrank_i_tmp-1;
1030     }
1031     assert(textId == 0);
1032
1033     sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1034     delete [] sampledpositions;
1035     ulong sampleLength = sampled->rank1(n-1);
1036     assert(sampleCount == sampleLength);
1037
1038     // Suffixes store an offset from the text start position
1039     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1040     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1041
1042     p=bwtEndPos;
1043     textId = numberOfTexts;
1044
1045     TextStorageBuilder tsbuilder(n);
1046
1047     /**
1048      * Second pass: populate tables suffixes and suffixDocId.
1049      */
1050     for (ulong i=n-1;i<ulongmax;i--) {
1051         x=(i==n-1)?0:i+1;
1052
1053         if (sampled->access(p)) {
1054             ulong j = sampled->rank1(p)-1;
1055
1056             (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1057
1058             // calculate offset from text start:
1059             (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1060         }
1061
1062         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1063         tsbuilder[i] = c;
1064
1065         if (c == '\0')
1066         {
1067             --textId;
1068             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1069             p = textId; // Correct LF-mapping to the last char of the previous text.
1070         }
1071         else // Now c != '\0', do LF-mapping:
1072             p = C[c]+alphabetrank_i_tmp-1;
1073     }
1074     assert(textId == 0);
1075     delete textStartPos;
1076
1077     textStorage = tsbuilder.InitTextStorage();
1078
1079 #ifdef DEBUG_MEMUSAGE
1080     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1081     HeapProfiler::ResetMaxHeapConsumption();
1082 #endif
1083
1084     alphabet_mapper * am = new alphabet_mapper_none();
1085     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1086     Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1087     delete bmb;
1088     // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1089
1090 #ifdef DEBUG_MEMUSAGE
1091     std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1092 #endif
1093 }
1094
1095
1096 /**
1097  * Finds document identifier for given text position
1098  *
1099  * Starting text position of the document is stored into second parameter.
1100  * Binary searching on text starting positions.
1101  */
1102 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1103 {
1104     assert(i < n);
1105
1106     DocId a = 0;
1107     DocId b = numberOfTexts - 1;
1108     while (a < b)
1109     {
1110         DocId c = a + (b - a)/2;
1111         if ((*textStartPos)[c] > i)
1112             b = c - 1;
1113         else if ((*textStartPos)[c+1] > i)
1114             return c;
1115         else
1116             a = c + 1;
1117     }
1118
1119     assert(a < (DocId)numberOfTexts);
1120     assert(i >= (*textStartPos)[a]);
1121     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1122     return a;
1123 }
1124
1125
1126 } // namespace SXSI
1127