CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22
  23 // Conflicts with std::min and std::max
  24 #undef min
  25 #undef max
  26 #include "dcover/bwt.hpp"
  27
  28 //#include "HeapProfiler.h" // FIXME remove
  29
  30 #include <iostream>
  31 #include <queue>
  32 #include <set>
  33 #include <vector>
  34 #include <utility>
  35 #include <stdexcept>
  36 #include <cassert>
  37 #include <cstring> // For strlen()
  38 using std::vector;
  39 using std::pair;
  40 using std::make_pair;
  41 using std::map;
  42
  43 namespace SXSI
  44 {
  45
  46 // Save file version info
  47 const uchar CSA::versionFlag = 2;
  48
  49 ////////////////////////////////////////////////////////////////////////////
  50 // Class CSA::THuffAlphabetRank
  51 // FIXME Unused code
  52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  53     left = NULL;
  54     right = NULL;
  55     bitrank = NULL;
  56     ch = s[0];
  57     leaf = false;
  58     this->codetable = codetable;
  59
  60     bool *B = new bool[n];
  61     TextPosition sum=0,i;
  62     /*
  63     for (i=0; i< n; i++) {
  64        printf("%c:", (char)((int)s[i]-128));
  65        for (r=0;r<codetable[(int)s[i]].bits;r++)
  66           if (codetable[(int)s[i]].code & (1u <<r))
  67              printf("1");
  68           else printf("0");
  69        printf("\n");
  70     }
  71     printf("\n");
  72     if (level > 100) return;*/
  73     for (i=0;i<n;i++)
  74        if (codetable[(int)s[i]].code & (1u << level)) {
  75           B[i] = true;
  76           sum++;
  77        }
  78        else B[i] = false;
  79     if (sum==0 || sum==n) {
  80         delete [] B;
  81         leaf = true;
  82         return;
  83     }
  84     uchar *sfirst, *ssecond;
  85     //if (n-sum > 0)
  86         sfirst = new uchar[n-sum];
  87     //if (sum > 0)
  88         ssecond = new uchar[sum];
  89     unsigned j=0,k=0;
  90    for (i=0;i<n;i++)
  91         if (B[i]) ssecond[k++] = s[i];
  92         else sfirst[j++] = s[i];
  93     ulong *Binbits = new ulong[n/W+1];
  94     for (i=0;i<n;i++)
  95         Tools::SetField(Binbits,1,i,B[i]);
  96     delete [] B;
  97     bitrank = new BitRank(Binbits,n,true);
  98     //if (j > 0) {
  99         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
 100         delete [] sfirst;
 101     //}
 102     //if (k>0) {
 103         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
 104         delete [] ssecond;
 105     //}
 106 }
 107
 108
 109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 110     // testing that the code works correctly
 111     int C[256];
 112     unsigned i,j;
 113     bool correct=true;
 114     for (j=0;j<256;j++)
 115         C[j] = 0;
 116     for (i=0;i<n;i++) {
 117         C[(int)s[i]]++;
 118         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 119         correct = false;
 120         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 121         }
 122     }
 123     return correct;
 124 }
 125
 126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 127     if (left!=NULL) delete left;
 128     if (right!=NULL) delete right;
 129     if (bitrank!=NULL)
 130         delete bitrank;
 131 }
 132
 133
 134 ////////////////////////////////////////////////////////////////////////////
 135 // Class CSA
 136
 137 /**
 138  * Constructor inits an empty dynamic FM-index.
 139  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 140  */
 141 CSA::CSA(unsigned samplerate)
 142     : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
 143       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 144       emptyTextRank(0)
 145 {
 146     this->samplerate = samplerate;
 147
 148 #ifdef CSA_TEST_BWT
 149     // FIXME TODO : DynFMI needs distribution of characters before hand
 150     // This will create fully balanced wavelet tree for all chars [0, 255].
 151     uchar temp[256];
 152     for (unsigned i = 0; i < 255; ++i)
 153         temp[i] = i+1;
 154     temp[255] = 0;
 155     dynFMI = new DynFMI(temp, 1, 255, false);
 156
 157     /* Debug code: take char distribution from data.txt.
 158     uchar *temp = Tools::GetFileContents("data.txt", 0);
 159     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 160     delete [] temp;*/
 161 #endif
 162 }
 163
 164 /**
 165  * Insert new text
 166  *
 167  * Given text must include end-marker.
 168  * Text identifiers are numbered starting from 0.
 169  */
 170 void CSA::InsertText(uchar const * text)
 171 {
 172     // Sanity check:
 173 #ifdef CSA_TEST_BWT
 174     assert(dynFMI != 0);
 175 #endif
 176
 177     TextPosition m = std::strlen((char *)text) + 1;
 178     if (m > maxTextLength)
 179         maxTextLength = m; // Store length of the longest text seen so far.
 180
 181     if (m > 1)
 182     {
 183         this->n += m;
 184         this->numberOfTexts ++;
 185         this->numberOfAllTexts ++;
 186 #ifdef CSA_TEST_BWT
 187         dynFMI->addText(text, m);
 188 #endif
 189
 190         texts.append((char *) text);
 191         texts.append(1, '\0');
 192     }
 193     else
 194     {
 195         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 196         this->numberOfAllTexts ++;
 197     }
 198 }
 199
 200 void CSA::MakeStatic()
 201 {
 202     // Sanity check:
 203 #ifdef CSA_TEST_BWT
 204     if (dynFMI == 0)
 205         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 206 #endif
 207
 208     if (texts.size() == 0) // Empty collection
 209     {
 210          ++n;
 211          ++numberOfTexts;
 212          texts.append(1, '\0');
 213     }
 214
 215     // Bitvector of empty texts, FIXME Remove?
 216     {
 217         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 218         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 219         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 220             tempB[i] = 0;
 221         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 222             Tools::SetField(tempB, 1, (*it), 1);
 223         emptyTextId.clear();
 224         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 225     }
 226
 227     texts.reserve(0); // Release extra capacity
 228
 229 /*    FILE *fp = fopen("texts.txt", "wb");
 230     std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
 231     fclose(fp);*/
 232
 233     uchar *bwt = new uchar[n];
 234     {
 235         // More succinct solution via StringIterator, see below.
 236 /*        unsigned* maptexts = new unsigned[n+1];
 237         // Map text chars to [0..255+numberOfTexts]
 238         unsigned count = 0;
 239         for (ulong i = 0; i < n; ++i)
 240             if (texts[i] == 0)
 241                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
 242             else {
 243                 uchar c = (uchar)texts[i];
 244                 maptexts[i] = (unsigned)c + numberOfTexts;
 245             }
 246         maptexts[n] = '\0';
 247         assert(count == numberOfTexts);
 248
 249         std::cout << "maptext: ";
 250         for (ulong i = 0; i <= n; ++i)
 251             std::cout << maptexts[i] << ", ";
 252             std::cout << std::endl;*/
 253
 254         // Mark endmarker positions into bitvector
 255         ulong * endmarkers = new ulong[n/W+1];
 256         for (ulong i = 0; i < n/W+1; ++i)
 257             endmarkers[i] = 0;
 258         for (ulong i = 0; i < n; ++i)
 259             if (texts[i] == 0)
 260                 Tools::SetField(endmarkers, 1, i, 1);
 261         BitRank* br = new BitRank(endmarkers, n, true);
 262         assert(numberOfTexts == br->rank(n-1));
 263
 264         // Build iterators, FIXME clean up iterator construction.
 265         StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
 266         StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
 267
 268         bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
 269                                        &bwt[0], numberOfTexts);
 270
 271         bwt[--bwtEndPos] = '\0';
 272         texts.erase();
 273         texts.reserve(0); // Release capacity
 274         delete br; // bitvector of endmarkers
 275     } // End of bw transform
 276 //  std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
 277
 278 /*    fp = fopen("bwt.txt", "wb");
 279     std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
 280     fclose(fp);*/
 281
 282
 283 #ifdef CSA_TEST_BWT
 284     {
 285         uchar *bwtTest = dynFMI->getBWT();
 286         printf("123456789012345678901234567890123456789\n");
 287         for (TextPosition i = 0; i < n && i < 100; i ++)
 288             if (bwt[i] < 50)
 289                 printf("%d", (int)bwt[i]);
 290             else
 291                 printf("%c", bwt[i]);
 292         printf("\n");
 293         for (TextPosition i = 0; i < n && i < 100; i ++)
 294             if (bwtTest[i] < 50)
 295                 printf("%d", (int)bwtTest[i]);
 296             else
 297                 printf("%c", bwtTest[i]);
 298                 printf("\n");
 299
 300         // Sanity check
 301         assert(numberOfTexts == dynFMI->getCollectionSize());
 302
 303         delete dynFMI;
 304         dynFMI = 0;
 305         for (ulong i = 0; i < n; ++i)
 306             if (bwt[i] != bwtTest[i])
 307             {
 308                 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
 309                 assert(0);
 310             }
 311         delete [] bwtTest;
 312     }
 313 #endif // CSA_TEST_BWT
 314
 315     makewavelet(bwt); // Deletes bwt!
 316     bwt = 0;
 317
 318     // Make sampling tables
 319     maketables();
 320 }
 321
 322 bool CSA::EmptyText(DocId k) const
 323 {
 324     assert(k < (DocId)numberOfTexts);
 325     if (emptyTextRank->IsBitSet(k))
 326         return true;
 327     return false;
 328 }
 329
 330 uchar* CSA::GetText(DocId k) const
 331 {
 332     assert(k < (DocId)numberOfTexts);
 333
 334     ulong textRank = 0;
 335     if (emptyTextRank->IsBitSet(k, &textRank))
 336     {
 337         uchar* result = new uchar[1];
 338         result[0] = 0;
 339         return result;
 340     }
 341     // Map to non-empty text
 342     k -= textRank; //emptyTextRank->rank(k);
 343
 344     TextPosition i = k;
 345
 346     string result;
 347     // Reserve average string length to avoid reallocs
 348     result.reserve(n/numberOfTexts);
 349
 350     uchar c = alphabetrank->access(i);
 351     while (c != '\0')
 352     {
 353         result.push_back(c);
 354         i = C[c]+alphabetrank->rank(c,i)-1;
 355
 356         c = alphabetrank->access(i); // "next" char.
 357     }
 358
 359     // Convert to uchar (FIXME return string?)
 360     i = result.size();
 361     uchar* res = new uchar[i+1];
 362     res[i] = '\0';
 363     for (ulong j = 0; j < i; ++j)
 364         res[i-j-1] = result[j];
 365     return res;
 366 }
 367 /*
 368  * Not supported
 369 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 370 {
 371     assert(k < (DocId)numberOfTexts);
 372     assert(j < (*textLength)[k]);
 373     assert(i <= j);
 374
 375     ulong textRank = 0;
 376     if (emptyTextRank->IsBitSet(k, &textRank))
 377     {
 378         uchar* result = new uchar[1];
 379         result[0] = 0;
 380         return result;
 381     }
 382
 383     // Map to non-empty text
 384     k -= textRank; // emptyTextRank->rank(k);
 385
 386     // Start position of k'th text
 387     ulong start = (*textStartPos)[k];
 388
 389     return Substring(i + start, j-i+1);
 390     }*/
 391
 392 /******************************************************************
 393  * Existential queries
 394  */
 395 bool CSA::IsPrefix(uchar const * pattern) const
 396 {
 397     TextPosition m = strlen((char *)pattern);
 398     if (m == 0)
 399         return true;
 400
 401     TextPosition sp = 0, ep = 0;
 402     Search(pattern, m, &sp, &ep);
 403
 404     // Check for end-marker(s) in result interval
 405     if (CountEndmarkers(sp, ep))
 406         return true;
 407     return false;
 408 }
 409
 410 bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 411 {
 412     TextPosition m = strlen((char *)pattern);
 413     if (m == 0)
 414         return true;
 415
 416     TextPosition sp = 0, ep = 0;
 417     Search(pattern, m, &sp, &ep);
 418
 419     // Check for end-marker(s) in result interval
 420     if (CountEndmarkers(sp, ep, begin, end))
 421         return true;
 422     return false;
 423 }
 424
 425
 426 bool CSA::IsSuffix(uchar const *pattern) const
 427 {
 428     // Here counting is as fast as IsSuffix():
 429     if (CountSuffix(pattern) > 0)
 430         return true;
 431     return false;
 432 }
 433
 434 bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 435 {
 436     // Here counting is as fast as IsSuffix():
 437     if (CountSuffix(pattern, begin, end) > 0)
 438         return true;
 439     return false;
 440 }
 441
 442 bool CSA::IsEqual(uchar const *pattern) const
 443 {
 444     TextPosition m = std::strlen((char *)pattern);
 445     if (m == 0)
 446     {
 447         if (numberOfAllTexts - numberOfTexts > 0u)
 448             return true; // Empty texts exists
 449         return false; // No empty texts exists
 450     }
 451
 452     TextPosition sp = 0, ep = 0;
 453     // Match including end-marker
 454     Search(pattern, m+1, &sp, &ep);
 455
 456     // Check for end-marker(s) in result interval
 457     if (CountEndmarkers(sp, ep))
 458         return true;
 459     return false;
 460 }
 461
 462 bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 463 {
 464     TextPosition m = std::strlen((char *)pattern);
 465     if (m == 0)
 466     {
 467         if (numberOfAllTexts - numberOfTexts > 0u)
 468             return true; // Empty texts exists
 469         return false; // No empty texts exists
 470     }
 471
 472     TextPosition sp = 0, ep = 0;
 473     // Match including end-marker
 474     Search(pattern, m+1, &sp, &ep, begin, end);
 475
 476     // Check for end-marker(s) in result interval
 477     if (CountEndmarkers(sp, ep))
 478         return true;
 479     return false;
 480 }
 481
 482 bool CSA::IsContains(uchar const * pattern) const
 483 {
 484     TextPosition m = strlen((char *)pattern);
 485     if (m == 0)
 486         return true;
 487
 488     TextPosition sp = 0, ep = 0;
 489     // Just check if pattern exists somewhere
 490     ulong count = Search(pattern, m, &sp, &ep);
 491
 492     if (count > 0)
 493         return true;
 494     return false;
 495 }
 496
 497 bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
 498 {
 499     // Here counting is as fast as existential querying
 500     if (CountContains(pattern, begin, end) > 0)
 501         return true; // FIXME No need to filter result set
 502     return false;
 503 }
 504
 505 bool CSA::IsLessThan(uchar const * pattern) const
 506 {
 507     if (CountLessThan(pattern) > 0)
 508         return true;
 509     return false;
 510 }
 511
 512 bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 513 {
 514     if (CountLessThan(pattern, begin, end) > 0)
 515         return true;
 516     return false;
 517 }
 518
 519 /******************************************************************
 520  * Counting queries
 521  */
 522 ulong CSA::Count(uchar const * pattern) const
 523 {
 524     TextPosition m = strlen((char *)pattern);
 525     if (m == 0)
 526         return 0;
 527
 528     TextPosition sp = 0, ep = 0;
 529     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 530     return count;
 531 }
 532
 533 unsigned CSA::CountPrefix(uchar const * pattern) const
 534 {
 535     TextPosition m = strlen((char *)pattern);
 536     if (m == 0)
 537         return numberOfAllTexts;
 538
 539     TextPosition sp = 0, ep = 0;
 540     Search(pattern, m, &sp, &ep);
 541
 542     // Count end-markers in result interval
 543     return CountEndmarkers(sp, ep);
 544 }
 545
 546 unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 547 {
 548     TextPosition m = strlen((char *)pattern);
 549     if (m == 0)
 550         return numberOfAllTexts;
 551
 552     TextPosition sp = 0, ep = 0;
 553     Search(pattern, m, &sp, &ep);
 554
 555     // Count end-markers in result interval
 556     return CountEndmarkers(sp, ep, begin, end);
 557 }
 558
 559 unsigned CSA::CountSuffix(uchar const * pattern) const
 560 {
 561     TextPosition m = strlen((char *)pattern);
 562     if (m == 0)
 563         return numberOfAllTexts;
 564
 565     TextPosition sp = 0, ep = 0;
 566     // Search with end-marker
 567     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 568
 569     return count;
 570 }
 571
 572 unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 573 {
 574     TextPosition m = strlen((char *)pattern);
 575     if (m == 0)
 576         return numberOfAllTexts;
 577
 578     TextPosition sp = 0, ep = 0;
 579     // Search with end-marker
 580     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 581
 582     return count;
 583 }
 584
 585 unsigned CSA::CountEqual(uchar const *pattern) const
 586 {
 587     TextPosition m = strlen((char const *)pattern);
 588     if (m == 0)
 589         return numberOfAllTexts - numberOfTexts; // Empty texts.
 590
 591     TextPosition sp = 0, ep = 0;
 592     // Match including end-marker
 593     Search(pattern, m+1, &sp, &ep);
 594
 595     // Count end-markers in result interval
 596     return CountEndmarkers(sp, ep);
 597 }
 598
 599 unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 600 {
 601     TextPosition m = strlen((char const *)pattern);
 602     if (m == 0)
 603         return numberOfAllTexts - numberOfTexts; // Empty texts.
 604
 605     TextPosition sp = 0, ep = 0;
 606     // Match including end-marker
 607     Search(pattern, m+1, &sp, &ep, begin, end);
 608
 609     // Count end-markers in result interval
 610     return CountEndmarkers(sp, ep); // Already within [begin, end]
 611 }
 612
 613 unsigned CSA::CountContains(uchar const * pattern) const
 614 {
 615     TextPosition m = strlen((char const *)pattern);
 616     if (m == 0)
 617         return numberOfAllTexts; // Total number of texts.
 618
 619     // Here counting is as slow as fetching the result set
 620     // because we have to filter out occ's that fall within same document.
 621     TextCollection::document_result result = Contains(pattern);
 622     return result.size();
 623 }
 624
 625 unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
 626 {
 627     TextPosition m = strlen((char const *)pattern);
 628     if (m == 0)
 629         return numberOfAllTexts; // Total number of texts.
 630
 631     // Here counting is as slow as fetching the result set
 632     // because we have to filter out occ's that fall within same document.
 633     TextCollection::document_result result = Contains(pattern, begin, end);
 634     return result.size();
 635 }
 636
 637 // Less than or equal
 638 unsigned CSA::CountLessThan(uchar const * pattern) const
 639 {
 640     TextPosition m = strlen((char const *)pattern);
 641     if (m == 0)
 642         return numberOfAllTexts - numberOfTexts; // Empty texts.
 643
 644     TextPosition sp = 0, ep = 0;
 645     SearchLessThan(pattern, m, &sp, &ep);
 646
 647     // Count end-markers in result interval
 648     return CountEndmarkers(sp, ep);
 649 }
 650
 651 unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 652 {
 653     TextPosition m = strlen((char const *)pattern);
 654     if (m == 0)
 655         return numberOfAllTexts - numberOfTexts; // Empty texts.
 656
 657     TextPosition sp = 0, ep = 0;
 658     SearchLessThan(pattern, m, &sp, &ep);
 659
 660     // Count end-markers in result interval
 661     return CountEndmarkers(sp, ep, begin, end);
 662 }
 663
 664 /**
 665  * Document reporting queries
 666  */
 667 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 668 {
 669     TextPosition m = strlen((char *)pattern);
 670     if (m == 0)
 671         return TextCollection::document_result(); // FIXME Should return all 1...k
 672
 673     TextPosition sp = 0, ep = 0;
 674     Search(pattern, m, &sp, &ep);
 675
 676     TextCollection::document_result result;
 677
 678     // Report end-markers in result interval
 679     unsigned resultSize = CountEndmarkers(sp, ep);
 680     if (resultSize == 0)
 681         return result;
 682
 683     result.reserve(resultSize); // Try to avoid reallocation.
 684
 685     // Iterate through end-markers in [sp,ep]:
 686     unsigned i = 0;
 687     if (sp > 0)
 688         i = alphabetrank->rank(0, sp - 1);
 689     while (resultSize)
 690     {
 691         // End-marker we found belongs to the "preceeding" doc in the collection
 692         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 693         // Map to doc ID:
 694         docId = emptyTextRank->select0(docId+1);
 695         result.push_back(docId);
 696
 697         -- resultSize;
 698         ++ i;
 699     }
 700
 701     return result;
 702 }
 703
 704 TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
 705 {
 706     TextPosition m = strlen((char *)pattern);
 707     if (m == 0)
 708         return TextCollection::document_result(); // FIXME Should return all 1...k
 709
 710     TextPosition sp = 0, ep = 0;
 711     Search(pattern, m, &sp, &ep);
 712
 713     TextCollection::document_result result;
 714
 715     // Report end-markers in result interval
 716     unsigned resultSize = CountEndmarkers(sp, ep);
 717     if (resultSize == 0)
 718         return result;
 719
 720     result.reserve(resultSize); // Try to avoid reallocation.
 721
 722     // Iterate through end-markers in [sp,ep] and [begin, end]:
 723     unsigned i = 0;
 724     if (sp > 0)
 725         i = alphabetrank->rank(0, sp - 1);
 726     while (resultSize)
 727     {
 728         // End-marker we found belongs to the "preceeding" doc in the collection
 729         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 730         // Map to doc ID:
 731         docId = emptyTextRank->select0(docId+1);
 732         if (docId >= begin && docId <= end)
 733             result.push_back(docId);
 734
 735         -- resultSize;
 736         ++ i;
 737     }
 738
 739     return result;
 740 }
 741
 742 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 743 {
 744     TextPosition m = strlen((char *)pattern);
 745     if (m == 0)
 746         return TextCollection::document_result(); // FIXME Should return all 1...k
 747
 748     TextPosition sp = 0, ep = 0;
 749     // Search with end-marker
 750     Search(pattern, m+1, &sp, &ep);
 751
 752     TextCollection::document_result result;
 753     result.reserve(ep-sp+1); // Try to avoid reallocation.
 754
 755     ulong sampled_rank_i = 0;
 756     // Check each occurrence
 757     for (; sp <= ep; ++sp)
 758     {
 759         TextPosition i = sp;
 760
 761         uchar c = alphabetrank->access(i);
 762         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 763         {
 764             i = C[c]+alphabetrank->rank(c,i)-1;
 765             c = alphabetrank->access(i);
 766         }
 767         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 768
 769         if (c == '\0')
 770         {
 771             // Rank among the end-markers in BWT
 772             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 773
 774             // End-marker that we found belongs to the "preceeding" doc in collection:
 775             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 776             // Map to doc ID:
 777             docId = emptyTextRank->select0(docId+1);
 778             result.push_back(docId);
 779         }
 780         else // Sampled position
 781         {
 782             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 783             // Map to doc ID:
 784             docId = emptyTextRank->select0(docId+1);
 785             result.push_back(docId);
 786         }
 787     }
 788
 789     return result;
 790 }
 791
 792 TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
 793 {
 794     TextPosition m = strlen((char *)pattern);
 795     if (m == 0)
 796         return TextCollection::document_result(); // FIXME Should return all 1...k
 797
 798     TextPosition sp = 0, ep = 0;
 799     // Search with end-marker
 800     Search(pattern, m+1, &sp, &ep, begin, end);
 801
 802     TextCollection::document_result result;
 803     result.reserve(ep-sp+1); // Try to avoid reallocation.
 804
 805     ulong sampled_rank_i = 0;
 806     // Check each occurrence, already within [begin, end]
 807     for (; sp <= ep; ++sp)
 808     {
 809         TextPosition i = sp;
 810
 811         uchar c = alphabetrank->access(i);
 812         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 813         {
 814             i = C[c]+alphabetrank->rank(c,i)-1;
 815             c = alphabetrank->access(i);
 816         }
 817         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 818
 819         if (c == '\0')
 820         {
 821             // Rank among the end-markers in BWT
 822             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 823
 824             // End-marker that we found belongs to the "preceeding" doc in collection:
 825             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 826             // Map to doc ID:
 827             docId = emptyTextRank->select0(docId+1);
 828             result.push_back(docId);
 829         }
 830         else // Sampled position
 831         {
 832             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 833             // Map to doc ID:
 834             docId = emptyTextRank->select0(docId+1);
 835             result.push_back(docId);
 836         }
 837     }
 838
 839     return result;
 840 }
 841
 842
 843 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 844 {
 845     TextPosition m = strlen((char const *)pattern);
 846     if (m == 0)
 847         return TextCollection::document_result(); // FIXME Should return all empty texts
 848
 849     TextPosition sp = 0, ep = 0;
 850     // Match including end-marker
 851     Search(pattern, m+1, &sp, &ep);
 852
 853     TextCollection::document_result result;
 854
 855     // Report end-markers in result interval
 856     unsigned resultSize = CountEndmarkers(sp, ep);
 857     if (resultSize == 0)
 858         return result;
 859
 860     result.reserve(resultSize); // Try to avoid reallocation.
 861
 862     unsigned i = 0;
 863     if (sp > 0)
 864         i = alphabetrank->rank(0, sp - 1);
 865     while (resultSize)
 866     {
 867         // End-marker we found belongs to the "previous" doc in the collection
 868         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 869         // Map to doc ID:
 870         docId = emptyTextRank->select0(docId+1);
 871         result.push_back(docId);
 872
 873         -- resultSize;
 874         ++ i;
 875     }
 876
 877     return result;
 878 }
 879
 880 TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
 881 {
 882     TextPosition m = strlen((char const *)pattern);
 883     if (m == 0)
 884         return TextCollection::document_result(); // FIXME Should return all empty texts
 885
 886     TextPosition sp = 0, ep = 0;
 887     // Match including end-marker
 888     Search(pattern, m+1, &sp, &ep, begin, end);
 889
 890     TextCollection::document_result result;
 891
 892     // Report end-markers in result interval
 893     unsigned resultSize = CountEndmarkers(sp, ep);
 894     if (resultSize == 0)
 895         return result;
 896
 897     result.reserve(resultSize); // Try to avoid reallocation.
 898
 899     unsigned i = 0;
 900     if (sp > 0)
 901         i = alphabetrank->rank(0, sp - 1);
 902     while (resultSize)
 903     {
 904         // End-marker we found belongs to the "previous" doc in the collection
 905         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 906         // Map to doc ID:
 907         docId = emptyTextRank->select0(docId+1);
 908         result.push_back(docId); // already within [begin, end]
 909
 910         -- resultSize;
 911         ++ i;
 912     }
 913
 914     return result;
 915 }
 916
 917
 918 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 919 {
 920     TextPosition m = strlen((char *)pattern);
 921     if (m == 0)
 922         return TextCollection::document_result();
 923
 924     TextPosition sp = 0, ep = 0;
 925     // Search all occurrences
 926     Search(pattern, m, &sp, &ep);
 927
 928     // We want unique document indentifiers, using std::set to collect them
 929     std::set<DocId> resultSet;
 930
 931     ulong sampled_rank_i = 0;
 932     // Check each occurrence
 933     for (; sp <= ep; ++sp)
 934     {
 935         TextPosition i = sp;
 936         uchar c = alphabetrank->access(i);
 937         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 938         {
 939             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 940             c = alphabetrank->access(i);
 941         }
 942         if (c == '\0')
 943         {
 944             // Rank among the end-markers in BWT
 945             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 946
 947             // End-marker that we found belongs to the "preceeding" doc in collection:
 948             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 949             resultSet.insert(docId);
 950         }
 951         else
 952         {
 953             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 954             assert((unsigned)di < numberOfTexts);
 955             resultSet.insert(di);
 956         }
 957     }
 958
 959     // Convert std::set to std::vector
 960     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 961     // Map to doc ID's
 962     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 963         *it = emptyTextRank->select0(*it+1);
 964     return result;
 965 }
 966
 967 TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
 968 {
 969     TextPosition m = strlen((char *)pattern);
 970     if (m == 0)
 971         return TextCollection::document_result();
 972
 973     TextPosition sp = 0, ep = 0;
 974     // Search all occurrences
 975     Search(pattern, m, &sp, &ep);
 976
 977     // We want unique document indentifiers, using std::set to collect them
 978     std::set<DocId> resultSet;
 979
 980     ulong sampled_rank_i = 0;
 981     // Check each occurrence
 982     for (; sp <= ep; ++sp)
 983     {
 984         TextPosition i = sp;
 985         uchar c = alphabetrank->access(i);
 986         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 987         {
 988             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 989             c = alphabetrank->access(i);
 990         }
 991         if (c == '\0')
 992         {
 993             // Rank among the end-markers in BWT
 994             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 995
 996             // End-marker that we found belongs to the "preceeding" doc in collection:
 997             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 998             if (docId >= begin && docId <= end)
 999                 resultSet.insert(docId);
1000         }
1001         else
1002         {
1003             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1004             assert((unsigned)docId < numberOfTexts);
1005             if (docId >= begin && docId <= end)
1006                 resultSet.insert(docId);
1007         }
1008     }
1009
1010     // Convert std::set to std::vector
1011     TextCollection::document_result result(resultSet.begin(), resultSet.end());
1012     // Map to doc ID's
1013     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
1014         *it = emptyTextRank->select0(*it+1);
1015     return result;
1016 }
1017
1018 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
1019 {
1020     TextPosition m = strlen((char *)pattern);
1021     if (m == 0)
1022         return TextCollection::document_result(); // empty result set
1023
1024     TextPosition sp = 0, ep = 0;
1025     SearchLessThan(pattern, m, &sp, &ep);
1026
1027     TextCollection::document_result result;
1028
1029     // Report end-markers in result interval
1030     unsigned resultSize = CountEndmarkers(sp, ep);
1031     if (resultSize == 0)
1032         return result;
1033
1034     result.reserve(resultSize); // Try to avoid reallocation.
1035
1036     // Iterate through end-markers in [sp,ep]:
1037     unsigned i = 0;
1038     if (sp > 0)
1039         i = alphabetrank->rank(0, sp - 1);
1040     while (resultSize)
1041     {
1042         // End-marker we found belongs to the "preceeding" doc in the collection
1043         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1044         // Map to doc ID:
1045         docId = emptyTextRank->select0(docId+1);
1046         result.push_back(docId);
1047
1048         -- resultSize;
1049         ++ i;
1050     }
1051
1052     return result;
1053 }
1054
1055 TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
1056 {
1057     TextPosition m = strlen((char *)pattern);
1058     if (m == 0)
1059         return TextCollection::document_result(); // empty result set
1060
1061     TextPosition sp = 0, ep = 0;
1062     SearchLessThan(pattern, m, &sp, &ep);
1063
1064     TextCollection::document_result result;
1065
1066     // Report end-markers in result interval
1067     unsigned resultSize = CountEndmarkers(sp, ep);
1068     if (resultSize == 0)
1069         return result;
1070
1071     result.reserve(resultSize); // Try to avoid reallocation.
1072
1073     // Iterate through end-markers in [sp,ep] and [begin, end]:
1074     unsigned i = 0;
1075     if (sp > 0)
1076         i = alphabetrank->rank(0, sp - 1);
1077     while (resultSize)
1078     {
1079         // End-marker we found belongs to the "preceeding" doc in the collection
1080         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1081         // Map to doc ID:
1082         docId = emptyTextRank->select0(docId+1);
1083         if (docId >= begin && docId <= end)
1084             result.push_back(docId);
1085
1086         -- resultSize;
1087         ++ i;
1088     }
1089
1090     return result;
1091 }
1092
1093 /**
1094  * Full result set queries
1095  */
1096 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
1097 {
1098     TextPosition m = strlen((char *)pattern);
1099     if (m == 0)
1100         return full_result(); // FIXME Throw exception?
1101
1102     TextPosition sp = 0, ep = 0;
1103     // Search all occurrences
1104     Search(pattern, m, &sp, &ep);
1105
1106     full_result result;
1107     result.reserve(ep-sp+1); // Try to avoid reallocation.
1108
1109     ulong sampled_rank_i = 0;
1110     // Report each occurrence
1111     for (; sp <= ep; ++sp)
1112     {
1113         TextPosition i = sp;
1114         TextPosition dist = 0;
1115         uchar c = alphabetrank->access(i);
1116         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1117         {
1118             i = C[c]+alphabetrank->rank(c,i)-1;
1119             c = alphabetrank->access(i);
1120             ++ dist;
1121         }
1122         if (c == '\0')
1123         {
1124             // Rank among the end-markers in BWT
1125             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1126
1127             // End-marker that we found belongs to the "preceeding" doc in collection:
1128             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1129             // Map to doc ID:
1130             docId = emptyTextRank->select0(docId+1);
1131             result.push_back(make_pair(docId, dist));
1132         }
1133         else
1134         {
1135             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1136             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1137 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1138
1139             // Map to doc ID:
1140             docId = emptyTextRank->select0(docId+1);
1141             result.push_back(make_pair(docId, textPos));
1142         }
1143     }
1144
1145     return result;
1146 }
1147
1148 TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
1149 {
1150     TextPosition m = strlen((char *)pattern);
1151     if (m == 0)
1152         return full_result(); // FIXME Throw exception?
1153
1154     TextPosition sp = 0, ep = 0;
1155     // Search all occurrences
1156     Search(pattern, m, &sp, &ep);
1157
1158     full_result result;
1159     result.reserve(ep-sp+1); // Try to avoid reallocation.
1160
1161     ulong sampled_rank_i = 0;
1162     // Report each occurrence
1163     for (; sp <= ep; ++sp)
1164     {
1165         TextPosition i = sp;
1166         TextPosition dist = 0;
1167         uchar c = alphabetrank->access(i);
1168         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1169         {
1170             i = C[c]+alphabetrank->rank(c,i)-1;
1171             c = alphabetrank->access(i);
1172             ++ dist;
1173         }
1174         if (c == '\0')
1175         {
1176             // Rank among the end-markers in BWT
1177             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1178
1179             // End-marker that we found belongs to the "preceeding" doc in collection:
1180             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1181             // Map to doc ID:
1182             docId = emptyTextRank->select0(docId+1);
1183             if (docId >= begin && docId <= end)
1184                 result.push_back(make_pair(docId, dist));
1185         }
1186         else
1187         {
1188             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1189             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1190 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1191
1192             // Map to doc ID:
1193             docId = emptyTextRank->select0(docId+1);
1194             if (docId >= begin && docId <= end)
1195                 result.push_back(make_pair(docId, textPos));
1196         }
1197     }
1198
1199     return result;
1200 }
1201
1202
1203 /**
1204  * Save index to a file handle
1205  *
1206  * Throws a std::runtime_error exception on i/o error.
1207  * First byte that is saved represents the version number of the save file.
1208  * In version 2 files, the data fields are:
1209  *  uchar versionFlag;
1210     TextPosition n;
1211     unsigned samplerate;
1212     unsigned C[256];
1213     TextPosition bwtEndPos;
1214     static_sequence * alphabetrank;
1215     BSGAP *sampled;
1216     BlockArray *suffixes;
1217     BlockArray *suffixDocId;
1218     unsigned numberOfTexts;
1219     unsigned numberOfAllTexts;
1220     ulong maxTextLength;
1221     BlockArray *endmarkerDocId;
1222     BSGAP *emptyTextRank;
1223  */
1224 void CSA::Save(FILE *file) const
1225 {
1226     // Saving version info:
1227     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
1228         throw std::runtime_error("CSA::Save(): file write error (version flag).");
1229
1230     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
1231         throw std::runtime_error("CSA::Save(): file write error (n).");
1232     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
1233         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
1234
1235     for(ulong i = 0; i < 256; ++i)
1236         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
1237             throw std::runtime_error("CSA::Save(): file write error (C table).");
1238
1239     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1240         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
1241
1242     alphabetrank->save(file);
1243     sampled->Save(file);
1244     suffixes->Save(file);
1245     suffixDocId->Save(file);
1246
1247     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1248         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
1249     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1250         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
1251     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1252         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
1253
1254     endmarkerDocId->Save(file);
1255     emptyTextRank->Save(file);
1256     fflush(file);
1257 }
1258
1259
1260 /**
1261  * Load index from a file handle
1262  *
1263  * Throws a std::runtime_error exception on i/o error.
1264  * For more info, see CSA::Save().
1265  */
1266 void CSA::Load(FILE *file, unsigned samplerate)
1267 {
1268     // Delete everything
1269 #ifdef CSA_TEST_BWT
1270     delete dynFMI;       dynFMI = 0;
1271 #endif
1272     delete alphabetrank; alphabetrank = 0;
1273     delete sampled;      sampled = 0;
1274     delete suffixes;     suffixes = 0;
1275     delete suffixDocId;  suffixDocId = 0;
1276     delete [] codetable; codetable = 0;
1277
1278     delete endmarkerDocId; endmarkerDocId = 0;
1279     delete emptyTextRank;  emptyTextRank = 0;
1280
1281     this->maxTextLength = 0;
1282     this->numberOfTexts = 0;
1283     this->numberOfAllTexts = 0;
1284     this->samplerate = samplerate;
1285     this->n = 0;
1286
1287     uchar verFlag = 0;
1288     if (std::fread(&verFlag, 1, 1, file) != 1)
1289         throw std::runtime_error("CSA::Load(): file read error (version flag).");
1290     if (verFlag != CSA::versionFlag)
1291         throw std::runtime_error("CSA::Load(): invalid save file version.");
1292
1293     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1294         throw std::runtime_error("CSA::Load(): file read error (n).");
1295     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1296         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
1297 // FIXME samplerate can not be changed during load.
1298 //    if (this->samplerate == 0)
1299 //        this->samplerate = samplerate;
1300
1301     for(ulong i = 0; i < 256; ++i)
1302         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1303             throw std::runtime_error("CSA::Load(): file read error (C table).");
1304
1305     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1306         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
1307
1308     alphabetrank = static_sequence::load(file);
1309     sampled = new BSGAP(file);
1310     suffixes = new BlockArray(file);
1311     suffixDocId = new BlockArray(file);
1312
1313     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1314         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
1315     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1316         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
1317     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1318         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
1319
1320     endmarkerDocId = new BlockArray(file);
1321     emptyTextRank = new BSGAP(file);
1322
1323     // FIXME Construct data structures with new samplerate
1324     //maketables();
1325 }
1326
1327
1328
1329 /**
1330  * Rest of the functions follow...
1331  */
1332
1333
1334 // FIXME Use 2D-search structure
1335 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1336 {
1337     if (sp > ep)
1338         return 0;
1339
1340     ulong ranksp = 0;
1341     if (sp != 0)
1342         ranksp = alphabetrank->rank(0, sp - 1);
1343
1344     unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1345     if (resultSize == 0)
1346         return 0;
1347
1348     // Count end-markers in result interval and within [begin, end]
1349     unsigned count = 0;
1350     unsigned i = ranksp;
1351     while (resultSize)
1352     {
1353         // End-marker we found belongs to the "previous" doc in the collection
1354         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1355         // Map to doc ID:
1356         docId = emptyTextRank->select0(docId+1);
1357         if (docId >= begin && docId <= end)
1358             ++ count;
1359
1360         -- resultSize;
1361         ++ i;
1362     }
1363     return count;
1364 }
1365
1366 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1367 {
1368     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1369     int c = (int)pattern[m-1];
1370     TextPosition i=m-1;
1371     TextPosition sp = C[c];
1372     TextPosition ep = C[c+1]-1;
1373     while (sp<=ep && i>=1)
1374     {
1375 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1376         c = (int)pattern[--i];
1377         sp = C[c]+alphabetrank->rank(c,sp-1);
1378         ep = C[c]+alphabetrank->rank(c,ep)-1;
1379     }
1380     *spResult = sp;
1381     *epResult = ep;
1382     if (sp<=ep)
1383         return ep - sp + 1;
1384     else
1385         return 0;
1386 }
1387
1388 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1389 {
1390     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1391     int c = (int)pattern[m-1];
1392     assert(c == 0); // Start from endmarkers
1393     TextPosition i=m-1;
1394     TextPosition sp = begin;
1395     TextPosition ep = end;
1396     while (sp<=ep && i>=1)
1397     {
1398 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1399         c = (int)pattern[--i];
1400         sp = C[c]+alphabetrank->rank(c,sp-1);
1401         ep = C[c]+alphabetrank->rank(c,ep)-1;
1402     }
1403     *spResult = sp;
1404     *epResult = ep;
1405     if (sp<=ep)
1406         return ep - sp + 1;
1407     else
1408         return 0;
1409 }
1410
1411
1412 ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1413 {
1414     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1415     uint c = (int)pattern[m-1];
1416     TextPosition i=m-1;
1417     TextPosition sp = 1;
1418     TextPosition ep = C[c+1]-1;
1419     while (sp<=ep && i>=1)
1420     {
1421 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1422         c = (int)pattern[--i];
1423         uint result = alphabetrank->rank(c,ep);
1424         if (result == ~0u)
1425             ep = 0;
1426         else
1427             ep = C[c]+result-1;
1428     }
1429     *spResult = sp;
1430     *epResult = ep;
1431     if (sp<=ep)
1432         return ep - sp + 1;
1433     else
1434         return 0;
1435 }
1436
1437
1438 CSA::~CSA() {
1439 #ifdef CSA_TEST_BWT
1440     delete dynFMI;
1441 #endif
1442     delete alphabetrank;
1443     delete sampled;
1444     delete suffixes;
1445     delete suffixDocId;
1446     delete [] codetable; // FIXME remove
1447
1448     delete endmarkerDocId;
1449     delete emptyTextRank;
1450 }
1451
1452 void CSA::makewavelet(uchar *bwt)
1453 {
1454     ulong i, min = 0,
1455              max;
1456     for (i=0;i<256;i++)
1457         C[i]=0;
1458     for (i=0;i<n;++i)
1459         C[(int)bwt[i]]++;
1460     for (i=0;i<256;i++)
1461         if (C[i]>0) {min = i; break;}
1462     for (i=255;i>=min;--i)
1463         if (C[i]>0) {max = i; break;}
1464
1465     ulong prev=C[0], temp;
1466     C[0]=0;
1467     for (i=1;i<256;i++) {
1468         temp = C[i];
1469         C[i]=C[i-1]+prev;
1470         prev = temp;
1471     }
1472 //    this->codetable = node::makecodetable(bwt,n);
1473 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1474 //    delete [] bwt;
1475     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1476 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1477 //    std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1478
1479 //    HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1480
1481     alphabet_mapper * am = new alphabet_mapper_none();
1482     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1483 //    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1484     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1485     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1486     delete bmb;
1487     bwt = 0; // already deleted
1488
1489 //    std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1490 //    std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1491 }
1492
1493 void CSA::maketables()
1494 {
1495     // Calculate BWT end-marker position (of last inserted text)
1496     // Note: bwtEndPos already set!
1497     // and the length of the first text (counter l):
1498 #ifdef CSA_TEST_BWT
1499     {
1500         ulong i = 0;
1501         ulong l = 1;
1502         uint alphabetrank_i_tmp = 0;
1503         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1504         while (c != '\0')
1505         {
1506             i = C[c]+alphabetrank_i_tmp-1;
1507             ++l;
1508             c = alphabetrank->access(i, alphabetrank_i_tmp);
1509         }
1510
1511         if (i != bwtEndPos) // compare result
1512         {
1513             std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
1514             exit(0);
1515         }
1516     }
1517 #endif
1518
1519     // Build up arrays for text length and starting positions
1520     // FIXME Temp, remove
1521     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1522     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1523     //(*textLength)[0] = l;
1524     (*textStartPos)[0] = 0;
1525
1526     // Construct samples
1527     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1528     unsigned ceilLog2n = Tools::CeilLog2(n);
1529     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1530     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1531
1532     // Mapping from end-markers to doc ID's:
1533     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1534
1535     ulong *sampledpositions = new ulong[n/W+1];
1536     for (ulong i=0;i<n/W+1;i++)
1537         sampledpositions[i]=0lu;
1538
1539     ulong x,p=bwtEndPos;
1540     ulong sampleCount = 0;
1541     // Keeping track of text position of prev. end-marker seen
1542     ulong posOfSuccEndmarker = n;
1543     DocId textId = numberOfTexts;
1544     ulong ulongmax = 0;
1545     ulongmax--;
1546     uint alphabetrank_i_tmp =0;
1547
1548     //positions:
1549     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1550       // i substitutes SA->GetPos(i)
1551         x=(i==n-1)?0:i+1;
1552
1553         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1554             Tools::SetField(sampledpositions,1,p,1);
1555             (*positions)[sampleCount] = p;
1556             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1557             sampleCount ++;
1558         }
1559
1560         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1561         if (c == '\0')
1562         {
1563             --textId;
1564
1565             // Record the order of end-markers in BWT:
1566             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1567             (*endmarkerDocId)[endmarkerRank] = textId;
1568
1569             // Store text length and text start position:
1570             if (textId < (DocId)numberOfTexts - 1)
1571             {
1572                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1573                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1574                 posOfSuccEndmarker = x;
1575             }
1576
1577             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1578             p = textId; // Correct LF-mapping to the last char of the previous text.
1579         }
1580         else
1581             p = C[c]+alphabetrank_i_tmp-1;
1582     }
1583     assert(textId == 0);
1584
1585     sampled = new BSGAP(sampledpositions,n,true);
1586     sampleLength = sampled->rank(n-1);
1587     assert(sampleCount == sampleLength);
1588
1589     // Suffixes store an offset from the text start position
1590     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1591     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1592
1593     for(ulong i=0; i<sampleLength; i++) {
1594         assert((*positions)[i] < n);
1595         ulong j = sampled->rank((*positions)[i]);
1596         if (j==0) j=sampleLength;
1597         TextPosition textPos = (*tmpSuffix)[i];
1598         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1599
1600         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1601         assert((*suffixDocId)[j-1] < numberOfTexts);
1602         // calculate offset from text start:
1603         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1604     }
1605     // FIXME Temp, remove
1606     delete tmpSuffix;
1607     delete positions;
1608 //    delete textLength;
1609     delete textStartPos;
1610 }
1611
1612
1613 /**
1614  * Finds document identifier for given text position
1615  *
1616  * Starting text position of the document is stored into second parameter.
1617  * Binary searching on text starting positions.
1618  */
1619 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1620 {
1621     assert(i < n);
1622
1623     DocId a = 0;
1624     DocId b = numberOfTexts - 1;
1625     while (a < b)
1626     {
1627         DocId c = a + (b - a)/2;
1628         if ((*textStartPos)[c] > i)
1629             b = c - 1;
1630         else if ((*textStartPos)[c+1] > i)
1631             return c;
1632         else
1633             a = c + 1;
1634     }
1635
1636     assert(a < (DocId)numberOfTexts);
1637     assert(i >= (*textStartPos)[a]);
1638     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1639     return a;
1640 }
1641
1642 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1643 {
1644     TCodeEntry *result = new TCodeEntry[ 256 ];
1645
1646     count_chars( text, n, result );
1647     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1648 //
1649 // First I push all the leaf nodes into the queue
1650 //
1651     for ( unsigned int i = 0 ; i < 256 ; i++ )
1652         if ( result[ i ].count )
1653             q.push(node( i, result[ i ].count ) );
1654 //
1655 // This loop removes the two smallest nodes from the
1656 // queue.  It creates a new internal node that has
1657 // those two nodes as children. The new internal node
1658 // is then inserted into the priority queue.  When there
1659 // is only one node in the priority queue, the tree
1660 // is complete.
1661 //
1662
1663     while ( q.size() > 1 ) {
1664         node *child0 = new node( q.top() );
1665         q.pop();
1666         node *child1 = new node( q.top() );
1667         q.pop();
1668         q.push( node( child0, child1 ) );
1669     }
1670 //
1671 // Now I compute and return the codetable
1672 //
1673     q.top().maketable(0u,0u, result);
1674     q.pop();
1675     return result;
1676 }
1677
1678
1679 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1680 {
1681     if ( child0 )
1682     {
1683         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1684         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1685         delete child0;
1686         delete child1;
1687     }
1688     else
1689     {
1690         codetable[value].code = code;
1691         codetable[value].bits = bits;
1692     }
1693 }
1694
1695 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1696 {
1697     ulong i;
1698     for (i = 0 ; i < 256 ; i++ )
1699         counts[ i ].count = 0;
1700     for (i=0; i<n; i++)
1701         counts[(int)text[i]].count++;
1702 }
1703
1704 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1705       return x | (bit << pos);
1706 }
1707
1708 } // namespace SXSI
1709