CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22
  23 // Conflicts with std::min and std::max
  24 #undef min
  25 #undef max
  26 #include "dcover/bwt.hpp"
  27
  28 #include "HeapProfiler.h" // FIXME remove
  29
  30 #include <iostream>
  31 #include <queue>
  32 #include <set>
  33 #include <vector>
  34 #include <utility>
  35 #include <stdexcept>
  36 #include <cassert>
  37 #include <cstring> // For strlen()
  38 using std::vector;
  39 using std::pair;
  40 using std::make_pair;
  41 using std::map;
  42
  43 namespace SXSI
  44 {
  45
  46 // Save file version info
  47 const uchar CSA::versionFlag = 2;
  48
  49 ////////////////////////////////////////////////////////////////////////////
  50 // Class CSA::THuffAlphabetRank
  51 // FIXME Unused code
  52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  53     left = NULL;
  54     right = NULL;
  55     bitrank = NULL;
  56     ch = s[0];
  57     leaf = false;
  58     this->codetable = codetable;
  59
  60     bool *B = new bool[n];
  61     TextPosition sum=0,i;
  62     /*
  63     for (i=0; i< n; i++) {
  64        printf("%c:", (char)((int)s[i]-128));
  65        for (r=0;r<codetable[(int)s[i]].bits;r++)
  66           if (codetable[(int)s[i]].code & (1u <<r))
  67              printf("1");
  68           else printf("0");
  69        printf("\n");
  70     }
  71     printf("\n");
  72     if (level > 100) return;*/
  73     for (i=0;i<n;i++)
  74        if (codetable[(int)s[i]].code & (1u << level)) {
  75           B[i] = true;
  76           sum++;
  77        }
  78        else B[i] = false;
  79     if (sum==0 || sum==n) {
  80         delete [] B;
  81         leaf = true;
  82         return;
  83     }
  84     uchar *sfirst, *ssecond;
  85     //if (n-sum > 0)
  86         sfirst = new uchar[n-sum];
  87     //if (sum > 0)
  88         ssecond = new uchar[sum];
  89     unsigned j=0,k=0;
  90    for (i=0;i<n;i++)
  91         if (B[i]) ssecond[k++] = s[i];
  92         else sfirst[j++] = s[i];
  93     ulong *Binbits = new ulong[n/W+1];
  94     for (i=0;i<n;i++)
  95         Tools::SetField(Binbits,1,i,B[i]);
  96     delete [] B;
  97     bitrank = new BitRank(Binbits,n,true);
  98     //if (j > 0) {
  99         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
 100         delete [] sfirst;
 101     //}
 102     //if (k>0) {
 103         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
 104         delete [] ssecond;
 105     //}
 106 }
 107
 108
 109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 110     // testing that the code works correctly
 111     int C[256];
 112     unsigned i,j;
 113     bool correct=true;
 114     for (j=0;j<256;j++)
 115         C[j] = 0;
 116     for (i=0;i<n;i++) {
 117         C[(int)s[i]]++;
 118         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 119         correct = false;
 120         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 121         }
 122     }
 123     return correct;
 124 }
 125
 126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 127     if (left!=NULL) delete left;
 128     if (right!=NULL) delete right;
 129     if (bitrank!=NULL)
 130         delete bitrank;
 131 }
 132
 133
 134 ////////////////////////////////////////////////////////////////////////////
 135 // Class CSA
 136
 137 /**
 138  * Constructor inits an empty dynamic FM-index.
 139  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 140  */
 141 CSA::CSA(unsigned samplerate)
 142     : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
 143       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 144       emptyTextRank(0)
 145 {
 146     this->samplerate = samplerate;
 147
 148 #ifdef CSA_TEST_BWT
 149     // FIXME TODO : DynFMI needs distribution of characters before hand
 150     // This will create fully balanced wavelet tree for all chars [0, 255].
 151     uchar temp[256];
 152     for (unsigned i = 0; i < 255; ++i)
 153         temp[i] = i+1;
 154     temp[255] = 0;
 155     dynFMI = new DynFMI(temp, 1, 255, false);
 156
 157     /* Debug code: take char distribution from data.txt.
 158     uchar *temp = Tools::GetFileContents("data.txt", 0);
 159     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 160     delete [] temp;*/
 161 #endif
 162 }
 163
 164 /**
 165  * Insert new text
 166  *
 167  * Given text must include end-marker.
 168  * Text identifiers are numbered starting from 0.
 169  */
 170 void CSA::InsertText(uchar const * text)
 171 {
 172     // Sanity check:
 173 #ifdef CSA_TEST_BWT
 174     assert(dynFMI != 0);
 175 #endif
 176
 177     TextPosition m = std::strlen((char *)text) + 1;
 178     if (m > maxTextLength)
 179         maxTextLength = m; // Store length of the longest text seen so far.
 180
 181     if (m > 1)
 182     {
 183         this->n += m;
 184         this->numberOfTexts ++;
 185         this->numberOfAllTexts ++;
 186 #ifdef CSA_TEST_BWT
 187         dynFMI->addText(text, m);
 188 #endif
 189
 190         texts.append((char *) text);
 191         texts.append(1, '\0');
 192     }
 193     else
 194     {
 195         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 196         this->numberOfAllTexts ++;
 197     }
 198 }
 199
 200 void CSA::MakeStatic()
 201 {
 202     // Sanity check:
 203 #ifdef CSA_TEST_BWT
 204     if (dynFMI == 0)
 205         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 206 #endif
 207
 208     if (texts.size() == 0) // Empty collection
 209     {
 210          ++n;
 211          texts.append(1, '\0');
 212     }
 213
 214     // Bitvector of empty texts, FIXME Remove?
 215     {
 216         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 217         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 218         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 219             tempB[i] = 0;
 220         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 221             Tools::SetField(tempB, 1, (*it), 1);
 222         emptyTextId.clear();
 223         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 224     }
 225
 226     texts.reserve(0); // Release extra capacity
 227
 228 /*    FILE *fp = fopen("texts.txt", "wb");
 229     std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
 230     fclose(fp);*/
 231
 232     uchar *bwt = new uchar[n];
 233     {
 234         // More succinct solution via StringIterator, see below.
 235 /*        unsigned* maptexts = new unsigned[n+1];
 236         // Map text chars to [0..255+numberOfTexts]
 237         unsigned count = 0;
 238         for (ulong i = 0; i < n; ++i)
 239             if (texts[i] == 0)
 240                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
 241             else {
 242                 uchar c = (uchar)texts[i];
 243                 maptexts[i] = (unsigned)c + numberOfTexts;
 244             }
 245         maptexts[n] = '\0';
 246         assert(count == numberOfTexts);
 247
 248         std::cout << "maptext: ";
 249         for (ulong i = 0; i <= n; ++i)
 250             std::cout << maptexts[i] << ", ";
 251             std::cout << std::endl;*/
 252
 253         // Mark endmarker positions into bitvector
 254         ulong * endmarkers = new ulong[n/W+1];
 255         for (ulong i = 0; i < n/W+1; ++i)
 256             endmarkers[i] = 0;
 257         for (ulong i = 0; i < n; ++i)
 258             if (texts[i] == 0)
 259                 Tools::SetField(endmarkers, 1, i, 1);
 260         BitRank* br = new BitRank(endmarkers, n, true);
 261         assert(numberOfTexts == br->rank(n-1));
 262
 263         // Build iterators, FIXME clean up iterator construction.
 264         StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
 265         StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
 266
 267         bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
 268                                        &bwt[0], numberOfTexts);
 269
 270         bwt[--bwtEndPos] = '\0';
 271         texts.erase();
 272         texts.reserve(0); // Release capacity
 273         delete br; // bitvector of endmarkers
 274     } // End of bw transform
 275 //  std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
 276
 277 /*    fp = fopen("bwt.txt", "wb");
 278     std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
 279     fclose(fp);*/
 280
 281
 282 #ifdef CSA_TEST_BWT
 283     {
 284         uchar *bwtTest = dynFMI->getBWT();
 285         printf("123456789012345678901234567890123456789\n");
 286         for (TextPosition i = 0; i < n && i < 100; i ++)
 287             if (bwt[i] < 50)
 288                 printf("%d", (int)bwt[i]);
 289             else
 290                 printf("%c", bwt[i]);
 291         printf("\n");
 292         for (TextPosition i = 0; i < n && i < 100; i ++)
 293             if (bwtTest[i] < 50)
 294                 printf("%d", (int)bwtTest[i]);
 295             else
 296                 printf("%c", bwtTest[i]);
 297                 printf("\n");
 298
 299         // Sanity check
 300         assert(numberOfTexts == dynFMI->getCollectionSize());
 301
 302         delete dynFMI;
 303         dynFMI = 0;
 304         for (ulong i = 0; i < n; ++i)
 305             if (bwt[i] != bwtTest[i])
 306             {
 307                 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
 308                 assert(0);
 309             }
 310         delete [] bwtTest;
 311     }
 312 #endif // CSA_TEST_BWT
 313
 314     makewavelet(bwt); // Deletes bwt!
 315     bwt = 0;
 316
 317     // Make sampling tables
 318     maketables();
 319 }
 320
 321 bool CSA::EmptyText(DocId k) const
 322 {
 323     assert(k < (DocId)numberOfTexts);
 324     if (emptyTextRank->IsBitSet(k))
 325         return true;
 326     return false;
 327 }
 328
 329 uchar* CSA::GetText(DocId k) const
 330 {
 331     assert(k < (DocId)numberOfTexts);
 332
 333     ulong textRank = 0;
 334     if (emptyTextRank->IsBitSet(k, &textRank))
 335     {
 336         uchar* result = new uchar[1];
 337         result[0] = 0;
 338         return result;
 339     }
 340     // Map to non-empty text
 341     k -= textRank; //emptyTextRank->rank(k);
 342
 343     TextPosition i = k;
 344
 345     string result;
 346     // Reserve average string length to avoid reallocs
 347     result.reserve(n/numberOfTexts);
 348
 349     uchar c = alphabetrank->access(i);
 350     while (c != '\0')
 351     {
 352         result.push_back(c);
 353         i = C[c]+alphabetrank->rank(c,i)-1;
 354
 355         c = alphabetrank->access(i); // "next" char.
 356     }
 357
 358     // Convert to uchar (FIXME return string?)
 359     i = result.size();
 360     uchar* res = new uchar[i+1];
 361     res[i] = '\0';
 362     for (ulong j = 0; j < i; ++j)
 363         res[i-j-1] = result[j];
 364     return res;
 365 }
 366 /*
 367  * Not supported
 368 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 369 {
 370     assert(k < (DocId)numberOfTexts);
 371     assert(j < (*textLength)[k]);
 372     assert(i <= j);
 373
 374     ulong textRank = 0;
 375     if (emptyTextRank->IsBitSet(k, &textRank))
 376     {
 377         uchar* result = new uchar[1];
 378         result[0] = 0;
 379         return result;
 380     }
 381
 382     // Map to non-empty text
 383     k -= textRank; // emptyTextRank->rank(k);
 384
 385     // Start position of k'th text
 386     ulong start = (*textStartPos)[k];
 387
 388     return Substring(i + start, j-i+1);
 389     }*/
 390
 391 /******************************************************************
 392  * Existential queries
 393  */
 394 bool CSA::IsPrefix(uchar const * pattern) const
 395 {
 396     TextPosition m = strlen((char *)pattern);
 397     if (m == 0)
 398         return true;
 399
 400     TextPosition sp = 0, ep = 0;
 401     Search(pattern, m, &sp, &ep);
 402
 403     // Check for end-marker(s) in result interval
 404     if (CountEndmarkers(sp, ep))
 405         return true;
 406     return false;
 407 }
 408
 409 bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 410 {
 411     TextPosition m = strlen((char *)pattern);
 412     if (m == 0)
 413         return true;
 414
 415     TextPosition sp = 0, ep = 0;
 416     Search(pattern, m, &sp, &ep);
 417
 418     // Check for end-marker(s) in result interval
 419     if (CountEndmarkers(sp, ep, begin, end))
 420         return true;
 421     return false;
 422 }
 423
 424
 425 bool CSA::IsSuffix(uchar const *pattern) const
 426 {
 427     // Here counting is as fast as IsSuffix():
 428     if (CountSuffix(pattern) > 0)
 429         return true;
 430     return false;
 431 }
 432
 433 bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 434 {
 435     // Here counting is as fast as IsSuffix():
 436     if (CountSuffix(pattern, begin, end) > 0)
 437         return true;
 438     return false;
 439 }
 440
 441 bool CSA::IsEqual(uchar const *pattern) const
 442 {
 443     TextPosition m = std::strlen((char *)pattern);
 444     if (m == 0)
 445     {
 446         if (numberOfAllTexts - numberOfTexts > 0u)
 447             return true; // Empty texts exists
 448         return false; // No empty texts exists
 449     }
 450
 451     TextPosition sp = 0, ep = 0;
 452     // Match including end-marker
 453     Search(pattern, m+1, &sp, &ep);
 454
 455     // Check for end-marker(s) in result interval
 456     if (CountEndmarkers(sp, ep))
 457         return true;
 458     return false;
 459 }
 460
 461 bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 462 {
 463     TextPosition m = std::strlen((char *)pattern);
 464     if (m == 0)
 465     {
 466         if (numberOfAllTexts - numberOfTexts > 0u)
 467             return true; // Empty texts exists
 468         return false; // No empty texts exists
 469     }
 470
 471     TextPosition sp = 0, ep = 0;
 472     // Match including end-marker
 473     Search(pattern, m+1, &sp, &ep, begin, end);
 474
 475     // Check for end-marker(s) in result interval
 476     if (CountEndmarkers(sp, ep))
 477         return true;
 478     return false;
 479 }
 480
 481 bool CSA::IsContains(uchar const * pattern) const
 482 {
 483     TextPosition m = strlen((char *)pattern);
 484     if (m == 0)
 485         return true;
 486
 487     TextPosition sp = 0, ep = 0;
 488     // Just check if pattern exists somewhere
 489     ulong count = Search(pattern, m, &sp, &ep);
 490
 491     if (count > 0)
 492         return true;
 493     return false;
 494 }
 495
 496 bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
 497 {
 498     // Here counting is as fast as existential querying
 499     if (CountContains(pattern, begin, end) > 0)
 500         return true; // FIXME No need to filter result set
 501     return false;
 502 }
 503
 504 bool CSA::IsLessThan(uchar const * pattern) const
 505 {
 506     if (CountLessThan(pattern) > 0)
 507         return true;
 508     return false;
 509 }
 510
 511 bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 512 {
 513     if (CountLessThan(pattern, begin, end) > 0)
 514         return true;
 515     return false;
 516 }
 517
 518 /******************************************************************
 519  * Counting queries
 520  */
 521 ulong CSA::Count(uchar const * pattern) const
 522 {
 523     TextPosition m = strlen((char *)pattern);
 524     if (m == 0)
 525         return 0;
 526
 527     TextPosition sp = 0, ep = 0;
 528     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 529     return count;
 530 }
 531
 532 unsigned CSA::CountPrefix(uchar const * pattern) const
 533 {
 534     TextPosition m = strlen((char *)pattern);
 535     if (m == 0)
 536         return numberOfAllTexts;
 537
 538     TextPosition sp = 0, ep = 0;
 539     Search(pattern, m, &sp, &ep);
 540
 541     // Count end-markers in result interval
 542     return CountEndmarkers(sp, ep);
 543 }
 544
 545 unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 546 {
 547     TextPosition m = strlen((char *)pattern);
 548     if (m == 0)
 549         return numberOfAllTexts;
 550
 551     TextPosition sp = 0, ep = 0;
 552     Search(pattern, m, &sp, &ep);
 553
 554     // Count end-markers in result interval
 555     return CountEndmarkers(sp, ep, begin, end);
 556 }
 557
 558 unsigned CSA::CountSuffix(uchar const * pattern) const
 559 {
 560     TextPosition m = strlen((char *)pattern);
 561     if (m == 0)
 562         return numberOfAllTexts;
 563
 564     TextPosition sp = 0, ep = 0;
 565     // Search with end-marker
 566     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 567
 568     return count;
 569 }
 570
 571 unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 572 {
 573     TextPosition m = strlen((char *)pattern);
 574     if (m == 0)
 575         return numberOfAllTexts;
 576
 577     TextPosition sp = 0, ep = 0;
 578     // Search with end-marker
 579     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 580
 581     return count;
 582 }
 583
 584 unsigned CSA::CountEqual(uchar const *pattern) const
 585 {
 586     TextPosition m = strlen((char const *)pattern);
 587     if (m == 0)
 588         return numberOfAllTexts - numberOfTexts; // Empty texts.
 589
 590     TextPosition sp = 0, ep = 0;
 591     // Match including end-marker
 592     Search(pattern, m+1, &sp, &ep);
 593
 594     // Count end-markers in result interval
 595     return CountEndmarkers(sp, ep);
 596 }
 597
 598 unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 599 {
 600     TextPosition m = strlen((char const *)pattern);
 601     if (m == 0)
 602         return numberOfAllTexts - numberOfTexts; // Empty texts.
 603
 604     TextPosition sp = 0, ep = 0;
 605     // Match including end-marker
 606     Search(pattern, m+1, &sp, &ep, begin, end);
 607
 608     // Count end-markers in result interval
 609     return CountEndmarkers(sp, ep); // Already within [begin, end]
 610 }
 611
 612 unsigned CSA::CountContains(uchar const * pattern) const
 613 {
 614     TextPosition m = strlen((char const *)pattern);
 615     if (m == 0)
 616         return numberOfAllTexts; // Total number of texts.
 617
 618     // Here counting is as slow as fetching the result set
 619     // because we have to filter out occ's that fall within same document.
 620     TextCollection::document_result result = Contains(pattern);
 621     return result.size();
 622 }
 623
 624 unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
 625 {
 626     TextPosition m = strlen((char const *)pattern);
 627     if (m == 0)
 628         return numberOfAllTexts; // Total number of texts.
 629
 630     // Here counting is as slow as fetching the result set
 631     // because we have to filter out occ's that fall within same document.
 632     TextCollection::document_result result = Contains(pattern, begin, end);
 633     return result.size();
 634 }
 635
 636 // Less than or equal
 637 unsigned CSA::CountLessThan(uchar const * pattern) const
 638 {
 639     TextPosition m = strlen((char const *)pattern);
 640     if (m == 0)
 641         return numberOfAllTexts - numberOfTexts; // Empty texts.
 642
 643     TextPosition sp = 0, ep = 0;
 644     SearchLessThan(pattern, m, &sp, &ep);
 645
 646     // Count end-markers in result interval
 647     return CountEndmarkers(sp, ep);
 648 }
 649
 650 unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 651 {
 652     TextPosition m = strlen((char const *)pattern);
 653     if (m == 0)
 654         return numberOfAllTexts - numberOfTexts; // Empty texts.
 655
 656     TextPosition sp = 0, ep = 0;
 657     SearchLessThan(pattern, m, &sp, &ep);
 658
 659     // Count end-markers in result interval
 660     return CountEndmarkers(sp, ep, begin, end);
 661 }
 662
 663 /**
 664  * Document reporting queries
 665  */
 666 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 667 {
 668     TextPosition m = strlen((char *)pattern);
 669     if (m == 0)
 670         return TextCollection::document_result(); // FIXME Should return all 1...k
 671
 672     TextPosition sp = 0, ep = 0;
 673     Search(pattern, m, &sp, &ep);
 674
 675     TextCollection::document_result result;
 676
 677     // Report end-markers in result interval
 678     unsigned resultSize = CountEndmarkers(sp, ep);
 679     if (resultSize == 0)
 680         return result;
 681
 682     result.reserve(resultSize); // Try to avoid reallocation.
 683
 684     // Iterate through end-markers in [sp,ep]:
 685     unsigned i = 0;
 686     if (sp > 0)
 687         i = alphabetrank->rank(0, sp - 1);
 688     while (resultSize)
 689     {
 690         // End-marker we found belongs to the "preceeding" doc in the collection
 691         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 692         // Map to doc ID:
 693         docId = emptyTextRank->select0(docId+1);
 694         result.push_back(docId);
 695
 696         -- resultSize;
 697         ++ i;
 698     }
 699
 700     return result;
 701 }
 702
 703 TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
 704 {
 705     TextPosition m = strlen((char *)pattern);
 706     if (m == 0)
 707         return TextCollection::document_result(); // FIXME Should return all 1...k
 708
 709     TextPosition sp = 0, ep = 0;
 710     Search(pattern, m, &sp, &ep);
 711
 712     TextCollection::document_result result;
 713
 714     // Report end-markers in result interval
 715     unsigned resultSize = CountEndmarkers(sp, ep);
 716     if (resultSize == 0)
 717         return result;
 718
 719     result.reserve(resultSize); // Try to avoid reallocation.
 720
 721     // Iterate through end-markers in [sp,ep] and [begin, end]:
 722     unsigned i = 0;
 723     if (sp > 0)
 724         i = alphabetrank->rank(0, sp - 1);
 725     while (resultSize)
 726     {
 727         // End-marker we found belongs to the "preceeding" doc in the collection
 728         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 729         // Map to doc ID:
 730         docId = emptyTextRank->select0(docId+1);
 731         if (docId >= begin && docId <= end)
 732             result.push_back(docId);
 733
 734         -- resultSize;
 735         ++ i;
 736     }
 737
 738     return result;
 739 }
 740
 741 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 742 {
 743     TextPosition m = strlen((char *)pattern);
 744     if (m == 0)
 745         return TextCollection::document_result(); // FIXME Should return all 1...k
 746
 747     TextPosition sp = 0, ep = 0;
 748     // Search with end-marker
 749     Search(pattern, m+1, &sp, &ep);
 750
 751     TextCollection::document_result result;
 752     result.reserve(ep-sp+1); // Try to avoid reallocation.
 753
 754     ulong sampled_rank_i = 0;
 755     // Check each occurrence
 756     for (; sp <= ep; ++sp)
 757     {
 758         TextPosition i = sp;
 759
 760         uchar c = alphabetrank->access(i);
 761         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 762         {
 763             i = C[c]+alphabetrank->rank(c,i)-1;
 764             c = alphabetrank->access(i);
 765         }
 766         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 767
 768         if (c == '\0')
 769         {
 770             // Rank among the end-markers in BWT
 771             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 772
 773             // End-marker that we found belongs to the "preceeding" doc in collection:
 774             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 775             // Map to doc ID:
 776             docId = emptyTextRank->select0(docId+1);
 777             result.push_back(docId);
 778         }
 779         else // Sampled position
 780         {
 781             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 782             // Map to doc ID:
 783             docId = emptyTextRank->select0(docId+1);
 784             result.push_back(docId);
 785         }
 786     }
 787
 788     return result;
 789 }
 790
 791 TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
 792 {
 793     TextPosition m = strlen((char *)pattern);
 794     if (m == 0)
 795         return TextCollection::document_result(); // FIXME Should return all 1...k
 796
 797     TextPosition sp = 0, ep = 0;
 798     // Search with end-marker
 799     Search(pattern, m+1, &sp, &ep, begin, end);
 800
 801     TextCollection::document_result result;
 802     result.reserve(ep-sp+1); // Try to avoid reallocation.
 803
 804     ulong sampled_rank_i = 0;
 805     // Check each occurrence, already within [begin, end]
 806     for (; sp <= ep; ++sp)
 807     {
 808         TextPosition i = sp;
 809
 810         uchar c = alphabetrank->access(i);
 811         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 812         {
 813             i = C[c]+alphabetrank->rank(c,i)-1;
 814             c = alphabetrank->access(i);
 815         }
 816         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 817
 818         if (c == '\0')
 819         {
 820             // Rank among the end-markers in BWT
 821             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 822
 823             // End-marker that we found belongs to the "preceeding" doc in collection:
 824             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 825             // Map to doc ID:
 826             docId = emptyTextRank->select0(docId+1);
 827             result.push_back(docId);
 828         }
 829         else // Sampled position
 830         {
 831             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 832             // Map to doc ID:
 833             docId = emptyTextRank->select0(docId+1);
 834             result.push_back(docId);
 835         }
 836     }
 837
 838     return result;
 839 }
 840
 841
 842 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 843 {
 844     TextPosition m = strlen((char const *)pattern);
 845     if (m == 0)
 846         return TextCollection::document_result(); // FIXME Should return all empty texts
 847
 848     TextPosition sp = 0, ep = 0;
 849     // Match including end-marker
 850     Search(pattern, m+1, &sp, &ep);
 851
 852     TextCollection::document_result result;
 853
 854     // Report end-markers in result interval
 855     unsigned resultSize = CountEndmarkers(sp, ep);
 856     if (resultSize == 0)
 857         return result;
 858
 859     result.reserve(resultSize); // Try to avoid reallocation.
 860
 861     unsigned i = 0;
 862     if (sp > 0)
 863         i = alphabetrank->rank(0, sp - 1);
 864     while (resultSize)
 865     {
 866         // End-marker we found belongs to the "previous" doc in the collection
 867         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 868         // Map to doc ID:
 869         docId = emptyTextRank->select0(docId+1);
 870         result.push_back(docId);
 871
 872         -- resultSize;
 873         ++ i;
 874     }
 875
 876     return result;
 877 }
 878
 879 TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
 880 {
 881     TextPosition m = strlen((char const *)pattern);
 882     if (m == 0)
 883         return TextCollection::document_result(); // FIXME Should return all empty texts
 884
 885     TextPosition sp = 0, ep = 0;
 886     // Match including end-marker
 887     Search(pattern, m+1, &sp, &ep, begin, end);
 888
 889     TextCollection::document_result result;
 890
 891     // Report end-markers in result interval
 892     unsigned resultSize = CountEndmarkers(sp, ep);
 893     if (resultSize == 0)
 894         return result;
 895
 896     result.reserve(resultSize); // Try to avoid reallocation.
 897
 898     unsigned i = 0;
 899     if (sp > 0)
 900         i = alphabetrank->rank(0, sp - 1);
 901     while (resultSize)
 902     {
 903         // End-marker we found belongs to the "previous" doc in the collection
 904         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 905         // Map to doc ID:
 906         docId = emptyTextRank->select0(docId+1);
 907         result.push_back(docId); // already within [begin, end]
 908
 909         -- resultSize;
 910         ++ i;
 911     }
 912
 913     return result;
 914 }
 915
 916
 917 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 918 {
 919     TextPosition m = strlen((char *)pattern);
 920     if (m == 0)
 921         return TextCollection::document_result();
 922
 923     TextPosition sp = 0, ep = 0;
 924     // Search all occurrences
 925     Search(pattern, m, &sp, &ep);
 926
 927     // We want unique document indentifiers, using std::set to collect them
 928     std::set<DocId> resultSet;
 929
 930     ulong sampled_rank_i = 0;
 931     // Check each occurrence
 932     for (; sp <= ep; ++sp)
 933     {
 934         TextPosition i = sp;
 935         uchar c = alphabetrank->access(i);
 936         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 937         {
 938             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 939             c = alphabetrank->access(i);
 940         }
 941         if (c == '\0')
 942         {
 943             // Rank among the end-markers in BWT
 944             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 945
 946             // End-marker that we found belongs to the "preceeding" doc in collection:
 947             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 948             resultSet.insert(docId);
 949         }
 950         else
 951         {
 952             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 953             assert((unsigned)di < numberOfTexts);
 954             resultSet.insert(di);
 955         }
 956     }
 957
 958     // Convert std::set to std::vector
 959     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 960     // Map to doc ID's
 961     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 962         *it = emptyTextRank->select0(*it+1);
 963     return result;
 964 }
 965
 966 TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
 967 {
 968     TextPosition m = strlen((char *)pattern);
 969     if (m == 0)
 970         return TextCollection::document_result();
 971
 972     TextPosition sp = 0, ep = 0;
 973     // Search all occurrences
 974     Search(pattern, m, &sp, &ep);
 975
 976     // We want unique document indentifiers, using std::set to collect them
 977     std::set<DocId> resultSet;
 978
 979     ulong sampled_rank_i = 0;
 980     // Check each occurrence
 981     for (; sp <= ep; ++sp)
 982     {
 983         TextPosition i = sp;
 984         uchar c = alphabetrank->access(i);
 985         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 986         {
 987             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 988             c = alphabetrank->access(i);
 989         }
 990         if (c == '\0')
 991         {
 992             // Rank among the end-markers in BWT
 993             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 994
 995             // End-marker that we found belongs to the "preceeding" doc in collection:
 996             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 997             if (docId >= begin && docId <= end)
 998                 resultSet.insert(docId);
 999         }
1000         else
1001         {
1002             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1003             assert((unsigned)docId < numberOfTexts);
1004             if (docId >= begin && docId <= end)
1005                 resultSet.insert(docId);
1006         }
1007     }
1008
1009     // Convert std::set to std::vector
1010     TextCollection::document_result result(resultSet.begin(), resultSet.end());
1011     // Map to doc ID's
1012     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
1013         *it = emptyTextRank->select0(*it+1);
1014     return result;
1015 }
1016
1017 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
1018 {
1019     TextPosition m = strlen((char *)pattern);
1020     if (m == 0)
1021         return TextCollection::document_result(); // empty result set
1022
1023     TextPosition sp = 0, ep = 0;
1024     SearchLessThan(pattern, m, &sp, &ep);
1025
1026     TextCollection::document_result result;
1027
1028     // Report end-markers in result interval
1029     unsigned resultSize = CountEndmarkers(sp, ep);
1030     if (resultSize == 0)
1031         return result;
1032
1033     result.reserve(resultSize); // Try to avoid reallocation.
1034
1035     // Iterate through end-markers in [sp,ep]:
1036     unsigned i = 0;
1037     if (sp > 0)
1038         i = alphabetrank->rank(0, sp - 1);
1039     while (resultSize)
1040     {
1041         // End-marker we found belongs to the "preceeding" doc in the collection
1042         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1043         // Map to doc ID:
1044         docId = emptyTextRank->select0(docId+1);
1045         result.push_back(docId);
1046
1047         -- resultSize;
1048         ++ i;
1049     }
1050
1051     return result;
1052 }
1053
1054 TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
1055 {
1056     TextPosition m = strlen((char *)pattern);
1057     if (m == 0)
1058         return TextCollection::document_result(); // empty result set
1059
1060     TextPosition sp = 0, ep = 0;
1061     SearchLessThan(pattern, m, &sp, &ep);
1062
1063     TextCollection::document_result result;
1064
1065     // Report end-markers in result interval
1066     unsigned resultSize = CountEndmarkers(sp, ep);
1067     if (resultSize == 0)
1068         return result;
1069
1070     result.reserve(resultSize); // Try to avoid reallocation.
1071
1072     // Iterate through end-markers in [sp,ep] and [begin, end]:
1073     unsigned i = 0;
1074     if (sp > 0)
1075         i = alphabetrank->rank(0, sp - 1);
1076     while (resultSize)
1077     {
1078         // End-marker we found belongs to the "preceeding" doc in the collection
1079         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1080         // Map to doc ID:
1081         docId = emptyTextRank->select0(docId+1);
1082         if (docId >= begin && docId <= end)
1083             result.push_back(docId);
1084
1085         -- resultSize;
1086         ++ i;
1087     }
1088
1089     return result;
1090 }
1091
1092 /**
1093  * Full result set queries
1094  */
1095 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
1096 {
1097     TextPosition m = strlen((char *)pattern);
1098     if (m == 0)
1099         return full_result(); // FIXME Throw exception?
1100
1101     TextPosition sp = 0, ep = 0;
1102     // Search all occurrences
1103     Search(pattern, m, &sp, &ep);
1104
1105     full_result result;
1106     result.reserve(ep-sp+1); // Try to avoid reallocation.
1107
1108     ulong sampled_rank_i = 0;
1109     // Report each occurrence
1110     for (; sp <= ep; ++sp)
1111     {
1112         TextPosition i = sp;
1113         TextPosition dist = 0;
1114         uchar c = alphabetrank->access(i);
1115         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1116         {
1117             i = C[c]+alphabetrank->rank(c,i)-1;
1118             c = alphabetrank->access(i);
1119             ++ dist;
1120         }
1121         if (c == '\0')
1122         {
1123             // Rank among the end-markers in BWT
1124             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1125
1126             // End-marker that we found belongs to the "preceeding" doc in collection:
1127             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1128             // Map to doc ID:
1129             docId = emptyTextRank->select0(docId+1);
1130             result.push_back(make_pair(docId, dist));
1131         }
1132         else
1133         {
1134             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1135             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1136 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1137
1138             // Map to doc ID:
1139             docId = emptyTextRank->select0(docId+1);
1140             result.push_back(make_pair(docId, textPos));
1141         }
1142     }
1143
1144     return result;
1145 }
1146
1147 TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
1148 {
1149     TextPosition m = strlen((char *)pattern);
1150     if (m == 0)
1151         return full_result(); // FIXME Throw exception?
1152
1153     TextPosition sp = 0, ep = 0;
1154     // Search all occurrences
1155     Search(pattern, m, &sp, &ep);
1156
1157     full_result result;
1158     result.reserve(ep-sp+1); // Try to avoid reallocation.
1159
1160     ulong sampled_rank_i = 0;
1161     // Report each occurrence
1162     for (; sp <= ep; ++sp)
1163     {
1164         TextPosition i = sp;
1165         TextPosition dist = 0;
1166         uchar c = alphabetrank->access(i);
1167         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1168         {
1169             i = C[c]+alphabetrank->rank(c,i)-1;
1170             c = alphabetrank->access(i);
1171             ++ dist;
1172         }
1173         if (c == '\0')
1174         {
1175             // Rank among the end-markers in BWT
1176             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1177
1178             // End-marker that we found belongs to the "preceeding" doc in collection:
1179             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1180             // Map to doc ID:
1181             docId = emptyTextRank->select0(docId+1);
1182             if (docId >= begin && docId <= end)
1183                 result.push_back(make_pair(docId, dist));
1184         }
1185         else
1186         {
1187             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1188             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1189 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1190
1191             // Map to doc ID:
1192             docId = emptyTextRank->select0(docId+1);
1193             if (docId >= begin && docId <= end)
1194                 result.push_back(make_pair(docId, textPos));
1195         }
1196     }
1197
1198     return result;
1199 }
1200
1201
1202 /**
1203  * Save index to a file handle
1204  *
1205  * Throws a std::runtime_error exception on i/o error.
1206  * First byte that is saved represents the version number of the save file.
1207  * In version 2 files, the data fields are:
1208  *  uchar versionFlag;
1209     TextPosition n;
1210     unsigned samplerate;
1211     unsigned C[256];
1212     TextPosition bwtEndPos;
1213     static_sequence * alphabetrank;
1214     BSGAP *sampled;
1215     BlockArray *suffixes;
1216     BlockArray *suffixDocId;
1217     unsigned numberOfTexts;
1218     unsigned numberOfAllTexts;
1219     ulong maxTextLength;
1220     BlockArray *endmarkerDocId;
1221     BSGAP *emptyTextRank;
1222  */
1223 void CSA::Save(FILE *file) const
1224 {
1225     // Saving version info:
1226     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
1227         throw std::runtime_error("CSA::Save(): file write error (version flag).");
1228
1229     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
1230         throw std::runtime_error("CSA::Save(): file write error (n).");
1231     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
1232         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
1233
1234     for(ulong i = 0; i < 256; ++i)
1235         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
1236             throw std::runtime_error("CSA::Save(): file write error (C table).");
1237
1238     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1239         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
1240
1241     alphabetrank->save(file);
1242     sampled->Save(file);
1243     suffixes->Save(file);
1244     suffixDocId->Save(file);
1245
1246     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1247         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
1248     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1249         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
1250     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1251         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
1252
1253     endmarkerDocId->Save(file);
1254     emptyTextRank->Save(file);
1255     fflush(file);
1256 }
1257
1258
1259 /**
1260  * Load index from a file handle
1261  *
1262  * Throws a std::runtime_error exception on i/o error.
1263  * For more info, see CSA::Save().
1264  */
1265 void CSA::Load(FILE *file, unsigned samplerate)
1266 {
1267     // Delete everything
1268 #ifdef CSA_TEST_BWT
1269     delete dynFMI;       dynFMI = 0;
1270 #endif
1271     delete alphabetrank; alphabetrank = 0;
1272     delete sampled;      sampled = 0;
1273     delete suffixes;     suffixes = 0;
1274     delete suffixDocId;  suffixDocId = 0;
1275     delete [] codetable; codetable = 0;
1276
1277     delete endmarkerDocId; endmarkerDocId = 0;
1278     delete emptyTextRank;  emptyTextRank = 0;
1279
1280     this->maxTextLength = 0;
1281     this->numberOfTexts = 0;
1282     this->numberOfAllTexts = 0;
1283     this->samplerate = samplerate;
1284     this->n = 0;
1285
1286     uchar verFlag = 0;
1287     if (std::fread(&verFlag, 1, 1, file) != 1)
1288         throw std::runtime_error("CSA::Load(): file read error (version flag).");
1289     if (verFlag != CSA::versionFlag)
1290         throw std::runtime_error("CSA::Load(): invalid save file version.");
1291
1292     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1293         throw std::runtime_error("CSA::Load(): file read error (n).");
1294     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1295         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
1296 // FIXME samplerate can not be changed during load.
1297 //    if (this->samplerate == 0)
1298 //        this->samplerate = samplerate;
1299
1300     for(ulong i = 0; i < 256; ++i)
1301         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1302             throw std::runtime_error("CSA::Load(): file read error (C table).");
1303
1304     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1305         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
1306
1307     alphabetrank = static_sequence::load(file);
1308     sampled = new BSGAP(file);
1309     suffixes = new BlockArray(file);
1310     suffixDocId = new BlockArray(file);
1311
1312     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1313         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
1314     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1315         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
1316     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1317         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
1318
1319     endmarkerDocId = new BlockArray(file);
1320     emptyTextRank = new BSGAP(file);
1321
1322     // FIXME Construct data structures with new samplerate
1323     //maketables();
1324 }
1325
1326
1327
1328 /**
1329  * Rest of the functions follow...
1330  */
1331
1332
1333 // FIXME Use 2D-search structure
1334 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1335 {
1336     if (sp > ep)
1337         return 0;
1338
1339     ulong ranksp = 0;
1340     if (sp != 0)
1341         ranksp = alphabetrank->rank(0, sp - 1);
1342
1343     unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1344     if (resultSize == 0)
1345         return 0;
1346
1347     // Count end-markers in result interval and within [begin, end]
1348     unsigned count = 0;
1349     unsigned i = ranksp;
1350     while (resultSize)
1351     {
1352         // End-marker we found belongs to the "previous" doc in the collection
1353         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1354         // Map to doc ID:
1355         docId = emptyTextRank->select0(docId+1);
1356         if (docId >= begin && docId <= end)
1357             ++ count;
1358
1359         -- resultSize;
1360         ++ i;
1361     }
1362     return count;
1363 }
1364
1365 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1366 {
1367     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1368     int c = (int)pattern[m-1];
1369     TextPosition i=m-1;
1370     TextPosition sp = C[c];
1371     TextPosition ep = C[c+1]-1;
1372     while (sp<=ep && i>=1)
1373     {
1374 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1375         c = (int)pattern[--i];
1376         sp = C[c]+alphabetrank->rank(c,sp-1);
1377         ep = C[c]+alphabetrank->rank(c,ep)-1;
1378     }
1379     *spResult = sp;
1380     *epResult = ep;
1381     if (sp<=ep)
1382         return ep - sp + 1;
1383     else
1384         return 0;
1385 }
1386
1387 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1388 {
1389     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1390     int c = (int)pattern[m-1];
1391     assert(c == 0); // Start from endmarkers
1392     TextPosition i=m-1;
1393     TextPosition sp = begin;
1394     TextPosition ep = end;
1395     while (sp<=ep && i>=1)
1396     {
1397 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1398         c = (int)pattern[--i];
1399         sp = C[c]+alphabetrank->rank(c,sp-1);
1400         ep = C[c]+alphabetrank->rank(c,ep)-1;
1401     }
1402     *spResult = sp;
1403     *epResult = ep;
1404     if (sp<=ep)
1405         return ep - sp + 1;
1406     else
1407         return 0;
1408 }
1409
1410
1411 ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1412 {
1413     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1414     uint c = (int)pattern[m-1];
1415     TextPosition i=m-1;
1416     TextPosition sp = 1;
1417     TextPosition ep = C[c+1]-1;
1418     while (sp<=ep && i>=1)
1419     {
1420 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1421         c = (int)pattern[--i];
1422         uint result = alphabetrank->rank(c,ep);
1423         if (result == ~0u)
1424             ep = 0;
1425         else
1426             ep = C[c]+result-1;
1427     }
1428     *spResult = sp;
1429     *epResult = ep;
1430     if (sp<=ep)
1431         return ep - sp + 1;
1432     else
1433         return 0;
1434 }
1435
1436
1437 CSA::~CSA() {
1438 #ifdef CSA_TEST_BWT
1439     delete dynFMI;
1440 #endif
1441     delete alphabetrank;
1442     delete sampled;
1443     delete suffixes;
1444     delete suffixDocId;
1445     delete [] codetable; // FIXME remove
1446
1447     delete endmarkerDocId;
1448     delete emptyTextRank;
1449 }
1450
1451 void CSA::makewavelet(uchar *bwt)
1452 {
1453     ulong i, min = 0,
1454              max;
1455     for (i=0;i<256;i++)
1456         C[i]=0;
1457     for (i=0;i<n;++i)
1458         C[(int)bwt[i]]++;
1459     for (i=0;i<256;i++)
1460         if (C[i]>0) {min = i; break;}
1461     for (i=255;i>=min;--i)
1462         if (C[i]>0) {max = i; break;}
1463
1464     ulong prev=C[0], temp;
1465     C[0]=0;
1466     for (i=1;i<256;i++) {
1467         temp = C[i];
1468         C[i]=C[i-1]+prev;
1469         prev = temp;
1470     }
1471 //    this->codetable = node::makecodetable(bwt,n);
1472 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1473 //    delete [] bwt;
1474     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1475 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1476 //    std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1477
1478     HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1479
1480     alphabet_mapper * am = new alphabet_mapper_none();
1481     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1482 //    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1483     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1484     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1485     delete bmb;
1486     bwt = 0; // already deleted
1487
1488 //    std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1489 //    std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1490 }
1491
1492 void CSA::maketables()
1493 {
1494     // Calculate BWT end-marker position (of last inserted text)
1495     // Note: bwtEndPos already set!
1496     // and the length of the first text (counter l):
1497 #ifdef CSA_TEST_BWT
1498     {
1499         ulong i = 0;
1500         ulong l = 1;
1501         uint alphabetrank_i_tmp = 0;
1502         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1503         while (c != '\0')
1504         {
1505             i = C[c]+alphabetrank_i_tmp-1;
1506             ++l;
1507             c = alphabetrank->access(i, alphabetrank_i_tmp);
1508         }
1509
1510         if (i != bwtEndPos) // compare result
1511         {
1512             std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
1513             exit(0);
1514         }
1515     }
1516 #endif
1517
1518     // Build up arrays for text length and starting positions
1519     // FIXME Temp, remove
1520     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1521     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1522     //(*textLength)[0] = l;
1523     (*textStartPos)[0] = 0;
1524
1525     // Construct samples
1526     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1527     unsigned ceilLog2n = Tools::CeilLog2(n);
1528     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1529     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1530
1531     // Mapping from end-markers to doc ID's:
1532     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1533
1534     ulong *sampledpositions = new ulong[n/W+1];
1535     for (ulong i=0;i<n/W+1;i++)
1536         sampledpositions[i]=0lu;
1537
1538     ulong x,p=bwtEndPos;
1539     ulong sampleCount = 0;
1540     // Keeping track of text position of prev. end-marker seen
1541     ulong posOfSuccEndmarker = n;
1542     DocId textId = numberOfTexts;
1543     ulong ulongmax = 0;
1544     ulongmax--;
1545     uint alphabetrank_i_tmp =0;
1546
1547     //positions:
1548     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1549       // i substitutes SA->GetPos(i)
1550         x=(i==n-1)?0:i+1;
1551
1552         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1553             Tools::SetField(sampledpositions,1,p,1);
1554             (*positions)[sampleCount] = p;
1555             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1556             sampleCount ++;
1557         }
1558
1559         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1560         if (c == '\0')
1561         {
1562             --textId;
1563
1564             // Record the order of end-markers in BWT:
1565             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1566             (*endmarkerDocId)[endmarkerRank] = textId;
1567
1568             // Store text length and text start position:
1569             if (textId < (DocId)numberOfTexts - 1)
1570             {
1571                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1572                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1573                 posOfSuccEndmarker = x;
1574             }
1575
1576             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1577             p = textId; // Correct LF-mapping to the last char of the previous text.
1578         }
1579         else
1580             p = C[c]+alphabetrank_i_tmp-1;
1581     }
1582     assert(textId == 0);
1583
1584     sampled = new BSGAP(sampledpositions,n,true);
1585     sampleLength = sampled->rank(n-1);
1586     assert(sampleCount == sampleLength);
1587
1588     // Suffixes store an offset from the text start position
1589     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1590     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1591
1592     for(ulong i=0; i<sampleLength; i++) {
1593         assert((*positions)[i] < n);
1594         ulong j = sampled->rank((*positions)[i]);
1595         if (j==0) j=sampleLength;
1596         TextPosition textPos = (*tmpSuffix)[i];
1597         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1598
1599         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1600         assert((*suffixDocId)[j-1] < numberOfTexts);
1601         // calculate offset from text start:
1602         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1603     }
1604     // FIXME Temp, remove
1605     delete tmpSuffix;
1606     delete positions;
1607 //    delete textLength;
1608     delete textStartPos;
1609 }
1610
1611
1612 /**
1613  * Finds document identifier for given text position
1614  *
1615  * Starting text position of the document is stored into second parameter.
1616  * Binary searching on text starting positions.
1617  */
1618 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1619 {
1620     assert(i < n);
1621
1622     DocId a = 0;
1623     DocId b = numberOfTexts - 1;
1624     while (a < b)
1625     {
1626         DocId c = a + (b - a)/2;
1627         if ((*textStartPos)[c] > i)
1628             b = c - 1;
1629         else if ((*textStartPos)[c+1] > i)
1630             return c;
1631         else
1632             a = c + 1;
1633     }
1634
1635     assert(a < (DocId)numberOfTexts);
1636     assert(i >= (*textStartPos)[a]);
1637     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1638     return a;
1639 }
1640
1641 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1642 {
1643     TCodeEntry *result = new TCodeEntry[ 256 ];
1644
1645     count_chars( text, n, result );
1646     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1647 //
1648 // First I push all the leaf nodes into the queue
1649 //
1650     for ( unsigned int i = 0 ; i < 256 ; i++ )
1651         if ( result[ i ].count )
1652             q.push(node( i, result[ i ].count ) );
1653 //
1654 // This loop removes the two smallest nodes from the
1655 // queue.  It creates a new internal node that has
1656 // those two nodes as children. The new internal node
1657 // is then inserted into the priority queue.  When there
1658 // is only one node in the priority queue, the tree
1659 // is complete.
1660 //
1661
1662     while ( q.size() > 1 ) {
1663         node *child0 = new node( q.top() );
1664         q.pop();
1665         node *child1 = new node( q.top() );
1666         q.pop();
1667         q.push( node( child0, child1 ) );
1668     }
1669 //
1670 // Now I compute and return the codetable
1671 //
1672     q.top().maketable(0u,0u, result);
1673     q.pop();
1674     return result;
1675 }
1676
1677
1678 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1679 {
1680     if ( child0 )
1681     {
1682         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1683         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1684         delete child0;
1685         delete child1;
1686     }
1687     else
1688     {
1689         codetable[value].code = code;
1690         codetable[value].bits = bits;
1691     }
1692 }
1693
1694 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1695 {
1696     ulong i;
1697     for (i = 0 ; i < 256 ; i++ )
1698         counts[ i ].count = 0;
1699     for (i=0; i<n; i++)
1700         counts[(int)text[i]].count++;
1701 }
1702
1703 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1704       return x | (bit << pos);
1705 }
1706
1707 } // namespace SXSI
1708