CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22
  23 // Conflicts with std::min and std::max
  24 #undef min
  25 #undef max
  26 #include "dcover/bwt.hpp"
  27
  28 #include "HeapProfiler.h" // FIXME remove
  29
  30 #include <iostream>
  31 #include <queue>
  32 #include <set>
  33 #include <vector>
  34 #include <utility>
  35 #include <stdexcept>
  36 #include <cassert>
  37 #include <cstring> // For strlen()
  38 using std::vector;
  39 using std::pair;
  40 using std::make_pair;
  41 using std::map;
  42
  43 namespace SXSI
  44 {
  45
  46 // Save file version info
  47 const uchar CSA::versionFlag = 2;
  48
  49 ////////////////////////////////////////////////////////////////////////////
  50 // Class CSA::THuffAlphabetRank
  51 // FIXME Unused code
  52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  53     left = NULL;
  54     right = NULL;
  55     bitrank = NULL;
  56     ch = s[0];
  57     leaf = false;
  58     this->codetable = codetable;
  59
  60     bool *B = new bool[n];
  61     TextPosition sum=0,i;
  62     /*
  63     for (i=0; i< n; i++) {
  64        printf("%c:", (char)((int)s[i]-128));
  65        for (r=0;r<codetable[(int)s[i]].bits;r++)
  66           if (codetable[(int)s[i]].code & (1u <<r))
  67              printf("1");
  68           else printf("0");
  69        printf("\n");
  70     }
  71     printf("\n");
  72     if (level > 100) return;*/
  73     for (i=0;i<n;i++)
  74        if (codetable[(int)s[i]].code & (1u << level)) {
  75           B[i] = true;
  76           sum++;
  77        }
  78        else B[i] = false;
  79     if (sum==0 || sum==n) {
  80         delete [] B;
  81         leaf = true;
  82         return;
  83     }
  84     uchar *sfirst, *ssecond;
  85     //if (n-sum > 0)
  86         sfirst = new uchar[n-sum];
  87     //if (sum > 0)
  88         ssecond = new uchar[sum];
  89     unsigned j=0,k=0;
  90    for (i=0;i<n;i++)
  91         if (B[i]) ssecond[k++] = s[i];
  92         else sfirst[j++] = s[i];
  93     ulong *Binbits = new ulong[n/W+1];
  94     for (i=0;i<n;i++)
  95         Tools::SetField(Binbits,1,i,B[i]);
  96     delete [] B;
  97     bitrank = new BitRank(Binbits,n,true);
  98     //if (j > 0) {
  99         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
 100         delete [] sfirst;
 101     //}
 102     //if (k>0) {
 103         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
 104         delete [] ssecond;
 105     //}
 106 }
 107
 108
 109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 110     // testing that the code works correctly
 111     int C[256];
 112     unsigned i,j;
 113     bool correct=true;
 114     for (j=0;j<256;j++)
 115         C[j] = 0;
 116     for (i=0;i<n;i++) {
 117         C[(int)s[i]]++;
 118         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 119         correct = false;
 120         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 121         }
 122     }
 123     return correct;
 124 }
 125
 126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 127     if (left!=NULL) delete left;
 128     if (right!=NULL) delete right;
 129     if (bitrank!=NULL)
 130         delete bitrank;
 131 }
 132
 133
 134 ////////////////////////////////////////////////////////////////////////////
 135 // Class CSA
 136
 137 /**
 138  * Constructor inits an empty dynamic FM-index.
 139  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 140  */
 141 CSA::CSA(unsigned samplerate)
 142     : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
 143       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 144       emptyTextRank(0)
 145 {
 146     this->samplerate = samplerate;
 147
 148 #ifdef CSA_TEST_BWT
 149     // FIXME TODO : DynFMI needs distribution of characters before hand
 150     // This will create fully balanced wavelet tree for all chars [0, 255].
 151     uchar temp[256];
 152     for (unsigned i = 0; i < 255; ++i)
 153         temp[i] = i+1;
 154     temp[255] = 0;
 155     dynFMI = new DynFMI(temp, 1, 255, false);
 156
 157     /* Debug code: take char distribution from data.txt.
 158     uchar *temp = Tools::GetFileContents("data.txt", 0);
 159     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 160     delete [] temp;*/
 161 #endif
 162 }
 163
 164 /**
 165  * Insert new text
 166  *
 167  * Given text must include end-marker.
 168  * Text identifiers are numbered starting from 0.
 169  */
 170 void CSA::InsertText(uchar const * text)
 171 {
 172     // Sanity check:
 173 #ifdef CSA_TEST_BWT
 174     assert(dynFMI != 0);
 175 #endif
 176
 177     TextPosition m = std::strlen((char *)text) + 1;
 178     if (m > maxTextLength)
 179         maxTextLength = m; // Store length of the longest text seen so far.
 180
 181     if (m > 1)
 182     {
 183         this->n += m;
 184         this->numberOfTexts ++;
 185         this->numberOfAllTexts ++;
 186 #ifdef CSA_TEST_BWT
 187         dynFMI->addText(text, m);
 188 #endif
 189
 190         texts.append((char *) text);
 191         texts.append(1, '\0');
 192     }
 193     else
 194     {
 195         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 196         this->numberOfAllTexts ++;
 197     }
 198 }
 199
 200 void CSA::MakeStatic()
 201 {
 202     // Sanity check:
 203 #ifdef CSA_TEST_BWT
 204     if (dynFMI == 0)
 205         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 206 #endif
 207
 208     if (texts.size() == 0) // Empty collection
 209     {
 210          ++n;
 211          texts.append(1, '\0');
 212     }
 213
 214     // Bitvector of empty texts, FIXME Remove?
 215     {
 216         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 217         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 218         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 219             tempB[i] = 0;
 220         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 221             Tools::SetField(tempB, 1, (*it), 1);
 222         emptyTextId.clear();
 223         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 224     }
 225
 226     texts.reserve(0); // Release extra capacity
 227
 228 /*    FILE *fp = fopen("texts.txt", "wb");
 229     std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
 230     fclose(fp);*/
 231
 232     uchar *bwt = new uchar[n];
 233     {
 234         // More succinct solution via StringIterator, see below.
 235 /*        unsigned* maptexts = new unsigned[n+1];
 236         // Map text chars to [0..255+numberOfTexts]
 237         unsigned count = 0;
 238         for (ulong i = 0; i < n; ++i)
 239             if (texts[i] == 0)
 240                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
 241             else {
 242                 uchar c = (uchar)texts[i];
 243                 maptexts[i] = (unsigned)c + numberOfTexts;
 244             }
 245         maptexts[n] = '\0';
 246         assert(count == numberOfTexts);
 247
 248         std::cout << "maptext: ";
 249         for (ulong i = 0; i <= n; ++i)
 250             std::cout << maptexts[i] << ", ";
 251             std::cout << std::endl;*/
 252
 253         // Mark endmarker positions into bitvector
 254         ulong * endmarkers = new ulong[n/W+1];
 255         for (ulong i = 0; i < n/W+1; ++i)
 256             endmarkers[i] = 0;
 257         for (ulong i = 0; i < n; ++i)
 258             if (texts[i] == 0)
 259                 Tools::SetField(endmarkers, 1, i, 1);
 260         BitRank* br = new BitRank(endmarkers, n, true);
 261         assert(numberOfTexts == br->rank(n-1));
 262
 263         // Build iterators, FIXME clean up iterator construction.
 264         StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
 265         StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
 266
 267         bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
 268                                        &bwt[0], numberOfTexts);
 269
 270         bwt[--bwtEndPos] = '\0';
 271         texts.erase();
 272         texts.reserve(0); // Release capacity
 273         delete br; // bitvector of endmarkers
 274     } // End of bw transform
 275 //  std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
 276
 277 /*    fp = fopen("bwt.txt", "wb");
 278     std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
 279     fclose(fp);*/
 280
 281
 282 #ifdef CSA_TEST_BWT
 283     {
 284         uchar *bwtTest = dynFMI->getBWT();
 285         printf("123456789012345678901234567890123456789\n");
 286         for (TextPosition i = 0; i < n && i < 100; i ++)
 287             if (bwt[i] < 50)
 288                 printf("%d", (int)bwt[i]);
 289             else
 290                 printf("%c", bwt[i]);
 291         printf("\n");
 292         for (TextPosition i = 0; i < n && i < 100; i ++)
 293             if (bwtTest[i] < 50)
 294                 printf("%d", (int)bwtTest[i]);
 295             else
 296                 printf("%c", bwtTest[i]);
 297                 printf("\n");
 298
 299         // Sanity check
 300         assert(numberOfTexts == dynFMI->getCollectionSize());
 301
 302         delete dynFMI;
 303         dynFMI = 0;
 304         for (ulong i = 0; i < n; ++i)
 305             if (bwt[i] != bwtTest[i])
 306             {
 307                 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
 308                 assert(0);
 309             }
 310         delete [] bwtTest;
 311     }
 312 #endif // CSA_TEST_BWT
 313
 314     makewavelet(bwt); // Deletes bwt!
 315     bwt = 0;
 316
 317     // Make sampling tables
 318     maketables();
 319 }
 320
 321 bool CSA::EmptyText(DocId k) const
 322 {
 323     assert(k < (DocId)numberOfTexts);
 324     if (emptyTextRank->IsBitSet(k))
 325         return true;
 326     return false;
 327 }
 328
 329 uchar* CSA::GetText(DocId k) const
 330 {
 331     assert(k < (DocId)numberOfTexts);
 332
 333     ulong textRank = 0;
 334     if (emptyTextRank->IsBitSet(k, &textRank))
 335     {
 336         uchar* result = new uchar[1];
 337         result[0] = 0;
 338         return result;
 339     }
 340     // Map to non-empty text
 341     k -= textRank; //emptyTextRank->rank(k);
 342
 343     TextPosition i = k;
 344
 345     string result;
 346     // Reserve average string length to avoid reallocs
 347     result.reserve(n/numberOfTexts);
 348
 349     uchar c = alphabetrank->access(i);
 350     while (c != '\0')
 351     {
 352         result.push_back(c);
 353         i = C[c]+alphabetrank->rank(c,i)-1;
 354
 355         c = alphabetrank->access(i); // "next" char.
 356     }
 357
 358     // Convert to uchar (FIXME return string?)
 359     i = result.size();
 360     uchar* res = new uchar[i+1];
 361     res[i] = '\0';
 362     for (ulong j = 0; j < i; ++j)
 363         res[i-j-1] = result[j];
 364     return res;
 365 }
 366 /*
 367  * Not supported
 368 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 369 {
 370     assert(k < (DocId)numberOfTexts);
 371     assert(j < (*textLength)[k]);
 372     assert(i <= j);
 373
 374     ulong textRank = 0;
 375     if (emptyTextRank->IsBitSet(k, &textRank))
 376     {
 377         uchar* result = new uchar[1];
 378         result[0] = 0;
 379         return result;
 380     }
 381
 382     // Map to non-empty text
 383     k -= textRank; // emptyTextRank->rank(k);
 384
 385     // Start position of k'th text
 386     ulong start = (*textStartPos)[k];
 387
 388     return Substring(i + start, j-i+1);
 389     }*/
 390
 391 /******************************************************************
 392  * Existential queries
 393  */
 394 bool CSA::IsPrefix(uchar const * pattern) const
 395 {
 396     TextPosition m = strlen((char *)pattern);
 397     if (m == 0)
 398         return true;
 399
 400     TextPosition sp = 0, ep = 0;
 401     Search(pattern, m, &sp, &ep);
 402
 403     // Check for end-marker(s) in result interval
 404     if (CountEndmarkers(sp, ep))
 405         return true;
 406     return false;
 407 }
 408
 409 bool CSA::IsSuffix(uchar const *pattern) const
 410 {
 411     // Here counting is as fast as IsSuffix():
 412     if (CountSuffix(pattern) > 0)
 413         return true;
 414     return false;
 415 }
 416
 417 bool CSA::IsEqual(uchar const *pattern) const
 418 {
 419     TextPosition m = std::strlen((char *)pattern);
 420     if (m == 0)
 421     {
 422         if (numberOfAllTexts - numberOfTexts > 0u)
 423             return true; // Empty texts exists
 424         return false; // No empty texts exists
 425     }
 426
 427     TextPosition sp = 0, ep = 0;
 428     // Match including end-marker
 429     Search(pattern, m+1, &sp, &ep);
 430
 431     // Check for end-marker(s) in result interval
 432     if (CountEndmarkers(sp, ep))
 433         return true;
 434     return false;
 435 }
 436
 437 bool CSA::IsContains(uchar const * pattern) const
 438 {
 439     TextPosition m = strlen((char *)pattern);
 440     if (m == 0)
 441         return true;
 442
 443     TextPosition sp = 0, ep = 0;
 444     // Just check if pattern exists somewhere
 445     ulong count = Search(pattern, m, &sp, &ep);
 446
 447     if (count > 0)
 448         return true;
 449     return false;
 450 }
 451
 452 bool CSA::IsLessThan(uchar const * pattern) const
 453 {
 454     if (CountLessThan(pattern) > 0)
 455         return true;
 456     return false;
 457 }
 458
 459 /******************************************************************
 460  * Counting queries
 461  */
 462 ulong CSA::Count(uchar const * pattern) const
 463 {
 464     TextPosition m = strlen((char *)pattern);
 465     if (m == 0)
 466         return 0;
 467
 468     TextPosition sp = 0, ep = 0;
 469     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 470     return count;
 471 }
 472
 473 unsigned CSA::CountPrefix(uchar const * pattern) const
 474 {
 475     TextPosition m = strlen((char *)pattern);
 476     if (m == 0)
 477         return numberOfAllTexts;
 478
 479     TextPosition sp = 0, ep = 0;
 480     Search(pattern, m, &sp, &ep);
 481
 482     // Count end-markers in result interval
 483     return CountEndmarkers(sp, ep);
 484 }
 485
 486 unsigned CSA::CountSuffix(uchar const * pattern) const
 487 {
 488     TextPosition m = strlen((char *)pattern);
 489     if (m == 0)
 490         return numberOfAllTexts;
 491
 492     TextPosition sp = 0, ep = 0;
 493     // Search with end-marker
 494     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 495
 496     return count;
 497 }
 498
 499 unsigned CSA::CountEqual(uchar const *pattern) const
 500 {
 501     TextPosition m = strlen((char const *)pattern);
 502     if (m == 0)
 503         return numberOfAllTexts - numberOfTexts; // Empty texts.
 504
 505     TextPosition sp = 0, ep = 0;
 506     // Match including end-marker
 507     Search(pattern, m+1, &sp, &ep);
 508
 509     // Count end-markers in result interval
 510     return CountEndmarkers(sp, ep);
 511 }
 512
 513 unsigned CSA::CountContains(uchar const * pattern) const
 514 {
 515     TextPosition m = strlen((char const *)pattern);
 516     if (m == 0)
 517         return numberOfAllTexts; // Total number of texts.
 518
 519     // Here counting is as slow as fetching the result set
 520     // because we would have to filter out occ's that fall within same document.
 521     TextCollection::document_result result = Contains(pattern);
 522     return result.size();
 523 }
 524
 525 // Less than or equal
 526 unsigned CSA::CountLessThan(uchar const * pattern) const
 527 {
 528     TextPosition m = strlen((char const *)pattern);
 529     if (m == 0)
 530         return numberOfAllTexts - numberOfTexts; // Empty texts.
 531
 532     TextPosition sp = 0, ep = 0;
 533     SearchLessThan(pattern, m, &sp, &ep);
 534
 535     // Count end-markers in result interval
 536     return CountEndmarkers(sp, ep);
 537 }
 538
 539 /**
 540  * Document reporting queries
 541  */
 542 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 543 {
 544     TextPosition m = strlen((char *)pattern);
 545     if (m == 0)
 546         return TextCollection::document_result(); // FIXME Should return all 1...k
 547
 548     TextPosition sp = 0, ep = 0;
 549     Search(pattern, m, &sp, &ep);
 550
 551     TextCollection::document_result result;
 552
 553     // Report end-markers in result interval
 554     unsigned resultSize = CountEndmarkers(sp, ep);
 555     if (resultSize == 0)
 556         return result;
 557
 558     result.reserve(resultSize); // Try to avoid reallocation.
 559
 560     // Iterate through end-markers in [sp,ep]:
 561     unsigned i = 0;
 562     if (sp > 0)
 563         i = alphabetrank->rank(0, sp - 1);
 564     while (resultSize)
 565     {
 566         // End-marker we found belongs to the "preceeding" doc in the collection
 567         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 568         // Map to doc ID:
 569         docId = emptyTextRank->select0(docId+1);
 570         result.push_back(docId);
 571
 572         -- resultSize;
 573         ++ i;
 574     }
 575
 576     return result;
 577 }
 578
 579 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 580 {
 581     TextPosition m = strlen((char *)pattern);
 582     if (m == 0)
 583         return TextCollection::document_result(); // FIXME Should return all 1...k
 584
 585     TextPosition sp = 0, ep = 0;
 586     // Search with end-marker
 587     Search(pattern, m+1, &sp, &ep);
 588
 589     TextCollection::document_result result;
 590     result.reserve(ep-sp+1); // Try to avoid reallocation.
 591
 592     ulong sampled_rank_i = 0;
 593     // Check each occurrence
 594     for (; sp <= ep; ++sp)
 595     {
 596         TextPosition i = sp;
 597
 598         uchar c = alphabetrank->access(i);
 599         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 600         {
 601             i = C[c]+alphabetrank->rank(c,i)-1;
 602             c = alphabetrank->access(i);
 603         }
 604         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 605
 606         if (c == '\0')
 607         {
 608             // Rank among the end-markers in BWT
 609             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 610
 611             // End-marker that we found belongs to the "preceeding" doc in collection:
 612             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 613             // Map to doc ID:
 614             docId = emptyTextRank->select0(docId+1);
 615             result.push_back(docId);
 616         }
 617         else // Sampled position
 618         {
 619             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 620             // Map to doc ID:
 621             docId = emptyTextRank->select0(docId+1);
 622             result.push_back(docId);
 623         }
 624     }
 625
 626     return result;
 627 }
 628
 629 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 630 {
 631     TextPosition m = strlen((char const *)pattern);
 632     if (m == 0)
 633         return TextCollection::document_result(); // FIXME Should return all empty texts
 634
 635     TextPosition sp = 0, ep = 0;
 636     // Match including end-marker
 637     Search(pattern, m+1, &sp, &ep);
 638
 639     TextCollection::document_result result;
 640
 641     // Report end-markers in result interval
 642     unsigned resultSize = CountEndmarkers(sp, ep);
 643     if (resultSize == 0)
 644         return result;
 645
 646     result.reserve(resultSize); // Try to avoid reallocation.
 647
 648     unsigned i = 0;
 649     if (sp > 0)
 650         i = alphabetrank->rank(0, sp - 1);
 651     while (resultSize)
 652     {
 653         // End-marker we found belongs to the "previous" doc in the collection
 654         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 655         // Map to doc ID:
 656         docId = emptyTextRank->select0(docId+1);
 657         result.push_back(docId);
 658
 659         -- resultSize;
 660         ++ i;
 661     }
 662
 663     return result;
 664 }
 665
 666
 667 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 668 {
 669     TextPosition m = strlen((char *)pattern);
 670     if (m == 0)
 671         return TextCollection::document_result();
 672
 673     TextPosition sp = 0, ep = 0;
 674     // Search all occurrences
 675     Search(pattern, m, &sp, &ep);
 676
 677     // We want unique document indentifiers, using std::set to collect them
 678     std::set<DocId> resultSet;
 679
 680     ulong sampled_rank_i = 0;
 681     // Check each occurrence
 682     for (; sp <= ep; ++sp)
 683     {
 684         TextPosition i = sp;
 685         uchar c = alphabetrank->access(i);
 686         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 687         {
 688             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 689             c = alphabetrank->access(i);
 690         }
 691         if (c == '\0')
 692         {
 693             // Rank among the end-markers in BWT
 694             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 695
 696             // End-marker that we found belongs to the "preceeding" doc in collection:
 697             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 698             resultSet.insert(docId);
 699         }
 700         else
 701         {
 702             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 703             assert((unsigned)di < numberOfTexts);
 704             resultSet.insert(di);
 705         }
 706     }
 707
 708     // Convert std::set to std::vector
 709     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 710     // Map to doc ID's
 711     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 712         *it = emptyTextRank->select0(*it+1);
 713     return result;
 714 }
 715
 716 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 717 {
 718     TextPosition m = strlen((char *)pattern);
 719     if (m == 0)
 720         return TextCollection::document_result(); // empty result set
 721
 722     TextPosition sp = 0, ep = 0;
 723     SearchLessThan(pattern, m, &sp, &ep);
 724
 725     TextCollection::document_result result;
 726
 727     // Report end-markers in result interval
 728     unsigned resultSize = CountEndmarkers(sp, ep);
 729     if (resultSize == 0)
 730         return result;
 731
 732     result.reserve(resultSize); // Try to avoid reallocation.
 733
 734     // Iterate through end-markers in [sp,ep]:
 735     unsigned i = 0;
 736     if (sp > 0)
 737         i = alphabetrank->rank(0, sp - 1);
 738     while (resultSize)
 739     {
 740         // End-marker we found belongs to the "preceeding" doc in the collection
 741         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 742         // Map to doc ID:
 743         docId = emptyTextRank->select0(docId+1);
 744         result.push_back(docId);
 745
 746         -- resultSize;
 747         ++ i;
 748     }
 749
 750     return result;
 751 }
 752
 753 /**
 754  * Full result set queries
 755  */
 756 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 757 {
 758     TextPosition m = strlen((char *)pattern);
 759     if (m == 0)
 760         return full_result(); // FIXME Throw exception?
 761
 762     TextPosition sp = 0, ep = 0;
 763     // Search all occurrences
 764     Search(pattern, m, &sp, &ep);
 765
 766     full_result result;
 767     result.reserve(ep-sp+1); // Try to avoid reallocation.
 768
 769     ulong sampled_rank_i = 0;
 770     // Report each occurrence
 771     for (; sp <= ep; ++sp)
 772     {
 773         TextPosition i = sp;
 774         TextPosition dist = 0;
 775         uchar c = alphabetrank->access(i);
 776         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 777         {
 778             i = C[c]+alphabetrank->rank(c,i)-1;
 779             c = alphabetrank->access(i);
 780             ++ dist;
 781         }
 782         if (c == '\0')
 783         {
 784             // Rank among the end-markers in BWT
 785             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 786
 787             // End-marker that we found belongs to the "preceeding" doc in collection:
 788             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 789             // Map to doc ID:
 790             docId = emptyTextRank->select0(docId+1);
 791             result.push_back(make_pair(docId, dist));
 792         }
 793         else
 794         {
 795             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 796             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 797 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 798
 799             // Map to doc ID:
 800             docId = emptyTextRank->select0(docId+1);
 801             result.push_back(make_pair(docId, textPos));
 802         }
 803     }
 804
 805     return result;
 806 }
 807
 808
 809 /**
 810  * Save index to a file handle
 811  *
 812  * Throws a std::runtime_error exception on i/o error.
 813  * First byte that is saved represents the version number of the save file.
 814  * In version 2 files, the data fields are:
 815  *  uchar versionFlag;
 816     TextPosition n;
 817     unsigned samplerate;
 818     unsigned C[256];
 819     TextPosition bwtEndPos;
 820     static_sequence * alphabetrank;
 821     BSGAP *sampled;
 822     BlockArray *suffixes;
 823     BlockArray *suffixDocId;
 824     unsigned numberOfTexts;
 825     unsigned numberOfAllTexts;
 826     ulong maxTextLength;
 827     BlockArray *endmarkerDocId;
 828     BSGAP *emptyTextRank;
 829  */
 830 void CSA::Save(FILE *file) const
 831 {
 832     // Saving version info:
 833     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 834         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 835
 836     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 837         throw std::runtime_error("CSA::Save(): file write error (n).");
 838     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 839         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 840
 841     for(ulong i = 0; i < 256; ++i)
 842         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 843             throw std::runtime_error("CSA::Save(): file write error (C table).");
 844
 845     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 846         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 847
 848     alphabetrank->save(file);
 849     sampled->Save(file);
 850     suffixes->Save(file);
 851     suffixDocId->Save(file);
 852
 853     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 854         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
 855     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 856         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
 857     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 858         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 859
 860     endmarkerDocId->Save(file);
 861     emptyTextRank->Save(file);
 862     fflush(file);
 863 }
 864
 865
 866 /**
 867  * Load index from a file handle
 868  *
 869  * Throws a std::runtime_error exception on i/o error.
 870  * For more info, see CSA::Save().
 871  */
 872 void CSA::Load(FILE *file, unsigned samplerate)
 873 {
 874     // Delete everything
 875 #ifdef CSA_TEST_BWT
 876     delete dynFMI;       dynFMI = 0;
 877 #endif
 878     delete alphabetrank; alphabetrank = 0;
 879     delete sampled;      sampled = 0;
 880     delete suffixes;     suffixes = 0;
 881     delete suffixDocId;  suffixDocId = 0;
 882     delete [] codetable; codetable = 0;
 883
 884     delete endmarkerDocId; endmarkerDocId = 0;
 885     delete emptyTextRank;  emptyTextRank = 0;
 886
 887     this->maxTextLength = 0;
 888     this->numberOfTexts = 0;
 889     this->numberOfAllTexts = 0;
 890     this->samplerate = samplerate;
 891     this->n = 0;
 892
 893     uchar verFlag = 0;
 894     if (std::fread(&verFlag, 1, 1, file) != 1)
 895         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 896     if (verFlag != CSA::versionFlag)
 897         throw std::runtime_error("CSA::Load(): invalid save file version.");
 898
 899     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 900         throw std::runtime_error("CSA::Load(): file read error (n).");
 901     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 902         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 903 // FIXME samplerate can not be changed during load.
 904 //    if (this->samplerate == 0)
 905 //        this->samplerate = samplerate;
 906
 907     for(ulong i = 0; i < 256; ++i)
 908         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 909             throw std::runtime_error("CSA::Load(): file read error (C table).");
 910
 911     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 912         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 913
 914     alphabetrank = static_sequence::load(file);
 915     sampled = new BSGAP(file);
 916     suffixes = new BlockArray(file);
 917     suffixDocId = new BlockArray(file);
 918
 919     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 920         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
 921     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 922         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
 923     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 924         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 925
 926     endmarkerDocId = new BlockArray(file);
 927     emptyTextRank = new BSGAP(file);
 928
 929     // FIXME Construct data structures with new samplerate
 930     //maketables();
 931 }
 932
 933
 934
 935 /**
 936  * Rest of the functions follow...
 937  */
 938
 939
 940 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 941 {
 942     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 943     int c = (int)pattern[m-1];
 944     TextPosition i=m-1;
 945     TextPosition sp = C[c];
 946     TextPosition ep = C[c+1]-1;
 947     while (sp<=ep && i>=1)
 948     {
 949 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 950         c = (int)pattern[--i];
 951         sp = C[c]+alphabetrank->rank(c,sp-1);
 952         ep = C[c]+alphabetrank->rank(c,ep)-1;
 953     }
 954     *spResult = sp;
 955     *epResult = ep;
 956     if (sp<=ep)
 957         return ep - sp + 1;
 958     else
 959         return 0;
 960 }
 961
 962
 963 ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 964 {
 965     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 966     uint c = (int)pattern[m-1];
 967     TextPosition i=m-1;
 968     TextPosition sp = 1;
 969     TextPosition ep = C[c+1]-1;
 970     while (sp<=ep && i>=1)
 971     {
 972 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 973         c = (int)pattern[--i];
 974         uint result = alphabetrank->rank(c,ep);
 975         if (result == ~0u)
 976             ep = 0;
 977         else
 978             ep = C[c]+result-1;
 979     }
 980     *spResult = sp;
 981     *epResult = ep;
 982     if (sp<=ep)
 983         return ep - sp + 1;
 984     else
 985         return 0;
 986 }
 987
 988
 989 CSA::~CSA() {
 990 #ifdef CSA_TEST_BWT
 991     delete dynFMI;
 992 #endif
 993     delete alphabetrank;
 994     delete sampled;
 995     delete suffixes;
 996     delete suffixDocId;
 997     delete [] codetable; // FIXME remove
 998
 999     delete endmarkerDocId;
1000     delete emptyTextRank;
1001 }
1002
1003 void CSA::makewavelet(uchar *bwt)
1004 {
1005     ulong i, min = 0,
1006              max;
1007     for (i=0;i<256;i++)
1008         C[i]=0;
1009     for (i=0;i<n;++i)
1010         C[(int)bwt[i]]++;
1011     for (i=0;i<256;i++)
1012         if (C[i]>0) {min = i; break;}
1013     for (i=255;i>=min;--i)
1014         if (C[i]>0) {max = i; break;}
1015
1016     ulong prev=C[0], temp;
1017     C[0]=0;
1018     for (i=1;i<256;i++) {
1019         temp = C[i];
1020         C[i]=C[i-1]+prev;
1021         prev = temp;
1022     }
1023 //    this->codetable = node::makecodetable(bwt,n);
1024 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1025 //    delete [] bwt;
1026     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1027   std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1028     std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1029
1030     HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1031
1032     alphabet_mapper * am = new alphabet_mapper_none();
1033     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1034 //    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1035     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1036     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1037     delete bmb;
1038     bwt = 0; // already deleted
1039
1040     std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1041     std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1042 }
1043
1044 void CSA::maketables()
1045 {
1046     // Calculate BWT end-marker position (of last inserted text)
1047     // Note: bwtEndPos already set!
1048     // and the length of the first text (counter l):
1049 #ifdef CSA_TEST_BWT
1050     {
1051         ulong i = 0;
1052         ulong l = 1;
1053         uint alphabetrank_i_tmp = 0;
1054         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1055         while (c != '\0')
1056         {
1057             i = C[c]+alphabetrank_i_tmp-1;
1058             ++l;
1059             c = alphabetrank->access(i, alphabetrank_i_tmp);
1060         }
1061
1062         if (i != bwtEndPos) // compare result
1063         {
1064             std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
1065             exit(0);
1066         }
1067     }
1068 #endif
1069
1070     // Build up arrays for text length and starting positions
1071     // FIXME Temp, remove
1072     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1073     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1074     //(*textLength)[0] = l;
1075     (*textStartPos)[0] = 0;
1076
1077     // Construct samples
1078     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1079     unsigned ceilLog2n = Tools::CeilLog2(n);
1080     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1081     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1082
1083     // Mapping from end-markers to doc ID's:
1084     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1085
1086     ulong *sampledpositions = new ulong[n/W+1];
1087     for (ulong i=0;i<n/W+1;i++)
1088         sampledpositions[i]=0lu;
1089
1090     ulong x,p=bwtEndPos;
1091     ulong sampleCount = 0;
1092     // Keeping track of text position of prev. end-marker seen
1093     ulong posOfSuccEndmarker = n;
1094     DocId textId = numberOfTexts;
1095     ulong ulongmax = 0;
1096     ulongmax--;
1097     uint alphabetrank_i_tmp =0;
1098
1099     //positions:
1100     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1101       // i substitutes SA->GetPos(i)
1102         x=(i==n-1)?0:i+1;
1103
1104         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1105             Tools::SetField(sampledpositions,1,p,1);
1106             (*positions)[sampleCount] = p;
1107             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1108             sampleCount ++;
1109         }
1110
1111         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1112         if (c == '\0')
1113         {
1114             --textId;
1115
1116             // Record the order of end-markers in BWT:
1117             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1118             (*endmarkerDocId)[endmarkerRank] = textId;
1119
1120             // Store text length and text start position:
1121             if (textId < (DocId)numberOfTexts - 1)
1122             {
1123                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1124                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1125                 posOfSuccEndmarker = x;
1126             }
1127
1128             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1129             p = textId; // Correct LF-mapping to the last char of the previous text.
1130         }
1131         else
1132             p = C[c]+alphabetrank_i_tmp-1;
1133     }
1134     assert(textId == 0);
1135
1136     sampled = new BSGAP(sampledpositions,n,true);
1137     sampleLength = sampled->rank(n-1);
1138     assert(sampleCount == sampleLength);
1139
1140     // Suffixes store an offset from the text start position
1141     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1142     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1143
1144     for(ulong i=0; i<sampleLength; i++) {
1145         assert((*positions)[i] < n);
1146         ulong j = sampled->rank((*positions)[i]);
1147         if (j==0) j=sampleLength;
1148         TextPosition textPos = (*tmpSuffix)[i];
1149         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1150
1151         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1152         assert((*suffixDocId)[j-1] < numberOfTexts);
1153         // calculate offset from text start:
1154         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1155     }
1156     // FIXME Temp, remove
1157     delete tmpSuffix;
1158     delete positions;
1159 //    delete textLength;
1160     delete textStartPos;
1161 }
1162
1163
1164 /**
1165  * Finds document identifier for given text position
1166  *
1167  * Starting text position of the document is stored into second parameter.
1168  * Binary searching on text starting positions.
1169  */
1170 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1171 {
1172     assert(i < n);
1173
1174     DocId a = 0;
1175     DocId b = numberOfTexts - 1;
1176     while (a < b)
1177     {
1178         DocId c = a + (b - a)/2;
1179         if ((*textStartPos)[c] > i)
1180             b = c - 1;
1181         else if ((*textStartPos)[c+1] > i)
1182             return c;
1183         else
1184             a = c + 1;
1185     }
1186
1187     assert(a < (DocId)numberOfTexts);
1188     assert(i >= (*textStartPos)[a]);
1189     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1190     return a;
1191 }
1192
1193 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1194 {
1195     TCodeEntry *result = new TCodeEntry[ 256 ];
1196
1197     count_chars( text, n, result );
1198     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1199 //
1200 // First I push all the leaf nodes into the queue
1201 //
1202     for ( unsigned int i = 0 ; i < 256 ; i++ )
1203         if ( result[ i ].count )
1204             q.push(node( i, result[ i ].count ) );
1205 //
1206 // This loop removes the two smallest nodes from the
1207 // queue.  It creates a new internal node that has
1208 // those two nodes as children. The new internal node
1209 // is then inserted into the priority queue.  When there
1210 // is only one node in the priority queue, the tree
1211 // is complete.
1212 //
1213
1214     while ( q.size() > 1 ) {
1215         node *child0 = new node( q.top() );
1216         q.pop();
1217         node *child1 = new node( q.top() );
1218         q.pop();
1219         q.push( node( child0, child1 ) );
1220     }
1221 //
1222 // Now I compute and return the codetable
1223 //
1224     q.top().maketable(0u,0u, result);
1225     q.pop();
1226     return result;
1227 }
1228
1229
1230 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1231 {
1232     if ( child0 )
1233     {
1234         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1235         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1236         delete child0;
1237         delete child1;
1238     }
1239     else
1240     {
1241         codetable[value].code = code;
1242         codetable[value].bits = bits;
1243     }
1244 }
1245
1246 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1247 {
1248     ulong i;
1249     for (i = 0 ; i < 256 ; i++ )
1250         counts[ i ].count = 0;
1251     for (i=0; i<n; i++)
1252         counts[(int)text[i]].count++;
1253 }
1254
1255 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1256       return x | (bit << pos);
1257 }
1258
1259 } // namespace SXSI
1260