CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <stdexcept>
  29 #include <cassert>
  30 #include <cstring> // For strlen()
  31 using std::vector;
  32 using std::pair;
  33 using std::make_pair;
  34 using std::map;
  35
  36 using SXSI::TextCollection;
  37
  38 // Save file version info
  39 const uchar CSA::versionFlag = 2;
  40
  41 ////////////////////////////////////////////////////////////////////////////
  42 // Class CSA::THuffAlphabetRank
  43 // FIXME Unused code
  44 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  45     left = NULL;
  46     right = NULL;
  47     bitrank = NULL;
  48     ch = s[0];
  49     leaf = false;
  50     this->codetable = codetable;
  51
  52     bool *B = new bool[n];
  53     TextPosition sum=0,i;
  54     /*
  55     for (i=0; i< n; i++) {
  56        printf("%c:", (char)((int)s[i]-128));
  57        for (r=0;r<codetable[(int)s[i]].bits;r++)
  58           if (codetable[(int)s[i]].code & (1u <<r))
  59              printf("1");
  60           else printf("0");
  61        printf("\n");
  62     }
  63     printf("\n");
  64     if (level > 100) return;*/
  65     for (i=0;i<n;i++)
  66        if (codetable[(int)s[i]].code & (1u << level)) {
  67           B[i] = true;
  68           sum++;
  69        }
  70        else B[i] = false;
  71     if (sum==0 || sum==n) {
  72         delete [] B;
  73         leaf = true;
  74         return;
  75     }
  76     uchar *sfirst, *ssecond;
  77     //if (n-sum > 0)
  78         sfirst = new uchar[n-sum];
  79     //if (sum > 0)
  80         ssecond = new uchar[sum];
  81     unsigned j=0,k=0;
  82    for (i=0;i<n;i++)
  83         if (B[i]) ssecond[k++] = s[i];
  84         else sfirst[j++] = s[i];
  85     ulong *Binbits = new ulong[n/W+1];
  86     for (i=0;i<n;i++)
  87         Tools::SetField(Binbits,1,i,B[i]);
  88     delete [] B;
  89     bitrank = new BitRank(Binbits,n,true);
  90     //if (j > 0) {
  91         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  92         delete [] sfirst;
  93     //}
  94     //if (k>0) {
  95         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  96         delete [] ssecond;
  97     //}
  98 }
  99
 100
 101 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 102     // testing that the code works correctly
 103     int C[256];
 104     unsigned i,j;
 105     bool correct=true;
 106     for (j=0;j<256;j++)
 107         C[j] = 0;
 108     for (i=0;i<n;i++) {
 109         C[(int)s[i]]++;
 110         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 111         correct = false;
 112         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 113         }
 114     }
 115     return correct;
 116 }
 117
 118 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 119     if (left!=NULL) delete left;
 120     if (right!=NULL) delete right;
 121     if (bitrank!=NULL)
 122         delete bitrank;
 123 }
 124
 125
 126 ////////////////////////////////////////////////////////////////////////////
 127 // Class CSA
 128
 129 /**
 130  * Constructor inits an empty dynamic FM-index.
 131  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 132  */
 133 CSA::CSA(unsigned samplerate)
 134     : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0),
 135       codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 136       textLength(0), textStartPos(0), emptyTextRank(0)
 137 {
 138     this->samplerate = samplerate;
 139
 140     // FIXME TODO : DynFMI needs distribution of characters before hand
 141     // This will create fully balanced wavelet tree for all chars [0, 255].
 142     uchar temp[256];
 143     for (unsigned i = 0; i < 255; ++i)
 144         temp[i] = i+1;
 145     temp[255] = 0;
 146     dynFMI = new DynFMI(temp, 1, 255, false);
 147
 148     /* Debug code: take char distribution from data.txt.
 149     uchar *temp = Tools::GetFileContents("data.txt", 0);
 150     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 151     delete [] temp;*/
 152 }
 153
 154 /**
 155  * Insert new text
 156  *
 157  * Given text must include end-marker.
 158  * Text identifiers are numbered starting from 0.
 159  */
 160 void CSA::InsertText(uchar const * text)
 161 {
 162     // Sanity check:
 163     assert(dynFMI != 0);
 164
 165     TextPosition m = std::strlen((char *)text) + 1;
 166     if (m > maxTextLength)
 167         maxTextLength = m; // Store length of the longest text seen so far.
 168
 169     if (m > 1)
 170     {
 171         this->n += m;
 172         this->numberOfTexts ++;
 173         this->numberOfAllTexts ++;
 174         dynFMI->addText(text, m);
 175     }
 176     else
 177     {
 178         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 179         this->numberOfAllTexts ++;            // Replace with dynamic bitvector
 180     }
 181 }
 182
 183 void CSA::MakeStatic()
 184 {
 185     // Sanity check:
 186     if (dynFMI == 0)
 187         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 188
 189     // Bitvector of empty texts
 190     {
 191         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 192         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 193         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 194             tempB[i] = 0;
 195         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 196             Tools::SetField(tempB, 1, (*it), 1);
 197         emptyTextId.clear();
 198         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 199     }
 200
 201     uchar *bwt = dynFMI->getBWT();
 202     /*printf("123456789012345678901234567890123456789\n");
 203     for (TextPosition i = 0; i < n; i ++)
 204         if (bwt[i] < 50)
 205             printf("%d", (int)bwt[i]);
 206         else
 207             printf("%c", bwt[i]);
 208             printf("\n");*/
 209
 210
 211 /*    for (TextPosition i = 1; i <= n; i ++)
 212     {
 213         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 214     }*/
 215
 216     // Sanity check
 217     assert(numberOfTexts == dynFMI->getCollectionSize());
 218
 219     delete dynFMI;
 220     dynFMI = 0;
 221
 222     makewavelet(bwt); // Deletes bwt!
 223     bwt = 0;
 224
 225     // Calculate BWT end-marker position (of last inserted text)
 226     // and the length of the first text (counter l):
 227     ulong i = 0;
 228     ulong l = 1;
 229
 230     uchar c  = alphabetrank->access(i);
 231     while (c != '\0')
 232     {
 233         i = C[c]+alphabetrank->rank(c, i)-1;
 234         ++l;
 235         c = alphabetrank->access(i);
 236     }
 237     bwtEndPos = i;
 238     assert(bwtEndPos < n);
 239     //printf("bwtEndPos = %lu\n", bwtEndPos);
 240
 241     // Build up arrays for text length and starting positions
 242     // FIXME Remove, temp data
 243     textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
 244     textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 245     (*textLength)[0] = l;
 246     (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
 247
 248     // Make sampling tables
 249     maketables();
 250 }
 251
 252 bool CSA::EmptyText(DocId k) const
 253 {
 254     assert(k < (DocId)numberOfTexts);
 255     if (emptyTextRank->IsBitSet(k))
 256         return true;
 257     return false;
 258 }
 259
 260 uchar* CSA::GetText(DocId k) const
 261 {
 262     assert(k < (DocId)numberOfTexts);
 263
 264     ulong textRank = 0;
 265     if (emptyTextRank->IsBitSet(k, &textRank))
 266     {
 267         uchar* result = new uchar[1];
 268         result[0] = 0;
 269         return result;
 270     }
 271     // Map to non-empty text
 272     k -= textRank; //emptyTextRank->rank(k);
 273
 274     TextPosition i = k;
 275
 276     string result;
 277     // Reserve average string length to avoid reallocs
 278     result.reserve(n/numberOfTexts);
 279
 280     uchar c = alphabetrank->access(i);
 281     while (c != '\0')
 282     {
 283         result.push_back(c);
 284         i = C[c]+alphabetrank->rank(c,i)-1;
 285
 286         c = alphabetrank->access(i); // "next" char.
 287     }
 288
 289     // Convert to uchar (FIXME return string?)
 290     i = result.size();
 291     uchar* res = new uchar[i+1];
 292     res[i] = '\0';
 293     for (ulong j = 0; j < i; ++j)
 294         res[i-j-1] = result[j];
 295     return res;
 296 }
 297 /*
 298  * Not supported
 299 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 300 {
 301     assert(k < (DocId)numberOfTexts);
 302     assert(j < (*textLength)[k]);
 303     assert(i <= j);
 304
 305     ulong textRank = 0;
 306     if (emptyTextRank->IsBitSet(k, &textRank))
 307     {
 308         uchar* result = new uchar[1];
 309         result[0] = 0;
 310         return result;
 311     }
 312
 313     // Map to non-empty text
 314     k -= textRank; // emptyTextRank->rank(k);
 315
 316     // Start position of k'th text
 317     ulong start = (*textStartPos)[k];
 318
 319     return Substring(i + start, j-i+1);
 320     }*/
 321
 322 /******************************************************************
 323  * Existential queries
 324  */
 325 bool CSA::IsPrefix(uchar const * pattern) const
 326 {
 327     TextPosition m = strlen((char *)pattern);
 328     if (m == 0)
 329         return true;
 330
 331     TextPosition sp = 0, ep = 0;
 332     Search(pattern, m, &sp, &ep);
 333
 334     // Check for end-marker(s) in result interval
 335     if (CountEndmarkers(sp, ep))
 336         return true;
 337     return false;
 338 }
 339
 340 bool CSA::IsSuffix(uchar const *pattern) const
 341 {
 342     // Here counting is as fast as IsSuffix():
 343     if (CountSuffix(pattern) > 0)
 344         return true;
 345     return false;
 346 }
 347
 348 bool CSA::IsEqual(uchar const *pattern) const
 349 {
 350     TextPosition m = std::strlen((char *)pattern);
 351     if (m == 0)
 352     {
 353         if (numberOfAllTexts - numberOfTexts > 0u)
 354             return true; // Empty texts exists
 355         return false; // No empty texts exists
 356     }
 357
 358     TextPosition sp = 0, ep = 0;
 359     // Match including end-marker
 360     Search(pattern, m+1, &sp, &ep);
 361
 362     // Check for end-marker(s) in result interval
 363     if (CountEndmarkers(sp, ep))
 364         return true;
 365     return false;
 366 }
 367
 368 bool CSA::IsContains(uchar const * pattern) const
 369 {
 370     TextPosition m = strlen((char *)pattern);
 371     if (m == 0)
 372         return true;
 373
 374     TextPosition sp = 0, ep = 0;
 375     // Just check if pattern exists somewhere
 376     ulong count = Search(pattern, m, &sp, &ep);
 377
 378     if (count > 0)
 379         return true;
 380     return false;
 381 }
 382
 383 bool CSA::IsLessThan(uchar const*) const
 384 {
 385     // TODO
 386     assert(0);
 387     return false;
 388 }
 389
 390 /******************************************************************
 391  * Counting queries
 392  */
 393 ulong CSA::Count(uchar const * pattern) const
 394 {
 395     TextPosition m = strlen((char *)pattern);
 396     if (m == 0)
 397         return 0;
 398
 399     TextPosition sp = 0, ep = 0;
 400     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 401     return count;
 402 }
 403
 404 unsigned CSA::CountPrefix(uchar const * pattern) const
 405 {
 406     TextPosition m = strlen((char *)pattern);
 407     if (m == 0)
 408         return numberOfAllTexts;
 409
 410     TextPosition sp = 0, ep = 0;
 411     Search(pattern, m, &sp, &ep);
 412
 413     // Count end-markers in result interval
 414     return CountEndmarkers(sp, ep);
 415 }
 416
 417 unsigned CSA::CountSuffix(uchar const * pattern) const
 418 {
 419     TextPosition m = strlen((char *)pattern);
 420     if (m == 0)
 421         return numberOfAllTexts;
 422
 423     TextPosition sp = 0, ep = 0;
 424     // Search with end-marker
 425     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 426
 427     return count;
 428 }
 429
 430 unsigned CSA::CountEqual(uchar const *pattern) const
 431 {
 432     TextPosition m = strlen((char const *)pattern);
 433     if (m == 0)
 434         return numberOfAllTexts - numberOfTexts; // Empty texts.
 435
 436     TextPosition sp = 0, ep = 0;
 437     // Match including end-marker
 438     Search(pattern, m+1, &sp, &ep);
 439
 440     // Count end-markers in result interval
 441     return CountEndmarkers(sp, ep);
 442 }
 443
 444 unsigned CSA::CountContains(uchar const * pattern) const
 445 {
 446     TextPosition m = strlen((char const *)pattern);
 447     if (m == 0)
 448         return numberOfAllTexts; // Total number of texts.
 449
 450     // Here counting is as slow as fetching the result set
 451     // because we would have to filter out occ's that fall within same document.
 452     TextCollection::document_result result = Contains(pattern);
 453     return result.size();
 454 }
 455
 456 unsigned CSA::CountLessThan(uchar const * pattern) const
 457 {
 458     // TODO
 459     assert(0);
 460     return 0;
 461 }
 462
 463 /**
 464  * Document reporting queries
 465  */
 466 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 467 {
 468     TextPosition m = strlen((char *)pattern);
 469     if (m == 0)
 470         return TextCollection::document_result(); // FIXME Should return all 1...k
 471
 472     TextPosition sp = 0, ep = 0;
 473     Search(pattern, m, &sp, &ep);
 474
 475     TextCollection::document_result result;
 476
 477     // Report end-markers in result interval
 478     unsigned resultSize = CountEndmarkers(sp, ep);
 479     if (resultSize == 0)
 480         return result;
 481
 482     result.reserve(resultSize); // Try to avoid reallocation.
 483
 484     // Iterate through end-markers in [sp,ep]:
 485     unsigned i = 0;
 486     if (sp > 0)
 487         i = alphabetrank->rank(0, sp - 1);
 488     while (resultSize)
 489     {
 490         // End-marker we found belongs to the "preceeding" doc in the collection
 491         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 492         // Map to doc ID:
 493         docId = emptyTextRank->select0(docId+1);
 494         result.push_back(docId);
 495
 496         -- resultSize;
 497         ++ i;
 498     }
 499
 500     return result;
 501 }
 502
 503 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 504 {
 505     TextPosition m = strlen((char *)pattern);
 506     if (m == 0)
 507         return TextCollection::document_result(); // FIXME Should return all 1...k
 508
 509     TextPosition sp = 0, ep = 0;
 510     // Search with end-marker
 511     Search(pattern, m+1, &sp, &ep);
 512
 513     TextCollection::document_result result;
 514     result.reserve(ep-sp+1); // Try to avoid reallocation.
 515
 516     ulong sampled_rank_i = 0;
 517     // Check each occurrence
 518     for (; sp <= ep; ++sp)
 519     {
 520         TextPosition i = sp;
 521
 522         uchar c = alphabetrank->access(i);
 523         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 524         {
 525             i = C[c]+alphabetrank->rank(c,i)-1;
 526             c = alphabetrank->access(i);
 527         }
 528         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 529
 530         if (c == '\0')
 531         {
 532             // Rank among the end-markers in BWT
 533             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 534
 535             // End-marker that we found belongs to the "preceeding" doc in collection:
 536             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 537             // Map to doc ID:
 538             docId = emptyTextRank->select0(docId+1);
 539             result.push_back(docId);
 540         }
 541         else // Sampled position
 542         {
 543             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 544             // Map to doc ID:
 545             docId = emptyTextRank->select0(docId+1);
 546             result.push_back(docId);
 547         }
 548     }
 549
 550     return result;
 551 }
 552
 553 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 554 {
 555     TextPosition m = strlen((char const *)pattern);
 556     if (m == 0)
 557         return TextCollection::document_result(); // FIXME Should return all empty texts
 558
 559     TextPosition sp = 0, ep = 0;
 560     // Match including end-marker
 561     Search(pattern, m+1, &sp, &ep);
 562
 563     TextCollection::document_result result;
 564
 565     // Report end-markers in result interval
 566     unsigned resultSize = CountEndmarkers(sp, ep);
 567     if (resultSize == 0)
 568         return result;
 569
 570     result.reserve(resultSize); // Try to avoid reallocation.
 571
 572     unsigned i = 0;
 573     if (sp > 0)
 574         i = alphabetrank->rank(0, sp - 1);
 575     while (resultSize)
 576     {
 577         // End-marker we found belongs to the "previous" doc in the collection
 578         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 579         // Map to doc ID:
 580         docId = emptyTextRank->select0(docId+1);
 581         result.push_back(docId);
 582
 583         -- resultSize;
 584         ++ i;
 585     }
 586
 587     return result;
 588 }
 589
 590
 591 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 592 {
 593     TextPosition m = strlen((char *)pattern);
 594     if (m == 0)
 595         return TextCollection::document_result();
 596
 597     TextPosition sp = 0, ep = 0;
 598     // Search all occurrences
 599     Search(pattern, m, &sp, &ep);
 600
 601     // We want unique document indentifiers, using std::set to collect them
 602     std::set<DocId> resultSet;
 603
 604     ulong sampled_rank_i = 0;
 605     // Check each occurrence
 606     for (; sp <= ep; ++sp)
 607     {
 608         TextPosition i = sp;
 609         uchar c = alphabetrank->access(i);
 610         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 611         {
 612             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 613             c = alphabetrank->access(i);
 614         }
 615         if (c == '\0')
 616         {
 617             // Rank among the end-markers in BWT
 618             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 619
 620             // End-marker that we found belongs to the "preceeding" doc in collection:
 621             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 622             resultSet.insert(docId);
 623         }
 624         else
 625         {
 626             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 627             assert((unsigned)di < numberOfTexts);
 628             resultSet.insert(di);
 629         }
 630     }
 631
 632     // Convert std::set to std::vector
 633     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 634     // Map to doc ID's
 635     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 636         *it = emptyTextRank->select0(*it+1);
 637     return result;
 638 }
 639
 640 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 641 {
 642     // TODO
 643     assert(0);
 644     return document_result();
 645 }
 646
 647 /**
 648  * Full result set queries
 649  */
 650 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 651 {
 652     TextPosition m = strlen((char *)pattern);
 653     if (m == 0)
 654         return full_result(); // FIXME Throw exception?
 655
 656     TextPosition sp = 0, ep = 0;
 657     // Search all occurrences
 658     Search(pattern, m, &sp, &ep);
 659
 660     full_result result;
 661     result.reserve(ep-sp+1); // Try to avoid reallocation.
 662
 663     ulong sampled_rank_i = 0;
 664     // Report each occurrence
 665     for (; sp <= ep; ++sp)
 666     {
 667         TextPosition i = sp;
 668         TextPosition dist = 0;
 669         uchar c = alphabetrank->access(i);
 670         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 671         {
 672             i = C[c]+alphabetrank->rank(c,i)-1;
 673             c = alphabetrank->access(i);
 674             ++ dist;
 675         }
 676         if (c == '\0')
 677         {
 678             // Rank among the end-markers in BWT
 679             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 680
 681             // End-marker that we found belongs to the "preceeding" doc in collection:
 682             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 683             // Map to doc ID:
 684             docId = emptyTextRank->select0(docId+1);
 685             result.push_back(make_pair(docId, dist));
 686         }
 687         else
 688         {
 689             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 690             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 691 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 692
 693             // Map to doc ID:
 694             docId = emptyTextRank->select0(docId+1);
 695             result.push_back(make_pair(docId, textPos));
 696         }
 697     }
 698
 699     return result;
 700 }
 701
 702
 703 /**
 704  * Save index to a file handle
 705  *
 706  * Throws a std::runtime_error exception on i/o error.
 707  * First byte that is saved represents the version number of the save file.
 708  * In version 2 files, the data fields are:
 709  *  uchar versionFlag;
 710     TextPosition n;
 711     unsigned samplerate;
 712     unsigned C[256];
 713     TextPosition bwtEndPos;
 714     static_sequence * alphabetrank;
 715     BSGAP *sampled;
 716     BlockArray *suffixes;
 717     BlockArray *suffixDocId;
 718     unsigned numberOfTexts;
 719     unsigned numberOfAllTexts;
 720     ulong maxTextLength;
 721     BlockArray *endmarkerDocId;
 722     BSGAP *emptyTextRank;
 723  */
 724 void CSA::Save(FILE *file) const
 725 {
 726     // Saving version info:
 727     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 728         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 729
 730     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 731         throw std::runtime_error("CSA::Save(): file write error (n).");
 732     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 733         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 734
 735     for(ulong i = 0; i < 256; ++i)
 736         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 737             throw std::runtime_error("CSA::Save(): file write error (C table).");
 738
 739     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 740         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 741
 742     alphabetrank->save(file);
 743     sampled->Save(file);
 744     suffixes->Save(file);
 745     suffixDocId->Save(file);
 746
 747     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 748         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
 749     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 750         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
 751     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 752         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 753
 754     endmarkerDocId->Save(file);
 755     emptyTextRank->Save(file);
 756     fflush(file);
 757 }
 758
 759
 760 /**
 761  * Load index from a file handle
 762  *
 763  * Throws a std::runtime_error exception on i/o error.
 764  * For more info, see CSA::Save().
 765  */
 766 void CSA::Load(FILE *file, unsigned samplerate)
 767 {
 768     // Delete everything
 769     delete dynFMI;       dynFMI = 0;
 770     delete alphabetrank; alphabetrank = 0;
 771     delete sampled;      sampled = 0;
 772     delete suffixes;     suffixes = 0;
 773     delete suffixDocId;  suffixDocId = 0;
 774     delete positions; positions = 0;
 775     delete [] codetable; codetable = 0;
 776
 777     delete endmarkerDocId; endmarkerDocId = 0;
 778     delete emptyTextRank;  emptyTextRank = 0;
 779     // FIXME Remove following:
 780     delete textLength;     textLength = 0;
 781     delete textStartPos;   textStartPos = 0;
 782
 783     this->maxTextLength = 0;
 784     this->numberOfTexts = 0;
 785     this->numberOfAllTexts = 0;
 786     this->samplerate = samplerate;
 787     this->n = 0;
 788
 789     uchar verFlag = 0;
 790     if (std::fread(&verFlag, 1, 1, file) != 1)
 791         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 792     if (verFlag != CSA::versionFlag)
 793         throw std::runtime_error("CSA::Load(): invalid save file version.");
 794
 795     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 796         throw std::runtime_error("CSA::Load(): file read error (n).");
 797     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 798         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 799 // FIXME samplerate can not be changed during load.
 800 //    if (this->samplerate == 0)
 801 //        this->samplerate = samplerate;
 802
 803     for(ulong i = 0; i < 256; ++i)
 804         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 805             throw std::runtime_error("CSA::Load(): file read error (C table).");
 806
 807     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 808         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 809
 810     alphabetrank = static_sequence::load(file);
 811     sampled = new BSGAP(file);
 812     suffixes = new BlockArray(file);
 813     suffixDocId = new BlockArray(file);
 814
 815     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 816         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
 817     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 818         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
 819     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 820         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 821
 822     endmarkerDocId = new BlockArray(file);
 823     emptyTextRank = new BSGAP(file);
 824
 825     // FIXME Construct data structures with new samplerate
 826     //maketables();  // FIXME: this will redo text length tables
 827 }
 828
 829
 830
 831 /**
 832  * Rest of the functions follow...
 833  */
 834
 835 /*
 836  * Not supported
 837 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 838 {
 839     uchar *result = new uchar[l + 1];
 840     if (l == 0)
 841     {
 842         result[0] = 0u;
 843         return result;
 844     }
 845
 846     TextPosition dist;
 847     TextPosition k = i + l - 1;
 848     // Check for end of the string
 849     if (k > n - 1)
 850     {
 851         l -= k - n + 1;
 852         k = n - 1;
 853     }
 854
 855     TextPosition skip = samplerate - k % samplerate - 1;
 856     TextPosition j;
 857     if (k / samplerate + 1 >= n / samplerate)
 858     {
 859         j = bwtEndPos;
 860         skip = n - k - 1;
 861     }
 862     else
 863     {
 864         j = (*positions)[k/samplerate+1];
 865         //cout << samplerate << ' ' << j << '\n';
 866     }
 867
 868     for (dist = 0; dist < skip + l; dist++)
 869     {
 870         int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp);
 871         if (c == '\0')
 872         {
 873             // Rank among the end-markers in BWT
 874             unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
 875             j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
 876         }
 877         else
 878             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 879         if (dist >= skip)
 880             result[l + skip - dist - 1] = c;
 881     }
 882     result[l] = 0u;
 883     return result;
 884 }
 885 */
 886   /*TextPosition CSA::inverse(TextPosition i)
 887 {
 888     TextPosition skip = samplerate - i % samplerate;
 889     TextPosition j;
 890     if (i / samplerate + 1 >= n / samplerate)
 891     {
 892         j = bwtEndPos;
 893         skip = n - i;
 894     }
 895     else
 896     {
 897         j = (*positions)[i/samplerate+1];
 898         //cout << samplerate << ' ' << j << '\n';
 899     }
 900
 901     while (skip > 0)
 902     {
 903         int c = alphabetrank->charAtPos(j);
 904         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 905         skip --;
 906     }
 907     return j;
 908     }*/
 909
 910 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 911 {
 912     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 913     int c = (int)pattern[m-1];
 914     TextPosition i=m-1;
 915     TextPosition sp = C[c];
 916     TextPosition ep = C[c+1]-1;
 917     while (sp<=ep && i>=1)
 918     {
 919 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 920         c = (int)pattern[--i];
 921         sp = C[c]+alphabetrank->rank(c,sp-1);
 922         ep = C[c]+alphabetrank->rank(c,ep)-1;
 923     }
 924     *spResult = sp;
 925     *epResult = ep;
 926     if (sp<=ep)
 927         return ep - sp + 1;
 928     else
 929         return 0;
 930 }
 931
 932    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 933 {
 934     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 935     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 936
 937     if (sp<=ep)
 938         return ep - sp + 1;
 939     else
 940         return 0;
 941         }*/
 942
 943 CSA::TextPosition CSA::Lookup(TextPosition i) const
 944 {
 945     TextPosition dist=0;
 946     while (!sampled->IsBitSet(i))
 947     {
 948
 949         int c = alphabetrank->access(i);
 950         if (c == '\0')
 951         {
 952             // End-markers are sampled
 953             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 954             DocId docId = (*endmarkerDocId)[endmarkerRank];
 955             return (*textStartPos)[docId] + dist;
 956         }
 957         i = C[c]+alphabetrank->rank(c,i)-1;
 958         ++dist;
 959     }
 960
 961     return (*suffixes)[sampled->rank(i)-1]+dist;
 962 }
 963
 964 CSA::~CSA() {
 965     delete dynFMI;
 966     delete alphabetrank;
 967     delete sampled;
 968     delete suffixes;
 969     delete suffixDocId;
 970     delete positions;
 971     delete [] codetable;
 972
 973     delete endmarkerDocId;
 974     delete textLength;
 975     delete textStartPos;
 976     delete emptyTextRank;
 977 }
 978
 979 void CSA::makewavelet(uchar *bwt)
 980 {
 981     ulong i, min = 0,
 982              max;
 983     for (i=0;i<256;i++)
 984         C[i]=0;
 985     for (i=0;i<n;++i)
 986         C[(int)bwt[i]]++;
 987     for (i=0;i<256;i++)
 988         if (C[i]>0) {min = i; break;}
 989     for (i=255;i>=min;--i)
 990         if (C[i]>0) {max = i; break;}
 991
 992     // Print frequencies
 993     /*for(i = 0; i < 256; i++)
 994         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
 995         fflush(stdout);*/
 996
 997     ulong prev=C[0], temp;
 998     C[0]=0;
 999     for (i=1;i<256;i++) {
1000         temp = C[i];
1001         C[i]=C[i-1]+prev;
1002         prev = temp;
1003     }
1004 //    this->codetable = node::makecodetable(bwt,n);
1005 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1006 //    delete [] bwt;
1007     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1008
1009     uint *text = new uint[n];
1010     for (i = 0; i < n; ++i) // Silly
1011         text[i] = bwt[i];
1012     delete [] bwt;
1013     bwt = 0;
1014
1015     alphabet_mapper * am = new alphabet_mapper_none();
1016     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1017
1018     //cout << "Building Huffman table..."; cout.flush();
1019
1020     wt_coder * wtc = new wt_coder_huff(text,n,am);
1021
1022     //cout << "done" << endl; cout.flush();
1023     //cout << "Building static_sequence..."; cout.flush();
1024
1025     alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
1026     delete [] text;
1027     text = 0;
1028
1029 /*    for (i = 0; i < n; ++i)
1030     {
1031         uchar c = alphabetrank->charAtPos(i);
1032         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
1033         printf("LF[%lu] = %lu\n", i, j);
1034         }*/
1035 }
1036
1037 void CSA::maketables()
1038 {
1039     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1040
1041     ulong *sampledpositions = new ulong[n/W+1];
1042     unsigned ceilLog2n = Tools::CeilLog2(n);
1043     positions = new BlockArray(sampleLength, ceilLog2n);
1044     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1045
1046     // Mapping from end-markers to doc ID's:
1047     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1048
1049     ulong i,j=0;
1050     for (i=0;i<n/W+1;i++)
1051         sampledpositions[i]=0lu;
1052
1053     ulong x,p=bwtEndPos;
1054     ulong sampleCount = 0;
1055     // Keeping track of text position of end-markers seen
1056     ulong posOfSuccEndmarker = n;
1057     DocId textId = numberOfTexts;
1058     ulong ulongmax = 0;
1059     ulongmax--;
1060
1061     //positions:
1062     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1063       // i substitutes SA->GetPos(i)
1064         x=(i==n-1)?0:i+1;
1065
1066         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1067             Tools::SetField(sampledpositions,1,p,1);
1068             (*positions)[sampleCount] = p;
1069             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1070             sampleCount ++;
1071         }
1072
1073         uchar c = alphabetrank->access(p);
1074         if (c == '\0')
1075         {
1076             --textId;
1077
1078             // Record the order of end-markers in BWT:
1079             ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1080             (*endmarkerDocId)[endmarkerRank] = textId;
1081
1082             // Store text length and text start position:
1083             if (textId < (DocId)numberOfTexts - 1)
1084             {
1085                 (*textLength)[textId + 1] = posOfSuccEndmarker - x;
1086                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1087                 posOfSuccEndmarker = x;
1088             }
1089
1090             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1091             p = textId; // Correct LF-mapping to the last char of the previous text.
1092         }
1093         else
1094             p = C[c]+alphabetrank->rank(c, p)-1;
1095     }
1096     assert(textId == 0);
1097
1098     /*i = 0;
1099     for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
1100     {
1101         int docc = (*endmarkerDocId)[i];
1102         ulong poss = (*endmarkerPos)[i];
1103         printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
1104         }*/
1105 /*
1106     for (i = 0; i < numberOfTexts; ++ i)
1107     {
1108         //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
1109         assert(dynTextLength[i].first == (*textLength)[i]);
1110         assert(dynTextLength[i].second == (*textStartPos)[i]);
1111     }
1112 */
1113     sampled = new BSGAP(sampledpositions,n,true);
1114     sampleLength = sampled->rank(n-1);
1115     assert(sampleCount == sampleLength);
1116 //    std::cout << ";sampleLength;" << sampleLength << std::endl;
1117     // Suffixes == offset from text start position
1118     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1119     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1120
1121    //suffixes:
1122     for(i=0; i<sampleLength; i++) {
1123         assert((*positions)[i] < n);
1124         j = sampled->rank((*positions)[i]);
1125         if (j==0) j=sampleLength;
1126         TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate;
1127         (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]);
1128
1129         assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts);
1130         assert((*suffixDocId)[j-1] < numberOfTexts);
1131         // calculate offset from text start:
1132         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1133     }
1134     // FIXME Temp, remove
1135     delete tmpSuffix;
1136     tmpSuffix=0;
1137     delete positions;
1138     positions =0;
1139     delete textLength;
1140     textLength = 0;
1141     delete textStartPos;
1142     textStartPos = 0;
1143 }
1144
1145
1146 /**
1147  * Finds document identifier for given text position
1148  *
1149  * Starting text position of the document is stored into second parameter.
1150  * Binary searching on text starting positions.
1151  */
1152 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
1153 {
1154     assert(i < n);
1155
1156     DocId a = 0;
1157     DocId b = numberOfTexts - 1;
1158     while (a < b)
1159     {
1160         DocId c = a + (b - a)/2;
1161         if ((*textStartPos)[c] > i)
1162             b = c - 1;
1163         else if ((*textStartPos)[c+1] > i)
1164             return c;
1165         else
1166             a = c + 1;
1167     }
1168
1169     assert(a < (DocId)numberOfTexts);
1170     assert(i >= (*textStartPos)[a]);
1171     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1172     return a;
1173 }
1174
1175 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1176 {
1177     TCodeEntry *result = new TCodeEntry[ 256 ];
1178
1179     count_chars( text, n, result );
1180     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1181 //
1182 // First I push all the leaf nodes into the queue
1183 //
1184     for ( unsigned int i = 0 ; i < 256 ; i++ )
1185         if ( result[ i ].count )
1186             q.push(node( i, result[ i ].count ) );
1187 //
1188 // This loop removes the two smallest nodes from the
1189 // queue.  It creates a new internal node that has
1190 // those two nodes as children. The new internal node
1191 // is then inserted into the priority queue.  When there
1192 // is only one node in the priority queue, the tree
1193 // is complete.
1194 //
1195
1196     while ( q.size() > 1 ) {
1197         node *child0 = new node( q.top() );
1198         q.pop();
1199         node *child1 = new node( q.top() );
1200         q.pop();
1201         q.push( node( child0, child1 ) );
1202     }
1203 //
1204 // Now I compute and return the codetable
1205 //
1206     q.top().maketable(0u,0u, result);
1207     q.pop();
1208     return result;
1209 }
1210
1211
1212 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1213 {
1214     if ( child0 )
1215     {
1216         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1217         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1218         delete child0;
1219         delete child1;
1220     }
1221     else
1222     {
1223         codetable[value].code = code;
1224         codetable[value].bits = bits;
1225     }
1226 }
1227
1228 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1229 {
1230     ulong i;
1231     for (i = 0 ; i < 256 ; i++ )
1232         counts[ i ].count = 0;
1233     for (i=0; i<n; i++)
1234         counts[(int)text[i]].count++;
1235 }
1236
1237 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1238       return x | (bit << pos);
1239 }
1240