CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <stdexcept>
  29 #include <cassert>
  30 #include <cstring> // For strlen()
  31 using std::vector;
  32 using std::pair;
  33 using std::make_pair;
  34 using std::map;
  35
  36 using SXSI::TextCollection;
  37
  38
  39 ////////////////////////////////////////////////////////////////////////////
  40 // Class CSA::THuffAlphabetRank
  41
  42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  43     left = NULL;
  44     right = NULL;
  45     bitrank = NULL;
  46     ch = s[0];
  47     leaf = false;
  48     this->codetable = codetable;
  49
  50     bool *B = new bool[n];
  51     TextPosition sum=0,i;
  52     /*
  53     for (i=0; i< n; i++) {
  54        printf("%c:", (char)((int)s[i]-128));
  55        for (r=0;r<codetable[(int)s[i]].bits;r++)
  56           if (codetable[(int)s[i]].code & (1u <<r))
  57              printf("1");
  58           else printf("0");
  59        printf("\n");
  60     }
  61     printf("\n");
  62     if (level > 100) return;*/
  63     for (i=0;i<n;i++)
  64        if (codetable[(int)s[i]].code & (1u << level)) {
  65           B[i] = true;
  66           sum++;
  67        }
  68        else B[i] = false;
  69     if (sum==0 || sum==n) {
  70         delete [] B;
  71         leaf = true;
  72         return;
  73     }
  74     uchar *sfirst, *ssecond;
  75     //if (n-sum > 0)
  76         sfirst = new uchar[n-sum];
  77     //if (sum > 0)
  78         ssecond = new uchar[sum];
  79     unsigned j=0,k=0;
  80    for (i=0;i<n;i++)
  81         if (B[i]) ssecond[k++] = s[i];
  82         else sfirst[j++] = s[i];
  83     ulong *Binbits = new ulong[n/W+1];
  84     for (i=0;i<n;i++)
  85         Tools::SetField(Binbits,1,i,B[i]);
  86     delete [] B;
  87     bitrank = new BitRank(Binbits,n,true);
  88     //if (j > 0) {
  89         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  90         delete [] sfirst;
  91     //}
  92     //if (k>0) {
  93         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  94         delete [] ssecond;
  95     //}
  96 }
  97
  98
  99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 100     // testing that the code works correctly
 101     int C[256];
 102     unsigned i,j;
 103     bool correct=true;
 104     for (j=0;j<256;j++)
 105         C[j] = 0;
 106     for (i=0;i<n;i++) {
 107         C[(int)s[i]]++;
 108         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 109         correct = false;
 110         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 111         }
 112     }
 113     return correct;
 114 }
 115
 116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 117     if (left!=NULL) delete left;
 118     if (right!=NULL) delete right;
 119     if (bitrank!=NULL)
 120         delete bitrank;
 121 }
 122
 123
 124 /////////////////////////////////////////////f///////////////////////////////
 125 // Class CSA
 126
 127 /**
 128  * Constructor inits an empty dynamic FM-index.
 129  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 130  */
 131 CSA::CSA(unsigned samplerate)
 132     : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
 133       codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0),
 134       textLength(0), textStartPos(0)
 135 {
 136     this->samplerate = samplerate;
 137
 138     // FIXME TODO : DynFMI needs distribution of characters before hand
 139     // This will create fully balanced wavelet tree for all chars [0, 255].
 140     uchar temp[256];
 141     for (unsigned i = 0; i < 255; ++i)
 142         temp[i] = i+1;
 143     temp[255] = 0;
 144     dynFMI = new DynFMI(temp, 1, 255, false);
 145
 146     /* Debug code: take char distribution from data.txt.
 147     uchar *temp = Tools::GetFileContents("data.txt", 0);
 148     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 149     delete [] temp;*/
 150 }
 151
 152 /**
 153  * Insert new text
 154  *
 155  * Given text must include end-marker.
 156  * Text identifiers are numbered starting from 0.
 157  */
 158 void CSA::InsertText(uchar const * text)
 159 {
 160     // Sanity check:
 161     assert(dynFMI != 0);
 162
 163     TextPosition m = std::strlen((char *)text) + 1;
 164     if (m > maxTextLength)
 165         maxTextLength = m; // Store length of the longest text seen so far.
 166
 167     this->n += m;
 168     this->numberOfTexts ++;
 169     dynFMI->addText(text, m);
 170 }
 171
 172 void CSA::MakeStatic()
 173 {
 174     // Sanity check:
 175     if (dynFMI == 0)
 176     {
 177         std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
 178         std::exit(0);
 179     }
 180
 181     uchar *bwt = dynFMI->getBWT();
 182     /*printf("123456789012345678901234567890123456789\n");
 183     for (TextPosition i = 0; i < n; i ++)
 184         if (bwt[i] < 50)
 185             printf("%d", (int)bwt[i]);
 186         else
 187             printf("%c", bwt[i]);
 188             printf("\n");*/
 189
 190
 191 /*    for (TextPosition i = 1; i <= n; i ++)
 192     {
 193         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 194     }*/
 195
 196     // Sanity check
 197     assert(numberOfTexts == dynFMI->getCollectionSize());
 198
 199     delete dynFMI;
 200     dynFMI = 0;
 201
 202     makewavelet(bwt);
 203
 204     // Calculate BWT end-marker position (of last inserted text)
 205     // and the length of the first text (counter l):
 206     ulong i = 0;
 207     ulong l = 1;
 208     while (bwt[i] != '\0')
 209     {
 210         uchar c = bwt[i];
 211         i = C[c]+alphabetrank->rank(c, i)-1;
 212         ++l;
 213     }
 214     bwtEndPos = i;
 215     //printf("bwtEndPos = %lu\n", bwtEndPos);
 216
 217     delete [] bwt;
 218
 219     // Build up arrays for text length and starting positions
 220     textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
 221     textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 222     (*textLength)[0] = l;
 223     (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
 224
 225     // Make sampling tables
 226     maketables();
 227 }
 228
 229 bool CSA::EmptyText(DocId k) const
 230 {
 231     assert(k < (DocId)numberOfTexts);
 232     return (1 == (*textLength)[k]);
 233 }
 234
 235 uchar* CSA::GetText(DocId k) const
 236 {
 237     assert(k < (DocId)numberOfTexts);
 238
 239     uchar* result = new uchar[(*textLength)[k]];
 240     TextPosition i = k;
 241     TextPosition j = (*textLength)[k] - 1;
 242     result[j] = '\0';
 243
 244     uchar c = alphabetrank->charAtPos(i);
 245     while (c != '\0')
 246     {
 247         --j;
 248         result[j] = c;
 249         i = C[c]+alphabetrank->rank(c,i)-1;
 250
 251         c = alphabetrank->charAtPos(i); // "next" char.
 252     }
 253     assert(j == 0);
 254     return result;
 255 }
 256
 257 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 258 {
 259     assert(k < (DocId)numberOfTexts);
 260     assert(j < (*textLength)[k]);
 261     assert(i <= j);
 262
 263     // Start position of k'th text
 264     ulong start = (*textStartPos)[k];
 265
 266     return Substring(i + start, j-i+1);
 267 }
 268
 269 /******************************************************************
 270  * Existential queries
 271  */
 272 bool CSA::IsPrefix(uchar const * pattern) const
 273 {
 274     TextPosition m = strlen((char *)pattern);
 275     if (m == 0)
 276         return true;
 277
 278     TextPosition sp = 0, ep = 0;
 279     Search(pattern, m, &sp, &ep);
 280
 281     // Check for end-marker(s) in result interval
 282     if (CountEndmarkers(sp, ep))
 283         return true;
 284     return false;
 285 }
 286
 287 bool CSA::IsSuffix(uchar const *pattern) const
 288 {
 289     // Here counting is as fast as isSuffix():
 290     if (CountSuffix(pattern) > 0)
 291         return true;
 292     return false;
 293 }
 294
 295 bool CSA::IsEqual(uchar const *pattern) const
 296 {
 297     TextPosition m = std::strlen((char *)pattern);
 298     //if (m == 0)
 299     //    return false; // FIXME Check for empty texts!
 300
 301     TextPosition sp = 0, ep = 0;
 302     // Match including end-marker
 303     Search(pattern, m+1, &sp, &ep);
 304
 305     // Check for end-marker(s) in result interval
 306     if (CountEndmarkers(sp, ep))
 307         return true;
 308     return false;
 309 }
 310
 311 bool CSA::IsContains(uchar const * pattern) const
 312 {
 313     TextPosition m = strlen((char *)pattern);
 314     if (m == 0)
 315         return true;
 316
 317     TextPosition sp = 0, ep = 0;
 318     // Just check if pattern exists somewhere
 319     ulong count = Search(pattern, m, &sp, &ep);
 320
 321     if (count > 0)
 322         return true;
 323     return false;
 324 }
 325
 326 bool CSA::IsLessThan(uchar const*) const
 327 {
 328     // TODO
 329     assert(0);
 330     return false;
 331 }
 332
 333 /******************************************************************
 334  * Counting queries
 335  */
 336 unsigned CSA::CountPrefix(uchar const * pattern) const
 337 {
 338     TextPosition m = strlen((char *)pattern);
 339     if (m == 0)
 340         return numberOfTexts;
 341
 342     TextPosition sp = 0, ep = 0;
 343     Search(pattern, m, &sp, &ep);
 344
 345     // Count end-markers in result interval
 346     return CountEndmarkers(sp, ep);
 347 }
 348
 349 unsigned CSA::CountSuffix(uchar const * pattern) const
 350 {
 351     TextPosition m = strlen((char *)pattern);
 352     if (m == 0)
 353         return numberOfTexts;
 354
 355     TextPosition sp = 0, ep = 0;
 356     // Search with end-marker
 357     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 358
 359     return count;
 360 }
 361
 362 unsigned CSA::CountEqual(uchar const *pattern) const
 363 {
 364     TextPosition m = strlen((char const *)pattern);
 365 //    if (m == 0)
 366 //        return ; // FIXME Test for empty texts.
 367
 368     TextPosition sp = 0, ep = 0;
 369     // Match including end-marker
 370     Search(pattern, m+1, &sp, &ep);
 371
 372     // Count end-markers in result interval
 373     return CountEndmarkers(sp, ep);
 374 }
 375
 376 unsigned CSA::CountContains(uchar const * pattern) const
 377 {
 378     // Here counting is as slow as fetching the result set
 379     // because we would have to filter out occ's that fall within same document.
 380     TextCollection::document_result result = Contains(pattern);
 381     return result.size();
 382 }
 383
 384 unsigned CSA::CountLessThan(uchar const * pattern) const
 385 {
 386     // TODO
 387     assert(0);
 388     return 0;
 389 }
 390
 391 /**
 392  * Document reporting queries
 393  */
 394 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 395 {
 396     TextPosition m = strlen((char *)pattern);
 397     if (m == 0)
 398         return TextCollection::document_result();
 399
 400     TextPosition sp = 0, ep = 0;
 401     Search(pattern, m, &sp, &ep);
 402
 403     TextCollection::document_result result;
 404
 405     // Report end-markers in result interval
 406     unsigned resultSize = CountEndmarkers(sp, ep);
 407     if (resultSize == 0)
 408         return result;
 409
 410     unsigned i = 0;
 411     if (sp > 0)
 412         i = alphabetrank->rank(0, sp - 1);
 413     while (resultSize)
 414     {
 415         // End-marker we found belongs to the "preceeding" doc in the collection
 416         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 417         result.push_back(docId);
 418         -- resultSize;
 419         ++ i;
 420     }
 421
 422     return result;
 423 }
 424
 425 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 426 {
 427     TextPosition m = strlen((char *)pattern);
 428     if (m == 0)
 429         return TextCollection::document_result();
 430
 431     TextPosition sp = 0, ep = 0;
 432     // Search with end-marker
 433     Search(pattern, m+1, &sp, &ep);
 434
 435     TextCollection::document_result result;
 436
 437     // Check each occurrence
 438     for (; sp <= ep; ++sp)
 439     {
 440         TextPosition i = sp;
 441         uchar c = alphabetrank->charAtPos(i);
 442         while (c != '\0' && !sampled->IsBitSet(i))
 443         {
 444             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 445             c = alphabetrank->charAtPos(i);
 446         }
 447         if (c == '\0')
 448         {
 449             // Rank among the end-markers in BWT
 450             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 451
 452             // End-marker that we found belongs to the "preceeding" doc in collection:
 453             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 454             result.push_back(docId);
 455         }
 456         else // Sampled position
 457         {
 458             TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
 459             result.push_back(DocIdAtTextPos(textPos));
 460         }
 461     }
 462
 463     return result;
 464 }
 465
 466 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 467 {
 468     TextPosition m = strlen((char const *)pattern);
 469     if (m == 0)
 470         return TextCollection::document_result();
 471
 472     TextPosition sp = 0, ep = 0;
 473     // Match including end-marker
 474     Search(pattern, m+1, &sp, &ep);
 475
 476     TextCollection::document_result result;
 477
 478     // Report end-markers in result interval
 479     unsigned resultSize = CountEndmarkers(sp, ep);
 480     if (resultSize == 0)
 481         return result;
 482
 483     unsigned i = 0;
 484     if (sp > 0)
 485         i = alphabetrank->rank(0, sp - 1);
 486     while (resultSize)
 487     {
 488        // End-marker we found belongs to the "previous" doc in the collection
 489         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 490         result.push_back(docId);
 491         -- resultSize;
 492         ++ i;
 493     }
 494
 495     return result;
 496 }
 497
 498
 499 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 500 {
 501     TextPosition m = strlen((char *)pattern);
 502     if (m == 0)
 503         return TextCollection::document_result();
 504
 505     TextPosition sp = 0, ep = 0;
 506     // Search all occurrences
 507     Search(pattern, m, &sp, &ep);
 508
 509     // We want unique document indentifiers, using std::set to collect them
 510     std::set<DocId> resultSet;
 511
 512     // Check each occurrence
 513     for (; sp <= ep; ++sp)
 514     {
 515         TextPosition i = sp;
 516         uchar c = alphabetrank->charAtPos(i);
 517         while (c != '\0' && !sampled->IsBitSet(i))
 518         {
 519             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 520             c = alphabetrank->charAtPos(i);
 521         }
 522         if (c == '\0')
 523         {
 524             // Rank among the end-markers in BWT
 525             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 526
 527             // End-marker that we found belongs to the "preceeding" doc in collection:
 528             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 529             resultSet.insert(docId);
 530         }
 531         else
 532         {
 533             TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
 534             resultSet.insert(DocIdAtTextPos(textPos));
 535         }
 536     }
 537
 538     // Convert std::set to std::vector (TODO better way to construct result vector?)
 539     TextCollection::document_result result;
 540     for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
 541         result.push_back(*it);
 542     return result;
 543 }
 544
 545 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 546 {
 547     // TODO
 548     assert(0);
 549     return document_result();
 550 }
 551
 552 /**
 553  * Full result set queries
 554  */
 555 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 556 {
 557     TextPosition m = strlen((char *)pattern);
 558     if (m == 0)
 559         return full_result();
 560
 561     TextPosition sp = 0, ep = 0;
 562     // Search all occurrences
 563     Search(pattern, m, &sp, &ep);
 564
 565     full_result result;
 566
 567     // Report each occurrence
 568     for (; sp <= ep; ++sp)
 569     {
 570         TextPosition i = sp;
 571         TextPosition dist = 0;
 572         uchar c = alphabetrank->charAtPos(i);
 573         while (c != '\0' && !sampled->IsBitSet(i))
 574         {
 575             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 576             c = alphabetrank->charAtPos(i);
 577             ++ dist;
 578         }
 579         if (c == '\0')
 580         {
 581             // Rank among the end-markers in BWT
 582             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 583
 584             // End-marker that we found belongs to the "preceeding" doc in collection:
 585             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 586             result.push_back(make_pair(docId, dist));
 587         }
 588         else
 589         {
 590             TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist;
 591             // Read document identifier and its starting position in text order
 592             DocId docId = DocIdAtTextPos(textPos);
 593             result.push_back(make_pair(docId, textPos - (*textStartPos)[docId]));
 594         }
 595     }
 596
 597     return result;
 598 }
 599
 600
 601 /**
 602  * Finds document identifier for given text position
 603  *
 604  * Starting text position of the document is stored into second parameter.
 605  * Binary searching on text starting positions.
 606  */
 607 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
 608 {
 609     assert(i < n);
 610
 611     DocId a = 0;
 612     DocId b = numberOfTexts - 1;
 613     while (a < b)
 614     {
 615         DocId c = a + (b - a)/2;
 616         if ((*textStartPos)[c] > i)
 617             b = c - 1;
 618         else if ((*textStartPos)[c+1] > i)
 619             return c;
 620         else
 621             a = c + 1;
 622     }
 623
 624     assert(a < (DocId)numberOfTexts);
 625     assert(i > (*textStartPos)[a]);
 626     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
 627     return a;
 628 }
 629
 630 /**
 631  * Count end-markers in given interval
 632  */
 633 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const
 634 {
 635     if (sp > ep)
 636         return 0;
 637
 638     ulong ranksp = 0;
 639     if (sp != 0)
 640         ranksp = alphabetrank->rank(0, sp - 1);
 641
 642     return alphabetrank->rank(0, ep) - ranksp;
 643 }
 644
 645 /**
 646  * Save index to a file handle
 647  *
 648  * Throws a std::runtime_error exception on i/o error.
 649  * First byte that is saved represents the version number of the save file.
 650  * In version 1 files, the data fields are:
 651  *        <uchar version> version info;
 652  *        <unsigned s>    samplerate;
 653  *        <ulong n>       length of the BWT;
 654  *        <ulong bwt>     end marker position in BWT;
 655  *        <uchar *>       BWT string of length n;
 656  *        <unsigned r>    number of texts;
 657  *        <ulong max>     length of longest text;
 658  *        <vector textLength>
 659  *                        array of <ulong, ulong> pairs.
 660  *
 661  * TODO: Save the data structures instead of BWT sequence?
 662  * TODO: Don't save textLength and textStartPos arrays.
 663  */
 664 void CSA::Save(FILE *file) const
 665 {
 666     // Saving version 1 data:
 667     uchar versionFlag = 1;
 668     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 669         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 670
 671     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 672         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 673
 674     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 675         throw std::runtime_error("CSA::Save(): file write error (n).");
 676
 677     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 678         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 679
 680     for (ulong offset = 0; offset < n; offset ++)
 681     {
 682         uchar c = alphabetrank->charAtPos(offset);
 683         if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
 684             throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
 685     }
 686
 687     unsigned r = numberOfTexts;
 688     if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
 689         throw std::runtime_error("CSA::Save(): file write error (r).");
 690
 691     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 692         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 693
 694     for (r = 0; r < numberOfTexts; ++ r)
 695     {
 696         if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1)
 697             throw std::runtime_error("CSA::Save(): file write error (text length).");
 698         if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1)
 699             throw std::runtime_error("CSA::Save(): file write error (text start).");
 700     }
 701 }
 702
 703
 704 /**
 705  * Load index from a file handle
 706  *
 707  * Throws a std::runtime_error exception on i/o error.
 708  * For more info, see CSA::Save().
 709  */
 710 void CSA::Load(FILE *file, unsigned samplerate)
 711 {
 712     // Delete everything
 713     delete dynFMI;       dynFMI = 0;
 714     delete alphabetrank; alphabetrank = 0;
 715     delete sampled;      sampled = 0;
 716     delete [] suffixes;  suffixes = 0;
 717     delete [] positions; positions = 0;
 718     delete [] codetable; codetable = 0;
 719
 720     delete endmarkerDocId; endmarkerDocId = 0;
 721     delete textLength;     textLength = 0;
 722     delete textStartPos;   textStartPos = 0;
 723
 724     this->maxTextLength = 0;
 725     this->numberOfTexts = 0;
 726     this->samplerate = samplerate;
 727     this->n = 0;
 728
 729     uchar versionFlag = 0;
 730     if (std::fread(&versionFlag, 1, 1, file) != 1)
 731         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 732     if (versionFlag != 1)
 733         throw std::runtime_error("CSA::Load(): invalid start byte.");
 734
 735     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 736         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 737     if (this->samplerate == 0)
 738         this->samplerate = samplerate;
 739
 740     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 741         throw std::runtime_error("CSA::Load(): file read error (n).");
 742
 743     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 744         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 745
 746     uchar *bwt = new uchar[n];
 747     for (ulong offset = 0; offset < n; offset ++)
 748         if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
 749             throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
 750
 751     unsigned r = 0;
 752     if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
 753         throw std::runtime_error("CSA::Load(): file read error (r).");
 754
 755     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 756         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 757
 758     // Build up arrays for text length and starting positions
 759     textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength));
 760     textStartPos = new BlockArray(r, Tools::CeilLog2(this->n));
 761
 762     while (r > 0)
 763     {
 764         TextPosition length = 0, start = 0;
 765         if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
 766             throw std::runtime_error("CSA::Load(): file read error (text length).");
 767         if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
 768             throw std::runtime_error("CSA::Load(): file read error (text start).");
 769
 770         (*textLength)[numberOfTexts] = length;
 771         (*textStartPos)[numberOfTexts] = start;
 772         ++numberOfTexts;
 773         --r;
 774     }
 775
 776     // Construct data structures
 777     makewavelet(bwt);
 778     delete [] bwt;
 779     maketables();  // FIXME: this will redo text length tables
 780 }
 781
 782
 783
 784 /**
 785  * Rest of the functions follow...
 786  */
 787
 788 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 789 {
 790     uchar *result = new uchar[l + 1];
 791     if (l == 0)
 792     {
 793         result[0] = 0u;
 794         return result;
 795     }
 796
 797     TextPosition dist;
 798     TextPosition k = i + l - 1;
 799     // Check for end of the string
 800     if (k > n - 1)
 801     {
 802         l -= k - n + 1;
 803         k = n - 1;
 804     }
 805
 806     TextPosition skip = samplerate - k % samplerate - 1;
 807     TextPosition j;
 808     if (k / samplerate + 1 >= n / samplerate)
 809     {
 810         j = bwtEndPos;
 811         skip = n - k - 1;
 812     }
 813     else
 814     {
 815         j = (*positions)[k/samplerate+1];
 816         //cout << samplerate << ' ' << j << '\n';
 817     }
 818
 819     for (dist = 0; dist < skip + l; dist++)
 820     {
 821         int c = alphabetrank->charAtPos(j);
 822         if (c == '\0')
 823         {
 824             // Rank among the end-markers in BWT
 825             unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
 826             j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
 827         }
 828         else
 829             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 830         if (dist >= skip)
 831             result[l + skip - dist - 1] = c;
 832     }
 833     result[l] = 0u;
 834     return result;
 835 }
 836
 837   /*TextPosition CSA::inverse(TextPosition i)
 838 {
 839     TextPosition skip = samplerate - i % samplerate;
 840     TextPosition j;
 841     if (i / samplerate + 1 >= n / samplerate)
 842     {
 843         j = bwtEndPos;
 844         skip = n - i;
 845     }
 846     else
 847     {
 848         j = (*positions)[i/samplerate+1];
 849         //cout << samplerate << ' ' << j << '\n';
 850     }
 851
 852     while (skip > 0)
 853     {
 854         int c = alphabetrank->charAtPos(j);
 855         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 856         skip --;
 857     }
 858     return j;
 859     }*/
 860
 861 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
 862     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 863     int c = (int)pattern[m-1];
 864     TextPosition i=m-1;
 865     TextPosition sp = C[c];
 866     TextPosition ep = C[c+1]-1;
 867     while (sp<=ep && i>=1)
 868     {
 869 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 870         c = (int)pattern[--i];
 871         sp = C[c]+alphabetrank->rank(c,sp-1);
 872         ep = C[c]+alphabetrank->rank(c,ep)-1;
 873     }
 874     *spResult = sp;
 875     *epResult = ep;
 876     if (sp<=ep)
 877         return ep - sp + 1;
 878     else
 879         return 0;
 880  }
 881
 882    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 883 {
 884     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 885     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 886
 887     if (sp<=ep)
 888         return ep - sp + 1;
 889     else
 890         return 0;
 891         }*/
 892
 893 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
 894 {
 895     TextPosition dist=0;
 896     while (!sampled->IsBitSet(i))
 897     {
 898         int c = alphabetrank->charAtPos(i);
 899         if (c == '\0')
 900         {
 901             // End-markers are sampled
 902             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 903             DocId docId = (*endmarkerDocId)[endmarkerRank];
 904             return (*textStartPos)[docId] + dist;
 905         }
 906         i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 907         ++dist;
 908     }
 909
 910     return (*suffixes)[sampled->rank(i)-1]+dist;
 911 }
 912
 913 CSA::~CSA() {
 914     delete dynFMI;
 915     delete alphabetrank;
 916     delete sampled;
 917     delete suffixes;
 918     delete positions;
 919     delete [] codetable;
 920
 921     delete endmarkerDocId;
 922     delete textLength;
 923     delete textStartPos;
 924 }
 925
 926 void CSA::makewavelet(uchar *bwt)
 927 {
 928     ulong i, min = 0,
 929              max;
 930     for (i=0;i<256;i++)
 931         C[i]=0;
 932     for (i=0;i<n;++i)
 933         C[(int)bwt[i]]++;
 934     for (i=0;i<256;i++)
 935         if (C[i]>0) {min = i; break;}
 936     for (i=255;i>=min;--i)
 937         if (C[i]>0) {max = i; break;}
 938
 939     // Print frequencies
 940     /*for(i = 0; i < 256; i++)
 941         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
 942         fflush(stdout);*/
 943
 944     ulong prev=C[0], temp;
 945     C[0]=0;
 946     for (i=1;i<256;i++) {
 947         temp = C[i];
 948         C[i]=C[i-1]+prev;
 949         prev = temp;
 950     }
 951     this->codetable = node::makecodetable(bwt,n);
 952     alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 953     //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
 954
 955 /*    for (i = 0; i < n; ++i)
 956     {
 957         uchar c = alphabetrank->charAtPos(i);
 958         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
 959         printf("LF[%lu] = %lu\n", i, j);
 960         }*/
 961 }
 962
 963 void CSA::maketables()
 964 {
 965     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 966
 967     ulong *sampledpositions = new ulong[n/W+1];
 968     unsigned ceilLog2n = Tools::CeilLog2(n);
 969     suffixes = new BlockArray(sampleLength, ceilLog2n);
 970     positions = new BlockArray(sampleLength, ceilLog2n);
 971
 972     // Mapping from end-markers to doc ID's:
 973     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
 974
 975     ulong i,j=0;
 976     for (i=0;i<n/W+1;i++)
 977         sampledpositions[i]=0lu;
 978
 979     ulong x,p=bwtEndPos;
 980     // Keeping track of text position of end-markers seen
 981     ulong posOfSuccEndmarker = n;
 982     DocId textId = numberOfTexts;
 983     ulong ulongmax = 0;
 984     ulongmax--;
 985
 986     //positions:
 987     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
 988       // i substitutes SA->GetPos(i)
 989         x=(i==n-1)?0:i+1;
 990
 991         if (x % samplerate == 0) {
 992             Tools::SetField(sampledpositions,1,p,1);
 993             (*positions)[x/samplerate] = p;
 994         }
 995
 996         uchar c = alphabetrank->charAtPos(p);
 997         if (c == '\0')
 998         {
 999             --textId;
1000
1001             // Record the order of end-markers in BWT:
1002             ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1003             (*endmarkerDocId)[endmarkerRank] = textId;
1004
1005             // Store text length and text start position:
1006             if (textId < (DocId)numberOfTexts - 1)
1007             {
1008                 (*textLength)[textId + 1] = posOfSuccEndmarker - x;
1009                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1010                 posOfSuccEndmarker = x;
1011             }
1012
1013             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1014             p = textId; // Correct LF-mapping to the last char of the previous text.
1015         }
1016         else
1017             p = C[c]+alphabetrank->rank(c, p)-1;
1018     }
1019     assert(textId == 0);
1020
1021     /*i = 0;
1022     for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
1023     {
1024         int docc = (*endmarkerDocId)[i];
1025         ulong poss = (*endmarkerPos)[i];
1026         printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
1027         }*/
1028 /*
1029     for (i = 0; i < numberOfTexts; ++ i)
1030     {
1031         //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
1032         assert(dynTextLength[i].first == (*textLength)[i]);
1033         assert(dynTextLength[i].second == (*textStartPos)[i]);
1034     }
1035 */
1036     sampled = new BitRank(sampledpositions,n,true);
1037
1038    //suffixes:
1039     for(i=0; i<sampleLength; i++) {
1040         j = sampled->rank((*positions)[i]);
1041         if (j==0) j=sampleLength;
1042         (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate;
1043     }
1044 }
1045
1046 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1047 {
1048     TCodeEntry *result = new TCodeEntry[ 256 ];
1049
1050     count_chars( text, n, result );
1051     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1052 //
1053 // First I push all the leaf nodes into the queue
1054 //
1055     for ( unsigned int i = 0 ; i < 256 ; i++ )
1056         if ( result[ i ].count )
1057             q.push(node( i, result[ i ].count ) );
1058 //
1059 // This loop removes the two smallest nodes from the
1060 // queue.  It creates a new internal node that has
1061 // those two nodes as children. The new internal node
1062 // is then inserted into the priority queue.  When there
1063 // is only one node in the priority queue, the tree
1064 // is complete.
1065 //
1066
1067     while ( q.size() > 1 ) {
1068         node *child0 = new node( q.top() );
1069         q.pop();
1070         node *child1 = new node( q.top() );
1071         q.pop();
1072         q.push( node( child0, child1 ) );
1073     }
1074 //
1075 // Now I compute and return the codetable
1076 //
1077     q.top().maketable(0u,0u, result);
1078     q.pop();
1079     return result;
1080 }
1081
1082
1083 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1084 {
1085     if ( child0 )
1086     {
1087         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1088         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1089         delete child0;
1090         delete child1;
1091     }
1092     else
1093     {
1094         codetable[value].code = code;
1095         codetable[value].bits = bits;
1096     }
1097 }
1098
1099 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1100 {
1101     ulong i;
1102     for (i = 0 ; i < 256 ; i++ )
1103         counts[ i ].count = 0;
1104     for (i=0; i<n; i++)
1105         counts[(int)text[i]].count++;
1106 }
1107
1108 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1109       return x | (bit << pos);
1110 }
1111