CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <stdexcept>
  29 #include <cassert>
  30 #include <cstring> // For strlen()
  31 using std::vector;
  32 using std::pair;
  33 using std::make_pair;
  34 using std::map;
  35
  36 using SXSI::TextCollection;
  37
  38
  39 ////////////////////////////////////////////////////////////////////////////
  40 // Class CSA::THuffAlphabetRank
  41
  42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  43     left = NULL;
  44     right = NULL;
  45     bitrank = NULL;
  46     ch = s[0];
  47     leaf = false;
  48     this->codetable = codetable;
  49
  50     bool *B = new bool[n];
  51     TextPosition sum=0,i;
  52     /*
  53     for (i=0; i< n; i++) {
  54        printf("%c:", (char)((int)s[i]-128));
  55        for (r=0;r<codetable[(int)s[i]].bits;r++)
  56           if (codetable[(int)s[i]].code & (1u <<r))
  57              printf("1");
  58           else printf("0");
  59        printf("\n");
  60     }
  61     printf("\n");
  62     if (level > 100) return;*/
  63     for (i=0;i<n;i++)
  64        if (codetable[(int)s[i]].code & (1u << level)) {
  65           B[i] = true;
  66           sum++;
  67        }
  68        else B[i] = false;
  69     if (sum==0 || sum==n) {
  70         delete [] B;
  71         leaf = true;
  72         return;
  73     }
  74     uchar *sfirst, *ssecond;
  75     //if (n-sum > 0)
  76         sfirst = new uchar[n-sum];
  77     //if (sum > 0)
  78         ssecond = new uchar[sum];
  79     unsigned j=0,k=0;
  80    for (i=0;i<n;i++)
  81         if (B[i]) ssecond[k++] = s[i];
  82         else sfirst[j++] = s[i];
  83     ulong *Binbits = new ulong[n/W+1];
  84     for (i=0;i<n;i++)
  85         Tools::SetField(Binbits,1,i,B[i]);
  86     delete [] B;
  87     bitrank = new BitRank(Binbits,n,true);
  88     //if (j > 0) {
  89         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  90         delete [] sfirst;
  91     //}
  92     //if (k>0) {
  93         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  94         delete [] ssecond;
  95     //}
  96 }
  97
  98
  99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 100     // testing that the code works correctly
 101     int C[256];
 102     unsigned i,j;
 103     bool correct=true;
 104     for (j=0;j<256;j++)
 105         C[j] = 0;
 106     for (i=0;i<n;i++) {
 107         C[(int)s[i]]++;
 108         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 109         correct = false;
 110         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 111         }
 112     }
 113     return correct;
 114 }
 115
 116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 117     if (left!=NULL) delete left;
 118     if (right!=NULL) delete right;
 119     if (bitrank!=NULL)
 120         delete bitrank;
 121 }
 122
 123
 124 ////////////////////////////////////////////////////////////////////////////
 125 // Class CSA
 126
 127 /**
 128  * Constructor inits an empty dynamic FM-index.
 129  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 130  */
 131 CSA::CSA(unsigned samplerate)
 132     : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
 133       codetable(0), dynFMI(0)
 134 {
 135     this->samplerate = samplerate;
 136
 137     // FIXME TODO : DynFMI needs distribution of characters before hand
 138     // This will create fully balanced wavelet tree for all chars [0, 255].
 139     uchar temp[256];
 140     for (unsigned i = 0; i < 255; ++i)
 141         temp[i] = i+1;
 142     temp[255] = 0;
 143     dynFMI = new DynFMI(temp, 1, 255, false);
 144 }
 145
 146 /**
 147  * Insert new text
 148  *
 149  * Given text must include end-marker.
 150  * Text identifiers are numbered starting from 0.
 151  */
 152 void CSA::InsertText(uchar const * text)
 153 {
 154     // Sanity check:
 155     assert(dynFMI != 0);
 156
 157     TextPosition m = std::strlen((char *)text) + 1;
 158     // Store text length and starting position
 159     textLength.push_back(make_pair(m, n));
 160     this->n += m;
 161     dynFMI->addText(text, m);
 162 }
 163
 164 void CSA::MakeStatic()
 165 {
 166     // Sanity check:
 167     if (dynFMI == 0)
 168     {
 169         std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
 170         std::exit(0);
 171     }
 172
 173     uchar *bwt = dynFMI->getBWT();
 174     /*printf("1234567890123456789\n");
 175     for (TextPosition i = 0; i < n; i ++)
 176         if (bwt[i] < 50)
 177             printf("%d", (int)bwt[i]);
 178         else
 179             printf("%c", bwt[i]);
 180     printf("\n");
 181     */
 182
 183 /*    for (TextPosition i = 1; i <= n; i ++)
 184     {
 185         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 186     }*/
 187
 188     // Sanity check
 189     assert(textLength.size() == dynFMI->getCollectionSize());
 190
 191     delete dynFMI;
 192     dynFMI = 0;
 193
 194     makewavelet(bwt);
 195
 196     // Calculate BWT end-marker position (of last inserted text)
 197     ulong i = 0;
 198     while (bwt[i] != '\0')
 199     {
 200         uchar c = bwt[i];
 201         i = C[c]+alphabetrank->rank(c, i)-1;
 202     }
 203     bwtEndPos = i;
 204     //printf("bwtEndPos = %lu\n", bwtEndPos);
 205
 206     delete [] bwt;
 207
 208     // Make sampling tables
 209     maketables();
 210 }
 211
 212
 213 uchar* CSA::GetText(DocId k) const
 214 {
 215     assert(k < (DocId)textLength.size());
 216
 217     uchar* result = new uchar[textLength[k].first];
 218     TextPosition i = k;
 219     TextPosition j = textLength[k].first-1;
 220     result[j] = '\0';
 221
 222     uchar c = alphabetrank->charAtPos(i);
 223     while (c != '\0')
 224     {
 225         --j;
 226         result[j] = c;
 227         i = C[c]+alphabetrank->rank(c,i)-1;
 228
 229         c = alphabetrank->charAtPos(i); // "next" char.
 230     }
 231     assert(j == 0);
 232     return result;
 233 }
 234
 235 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 236 {
 237     assert(k < (DocId)textLength.size());
 238     assert(j < textLength[k].first);
 239     assert(i <= j);
 240
 241     // Start position of k'th text
 242     ulong start = textLength[k].second;
 243
 244     return Substring(i + start, j-i+1);
 245 }
 246
 247 /**
 248  * Existential queries
 249  */
 250 bool CSA::IsPrefix(uchar const * pattern) const
 251 {
 252     TextPosition m = strlen((char *)pattern);
 253     if (m == 0)
 254         return textLength.size();
 255
 256     TextPosition sp = 0, ep = 0;
 257     Search(pattern, m, &sp, &ep);
 258
 259     // Check for end-marker(s) in result interval
 260     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 261     it = endmarkers.lower_bound(sp);
 262     if (it != endmarkers.end() && it->first <= ep)
 263         return true;
 264     return false;
 265 }
 266
 267 bool CSA::IsSuffix(uchar const *pattern) const
 268 {
 269     // Here counting is as fast as isSuffix():
 270     if (CountSuffix(pattern) > 0)
 271         return true;
 272     return false;
 273 }
 274
 275 bool CSA::IsEqual(uchar const *pattern) const
 276 {
 277     TextPosition m = std::strlen((char *)pattern);
 278     if (m == 0)
 279         return textLength.size();
 280
 281     TextPosition sp = 0, ep = 0;
 282     // Match including end-marker
 283     Search(pattern, m+1, &sp, &ep);
 284
 285     // Check for end-marker(s) in result interval
 286     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 287     it = endmarkers.lower_bound(sp);
 288     if (it != endmarkers.end() && it->first <= ep)
 289         return true;
 290     return false;
 291 }
 292
 293 bool CSA::IsContains(uchar const * pattern) const
 294 {
 295     TextPosition m = strlen((char *)pattern);
 296     if (m == 0)
 297         return true;
 298
 299     TextPosition sp = 0, ep = 0;
 300     // Just check if pattern exists somewhere
 301     ulong count = Search(pattern, m, &sp, &ep);
 302
 303     if (count > 0)
 304         return true;
 305     return false;
 306 }
 307
 308 bool CSA::IsLessThan(uchar const*) const
 309 {
 310     // TODO
 311     return false;
 312 }
 313
 314 /**
 315  * Counting queries
 316  */
 317 unsigned CSA::CountPrefix(uchar const * pattern) const
 318 {
 319     TextPosition m = strlen((char *)pattern);
 320     if (m == 0)
 321         return textLength.size();
 322
 323     TextPosition sp = 0, ep = 0;
 324     Search(pattern, m, &sp, &ep);
 325
 326     // Count end-markers in result interval
 327     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 328     it = endmarkers.lower_bound(sp);
 329     unsigned count = 0;
 330     while (it != endmarkers.end() && it->first <= ep)
 331     {
 332         ++ count;
 333         ++ it;
 334     }
 335
 336     return count;
 337 }
 338
 339 unsigned CSA::CountSuffix(uchar const * pattern) const
 340 {
 341     TextPosition m = strlen((char *)pattern);
 342     if (m == 0)
 343         return textLength.size();
 344
 345     TextPosition sp = 0, ep = 0;
 346     // Search with end-marker
 347     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 348
 349     return count;
 350 }
 351
 352 unsigned CSA::CountEqual(uchar const *pattern) const
 353 {
 354     TextPosition m = strlen((char const *)pattern);
 355     if (m == 0)
 356         return textLength.size();
 357
 358     TextPosition sp = 0, ep = 0;
 359     // Match including end-marker
 360     Search(pattern, m+1, &sp, &ep);
 361
 362     // Count end-markers in result interval
 363     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 364     it = endmarkers.lower_bound(sp);
 365     unsigned count = 0;
 366     while (it != endmarkers.end() && it->first <= ep)
 367     {
 368         ++ count;
 369         ++ it;
 370     }
 371
 372     return count;
 373 }
 374
 375 unsigned CSA::CountContains(uchar const * pattern) const
 376 {
 377     // Here counting is as slow as fetching the result set
 378     // because we would have to filter out occ's that fall within same document.
 379     TextCollection::document_result result = Contains(pattern);
 380     return result.size();
 381 }
 382
 383 unsigned CSA::CountLessThan(uchar const * pattern) const
 384 {
 385     // TODO
 386     return 0;
 387 }
 388
 389 /**
 390  * Document reporting queries
 391  */
 392 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 393 {
 394     TextPosition m = strlen((char *)pattern);
 395     if (m == 0)
 396         return TextCollection::document_result();
 397
 398     TextPosition sp = 0, ep = 0;
 399     Search(pattern, m, &sp, &ep);
 400
 401     TextCollection::document_result result;
 402
 403     // Report end-markers in result interval
 404     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 405     it = endmarkers.lower_bound(sp);
 406     unsigned count = 0;
 407     while (it != endmarkers.end() && it->first <= ep)
 408     {
 409         // End-marker we found belongs to the "previous" doc in the collection
 410         DocId docId = ((it->second).first + 1) % textLength.size();
 411         result.push_back(docId);
 412         ++ count;
 413         ++ it;
 414     }
 415
 416     return result;
 417 }
 418
 419 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 420 {
 421     TextPosition m = strlen((char *)pattern);
 422     if (m == 0)
 423         return TextCollection::document_result();
 424
 425     TextPosition sp = 0, ep = 0;
 426     // Search with end-marker
 427     Search(pattern, m+1, &sp, &ep);
 428
 429     TextCollection::document_result result;
 430
 431     // Check each occurrence
 432     for (; sp <= ep; ++sp)
 433     {
 434         TextPosition i = sp;
 435         uchar c = alphabetrank->charAtPos(i);
 436         while (c != '\0' && !sampled->IsBitSet(i))
 437         {
 438             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 439             c = alphabetrank->charAtPos(i);
 440         }
 441         if (c == '\0')
 442         {
 443             // map::operator[] is not const, using find() instead:
 444             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 445             // End-marker that we found belongs to the "previous" doc in collection:
 446             DocId docId = (endm.first + 1) % textLength.size();
 447             result.push_back(docId);
 448         }
 449         else
 450         {
 451             TextPosition textPos = suffixes[sampled->rank(i)-1];
 452             result.push_back(DocIdAtTextPos(textPos));
 453         }
 454     }
 455
 456     return result;
 457 }
 458
 459 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 460 {
 461     TextPosition m = strlen((char const *)pattern);
 462     if (m == 0)
 463         return TextCollection::document_result();
 464
 465     TextPosition sp = 0, ep = 0;
 466     // Match including end-marker
 467     Search(pattern, m+1, &sp, &ep);
 468
 469     TextCollection::document_result result;
 470
 471     // Report end-markers in result interval
 472     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 473     it = endmarkers.lower_bound(sp);
 474     unsigned count = 0;
 475     while (it != endmarkers.end() && it->first <= ep)
 476     {
 477         // End-marker that we found belongs to the "previous" doc in collection:
 478         DocId docId = ((it->second).first + 1) % textLength.size();
 479         result.push_back(docId);
 480         ++ count;
 481         ++ it;
 482     }
 483
 484     return result;
 485 }
 486
 487
 488 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 489 {
 490     TextPosition m = strlen((char *)pattern);
 491     if (m == 0)
 492         return TextCollection::document_result();
 493
 494     TextPosition sp = 0, ep = 0;
 495     // Search all occurrences
 496     Search(pattern, m, &sp, &ep);
 497
 498     // We want unique document indentifiers, using std::set to collect them
 499     std::set<DocId> resultSet;
 500
 501     // Check each occurrence
 502     for (; sp <= ep; ++sp)
 503     {
 504         TextPosition i = sp;
 505         uchar c = alphabetrank->charAtPos(i);
 506         while (c != '\0' && !sampled->IsBitSet(i))
 507         {
 508             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 509             c = alphabetrank->charAtPos(i);
 510         }
 511         if (c == '\0')
 512         {
 513             // map::operator[] is not const, using find() instead:
 514             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 515             // End-marker we found belongs to the "previous" doc in collection
 516             DocId docId = (endm.first + 1) % textLength.size();
 517             resultSet.insert(docId);
 518         }
 519         else
 520         {
 521             TextPosition textPos = suffixes[sampled->rank(i)-1];
 522             resultSet.insert(DocIdAtTextPos(textPos));
 523         }
 524     }
 525
 526     // Convert std::set to std::vector (TODO better way to construct result vector?)
 527     TextCollection::document_result result;
 528     for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
 529         result.push_back(*it);
 530     return result;
 531 }
 532
 533 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 534 {
 535     // TODO
 536     return document_result();
 537 }
 538
 539 /**
 540  * Full result set queries
 541  */
 542 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 543 {
 544     TextPosition m = strlen((char *)pattern);
 545     if (m == 0)
 546         return full_result();
 547
 548     TextPosition sp = 0, ep = 0;
 549     // Search all occurrences
 550     Search(pattern, m, &sp, &ep);
 551
 552     full_result result;
 553
 554     // Report each occurrence
 555     for (; sp <= ep; ++sp)
 556     {
 557         TextPosition i = sp;
 558         TextPosition dist = 0;
 559         uchar c = alphabetrank->charAtPos(i);
 560         while (c != '\0' && !sampled->IsBitSet(i))
 561         {
 562             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 563             c = alphabetrank->charAtPos(i);
 564             ++ dist;
 565         }
 566         if (c == '\0')
 567         {
 568             // map::operator[] is not const, using find() instead:
 569             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 570             // End-marker we found belongs to the "previous" doc in the collection:
 571             DocId docId = (endm.first+1)%textLength.size();
 572             result.push_back(make_pair(docId, dist));
 573         }
 574         else
 575         {
 576             TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
 577             // Read document identifier and its starting position in text order
 578             DocId docId = DocIdAtTextPos(textPos);
 579             result.push_back(make_pair(docId, textPos - textLength[docId].second));
 580         }
 581     }
 582
 583     return result;
 584 }
 585
 586
 587 /**
 588  * Finds document identifier for given text position
 589  *
 590  * Starting text position of the document is stored into second parameter.
 591  * Binary searching on text starting positions.
 592  */
 593 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
 594 {
 595     assert(i < n);
 596
 597     DocId a = 0;
 598     DocId b = textLength.size() - 1;
 599     while (a < b)
 600     {
 601         DocId c = a + (b - a)/2;
 602         if (textLength[c].second > i)
 603             b = c - 1;
 604         else if (textLength[c+1].second > i)
 605             return c;
 606         else
 607             a = c + 1;
 608     }
 609
 610     assert(a < (DocId)textLength.size());
 611     assert(i > textLength[a].second);
 612     assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
 613     return a;
 614 }
 615
 616 /**
 617  * Save index to a file handle
 618  *
 619  * Throws a std::runtime_error exception on i/o error.
 620  * First byte that is saved represents the version number of the save file.
 621  * In version 1 files, the data fields are:
 622  *        <uchar version> version info;
 623  *        <unsigned s>    samplerate;
 624  *        <ulong n>       length of the BWT;
 625  *        <ulong bwt>     end marker position in BWT;
 626  *        <uchar *>       BWT string of length n;
 627  *        <unsigned r>    number of texts;
 628  *        <vector textLength>
 629  *                        array of <ulong, ulong> pairs.
 630  *
 631  * TODO: Save the data structures instead of BWT sequence?
 632  */
 633 void CSA::Save(FILE *file) const
 634 {
 635     // Saving version 1 data:
 636     uchar versionFlag = 1;
 637     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 638         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 639
 640     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 641         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 642
 643     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 644         throw std::runtime_error("CSA::Save(): file write error (n).");
 645
 646     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 647         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 648
 649     for (ulong offset = 0; offset < n; offset ++)
 650     {
 651         uchar c = alphabetrank->charAtPos(offset);
 652         if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
 653             throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
 654     }
 655
 656     unsigned r = textLength.size();
 657     if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
 658         throw std::runtime_error("CSA::Save(): file write error (r).");
 659
 660     for (r = 0; r < textLength.size(); ++ r)
 661     {
 662         if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1)
 663             throw std::runtime_error("CSA::Save(): file write error (text length).");
 664         if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1)
 665             throw std::runtime_error("CSA::Save(): file write error (text start).");
 666     }
 667 }
 668
 669
 670 /**
 671  * Load index from a file handle
 672  *
 673  * Throws a std::runtime_error exception on i/o error.
 674  * For more info, see CSA::Save().
 675  */
 676 void CSA::Load(FILE *file, unsigned samplerate)
 677 {
 678     // Delete everything
 679     delete dynFMI;       dynFMI = 0;
 680     delete alphabetrank; alphabetrank = 0;
 681     delete sampled;      sampled = 0;
 682     delete [] suffixes;  suffixes = 0;
 683     delete [] positions; positions = 0;
 684     delete [] codetable; codetable = 0;
 685
 686     endmarkers.clear();
 687     textLength.clear();
 688
 689     this->samplerate = samplerate;
 690     this->n = 0;
 691
 692     uchar versionFlag = 0;
 693     if (std::fread(&versionFlag, 1, 1, file) != 1)
 694         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 695     if (versionFlag != 1)
 696         throw std::runtime_error("CSA::Load(): invalid start byte.");
 697
 698     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 699         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 700     if (this->samplerate == 0)
 701         this->samplerate = samplerate;
 702
 703     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 704         throw std::runtime_error("CSA::Load(): file read error (n).");
 705
 706     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 707         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 708
 709     uchar *bwt = new uchar[n];
 710     for (ulong offset = 0; offset < n; offset ++)
 711         if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
 712             throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
 713
 714     unsigned r = 0;
 715     if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
 716         throw std::runtime_error("CSA::Load(): file read error (r).");
 717
 718     while (r > 0)
 719     {
 720         TextPosition length = 0, start = 0;
 721         if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
 722             throw std::runtime_error("CSA::Load(): file read error (text length).");
 723         if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
 724             throw std::runtime_error("CSA::Load(): file read error (text start).");
 725
 726         textLength.push_back(make_pair(length, start));
 727         --r;
 728     }
 729
 730     // Construct data structures
 731     makewavelet(bwt);
 732     delete [] bwt;
 733     maketables();
 734 }
 735
 736
 737
 738 /**
 739  * Rest of the functions follow...
 740  */
 741
 742 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 743 {
 744     uchar *result = new uchar[l + 1];
 745     if (l == 0)
 746     {
 747         result[0] = 0u;
 748         return result;
 749     }
 750
 751     TextPosition dist;
 752     TextPosition k = i + l - 1;
 753     // Check for end of the string
 754     if (k > n - 1)
 755     {
 756         l -= k - n + 1;
 757         k = n - 1;
 758     }
 759
 760     TextPosition skip = samplerate - k % samplerate - 1;
 761     TextPosition j;
 762     if (k / samplerate + 1 >= n / samplerate)
 763     {
 764         j = bwtEndPos;
 765         skip = n - k - 1;
 766     }
 767     else
 768     {
 769         j = positions[k/samplerate+1];
 770         //cout << samplerate << ' ' << j << '\n';
 771     }
 772
 773     for (dist = 0; dist < skip + l; dist++)
 774     {
 775         int c = alphabetrank->charAtPos(j);
 776         if (c == '\0')
 777         {
 778             // map::operator[] is not const, using find() instead:
 779             pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
 780             j = endm.first; // LF-mapping for end-marker
 781         }
 782         else
 783             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 784         if (dist >= skip)
 785             result[l + skip - dist - 1] = c;
 786     }
 787     result[l] = 0u;
 788     return result;
 789 }
 790
 791   /*TextPosition CSA::inverse(TextPosition i)
 792 {
 793     TextPosition skip = samplerate - i % samplerate;
 794     TextPosition j;
 795     if (i / samplerate + 1 >= n / samplerate)
 796     {
 797         j = bwtEndPos;
 798         skip = n - i;
 799     }
 800     else
 801     {
 802         j = positions[i/samplerate+1];
 803         //cout << samplerate << ' ' << j << '\n';
 804     }
 805
 806     while (skip > 0)
 807     {
 808         int c = alphabetrank->charAtPos(j);
 809         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 810         skip --;
 811     }
 812     return j;
 813     }*/
 814
 815 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
 816     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 817     int c = (int)pattern[m-1];
 818     TextPosition i=m-1;
 819     TextPosition sp = C[c];
 820     TextPosition ep = C[c+1]-1;
 821     while (sp<=ep && i>=1)
 822     {
 823 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 824         c = (int)pattern[--i];
 825         sp = C[c]+alphabetrank->rank(c,sp-1);
 826         ep = C[c]+alphabetrank->rank(c,ep)-1;
 827     }
 828     *spResult = sp;
 829     *epResult = ep;
 830     if (sp<=ep)
 831         return ep - sp + 1;
 832     else
 833         return 0;
 834  }
 835
 836    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 837 {
 838     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 839     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 840
 841     if (sp<=ep)
 842         return ep - sp + 1;
 843     else
 844         return 0;
 845         }*/
 846
 847 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
 848 {
 849     TextPosition dist=0;
 850     while (!sampled->IsBitSet(i))
 851     {
 852         int c = alphabetrank->charAtPos(i);
 853         if (c == '\0')
 854         {
 855             // map::operator[] is not const, using find() instead:
 856             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 857             return endm.second + dist; // returns text position.
 858         }
 859         i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 860         ++dist;
 861     }
 862
 863     return suffixes[sampled->rank(i)-1]+dist;
 864 }
 865
 866 CSA::~CSA() {
 867     delete dynFMI;
 868     delete alphabetrank;
 869     delete sampled;
 870     delete [] suffixes;
 871     delete [] positions;
 872     delete [] codetable;
 873 }
 874
 875 void CSA::makewavelet(uchar *bwt)
 876 {
 877     ulong i, min = 0,
 878              max;
 879     for (i=0;i<256;i++)
 880         C[i]=0;
 881     for (i=0;i<n;++i)
 882         C[(int)bwt[i]]++;
 883     for (i=0;i<256;i++)
 884         if (C[i]>0) {min = i; break;}
 885     for (i=255;i>=min;--i)
 886         if (C[i]>0) {max = i; break;}
 887
 888     // Print frequencies
 889 /*    for(i = 0; i < 256; i++)
 890         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
 891         fflush(stdout);*/
 892
 893     ulong prev=C[0], temp;
 894     C[0]=0;
 895     for (i=1;i<256;i++) {
 896         temp = C[i];
 897         C[i]=C[i-1]+prev;
 898         prev = temp;
 899     }
 900     this->codetable = node::makecodetable(bwt,n);
 901     alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 902     //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
 903
 904 /*    for (i = 0; i < n; ++i)
 905     {
 906         uchar c = alphabetrank->charAtPos(i);
 907         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
 908         printf("LF[%lu] = %lu\n", i, j);
 909         }*/
 910 }
 911
 912 void CSA::maketables()
 913 {
 914     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 915
 916     ulong *sampledpositions = new ulong[n/W+1];
 917     suffixes = new ulong[sampleLength];
 918     positions = new ulong[sampleLength];
 919
 920     ulong i,j=0;
 921     for (i=0;i<n/W+1;i++)
 922         sampledpositions[i]=0lu;
 923
 924     ulong x,p=bwtEndPos;
 925     DocId textId = textLength.size();
 926     ulong ulongmax = 0;
 927     ulongmax--;
 928
 929    //positions:
 930     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
 931       // i substitutes SA->GetPos(i)
 932         x=(i==n-1)?0:i+1;
 933
 934         if (x % samplerate == 0) {
 935             Tools::SetField(sampledpositions,1,p,1);
 936             positions[x/samplerate] = p;
 937         }
 938
 939         uchar c = alphabetrank->charAtPos(p);
 940         if (c == '\0')
 941         {
 942             // Record the order of end-markers in BWT:
 943             --textId;
 944             endmarkers[p] = make_pair(textId, (TextPosition)x);
 945             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
 946             p = textId; // Correct LF-mapping to the last char of the previous text.
 947         }
 948         else
 949             p = C[c]+alphabetrank->rank(c, p)-1;
 950     }
 951     assert(textId == 0);
 952     /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
 953     {
 954         printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
 955         }*/
 956
 957     sampled = new BitRank(sampledpositions,n,true);
 958
 959    //suffixes:
 960     for(i=0; i<sampleLength; i++) {
 961         j = sampled->rank(positions[i]);
 962         if (j==0) j=sampleLength;
 963         suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
 964     }
 965 }
 966
 967 uchar * CSA::LoadFromFile(const char *filename)
 968 {
 969     uchar *s;
 970     std::ifstream file (filename, std::ios::in|std::ios::binary);
 971     if (file.is_open())
 972     {
 973         std::cerr << "Loading CSA from file: " << filename << std::endl;
 974         file.read((char *)&bwtEndPos, sizeof(TextPosition));
 975         s = new uchar[n];
 976         for (ulong offset = 0; offset < n; offset ++)
 977             file.read((char *)(s + offset), sizeof(char));
 978         file.close();
 979     }
 980     else
 981     {
 982         std::cerr << "Unable to open file " << filename << std::endl;
 983         exit(1);
 984     }
 985     return s;
 986 }
 987
 988 void CSA::SaveToFile(const char *filename, uchar *bwt)
 989 {
 990     std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
 991     if (file.is_open())
 992     {
 993         std::cerr << "Writing CSA to file: " << filename << std::endl;
 994         file.write((char *)&bwtEndPos, sizeof(TextPosition));
 995         std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
 996         for (ulong offset = 0; offset < n; offset ++)
 997             file.write((char *)(bwt + offset), sizeof(char));
 998         file.close();
 999     }
1000     else
1001     {
1002         std::cerr << "Unable to open file " << filename << std::endl;
1003         exit(1);
1004     }
1005 }
1006
1007 /*uchar * CSA::BWT(uchar *text)
1008 {
1009     uchar *s;
1010
1011     DynFMI *wt = new DynFMI((uchar *) text, n);
1012     s = wt->getBWT();
1013     for (ulong i=0;i<n;i++)
1014         if (s[i]==0u) {
1015             bwtEndPos = i;  // TODO: better solution ?
1016             i=n;
1017         }
1018
1019     delete wt;
1020     return s;
1021     }*/
1022
1023 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1024 {
1025     TCodeEntry *result = new TCodeEntry[ 256 ];
1026
1027     count_chars( text, n, result );
1028     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1029 //
1030 // First I push all the leaf nodes into the queue
1031 //
1032     for ( unsigned int i = 0 ; i < 256 ; i++ )
1033         if ( result[ i ].count )
1034             q.push(node( i, result[ i ].count ) );
1035 //
1036 // This loop removes the two smallest nodes from the
1037 // queue.  It creates a new internal node that has
1038 // those two nodes as children. The new internal node
1039 // is then inserted into the priority queue.  When there
1040 // is only one node in the priority queue, the tree
1041 // is complete.
1042 //
1043
1044     while ( q.size() > 1 ) {
1045         node *child0 = new node( q.top() );
1046         q.pop();
1047         node *child1 = new node( q.top() );
1048         q.pop();
1049         q.push( node( child0, child1 ) );
1050     }
1051 //
1052 // Now I compute and return the codetable
1053 //
1054     q.top().maketable(0u,0u, result);
1055     q.pop();
1056     return result;
1057 }
1058
1059
1060 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1061 {
1062     if ( child0 )
1063     {
1064         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1065         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1066         delete child0;
1067         delete child1;
1068     }
1069     else
1070     {
1071         codetable[value].code = code;
1072         codetable[value].bits = bits;
1073     }
1074 }
1075
1076 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1077 {
1078     ulong i;
1079     for (i = 0 ; i < 256 ; i++ )
1080         counts[ i ].count = 0;
1081     for (i=0; i<n; i++)
1082         counts[(int)text[i]].count++;
1083 }
1084
1085 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1086       return x | (bit << pos);
1087 }
1088