CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <stdexcept>
  29 #include <cassert>
  30 #include <cstring> // For strlen()
  31 using std::vector;
  32 using std::pair;
  33 using std::make_pair;
  34 using std::map;
  35
  36 using SXSI::TextCollection;
  37
  38
  39 ////////////////////////////////////////////////////////////////////////////
  40 // Class CSA::THuffAlphabetRank
  41
  42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  43     left = NULL;
  44     right = NULL;
  45     bitrank = NULL;
  46     ch = s[0];
  47     leaf = false;
  48     this->codetable = codetable;
  49
  50     bool *B = new bool[n];
  51     TextPosition sum=0,i;
  52     /*
  53     for (i=0; i< n; i++) {
  54        printf("%c:", (char)((int)s[i]-128));
  55        for (r=0;r<codetable[(int)s[i]].bits;r++)
  56           if (codetable[(int)s[i]].code & (1u <<r))
  57              printf("1");
  58           else printf("0");
  59        printf("\n");
  60     }
  61     printf("\n");
  62     if (level > 100) return;*/
  63     for (i=0;i<n;i++)
  64        if (codetable[(int)s[i]].code & (1u << level)) {
  65           B[i] = true;
  66           sum++;
  67        }
  68        else B[i] = false;
  69     if (sum==0 || sum==n) {
  70         delete [] B;
  71         leaf = true;
  72         return;
  73     }
  74     uchar *sfirst, *ssecond;
  75     //if (n-sum > 0)
  76         sfirst = new uchar[n-sum];
  77     //if (sum > 0)
  78         ssecond = new uchar[sum];
  79     unsigned j=0,k=0;
  80    for (i=0;i<n;i++)
  81         if (B[i]) ssecond[k++] = s[i];
  82         else sfirst[j++] = s[i];
  83     ulong *Binbits = new ulong[n/W+1];
  84     for (i=0;i<n;i++)
  85         Tools::SetField(Binbits,1,i,B[i]);
  86     delete [] B;
  87     bitrank = new BitRank(Binbits,n,true);
  88     //if (j > 0) {
  89         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  90         delete [] sfirst;
  91     //}
  92     //if (k>0) {
  93         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  94         delete [] ssecond;
  95     //}
  96 }
  97
  98
  99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 100     // testing that the code works correctly
 101     int C[256];
 102     unsigned i,j;
 103     bool correct=true;
 104     for (j=0;j<256;j++)
 105         C[j] = 0;
 106     for (i=0;i<n;i++) {
 107         C[(int)s[i]]++;
 108         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 109         correct = false;
 110         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 111         }
 112     }
 113     return correct;
 114 }
 115
 116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 117     if (left!=NULL) delete left;
 118     if (right!=NULL) delete right;
 119     if (bitrank!=NULL)
 120         delete bitrank;
 121 }
 122
 123
 124 ////////////////////////////////////////////////////////////////////////////
 125 // Class CSA
 126
 127 /**
 128  * Constructor inits an empty dynamic FM-index.
 129  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 130  */
 131 CSA::CSA(unsigned samplerate)
 132     : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
 133       codetable(0), dynFMI(0)
 134 {
 135     this->samplerate = samplerate;
 136
 137     // FIXME TODO : DynFMI needs distribution of characters before hand
 138     // This will create fully balanced wavelet tree for all chars [0, 255].
 139     uchar temp[256];
 140     for (unsigned i = 0; i < 255; ++i)
 141         temp[i] = i+1;
 142     temp[255] = 0;
 143     dynFMI = new DynFMI(temp, 1, 255, false);
 144 }
 145
 146 /**
 147  * Insert new text
 148  *
 149  * Given text must include end-marker.
 150  * Text identifiers are numbered starting from 0.
 151  */
 152 void CSA::InsertText(uchar const * text)
 153 {
 154     // Sanity check:
 155     assert(dynFMI != 0);
 156
 157     TextPosition m = std::strlen((char *)text) + 1;
 158     // Store text length and starting position
 159     textLength.push_back(make_pair(m, n));
 160     this->n += m;
 161     dynFMI->addText(text, m);
 162 }
 163
 164 void CSA::MakeStatic()
 165 {
 166     // Sanity check:
 167     if (dynFMI == 0)
 168     {
 169         std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
 170         std::exit(0);
 171     }
 172
 173     uchar *bwt = dynFMI->getBWT();
 174     /*printf("1234567890123456789\n");
 175     for (TextPosition i = 0; i < n; i ++)
 176         if (bwt[i] < 50)
 177             printf("%d", (int)bwt[i]);
 178         else
 179             printf("%c", bwt[i]);
 180     printf("\n");
 181     */
 182
 183 /*    for (TextPosition i = 1; i <= n; i ++)
 184     {
 185         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 186     }*/
 187
 188     // Sanity check
 189     assert(textLength.size() == dynFMI->getCollectionSize());
 190
 191     delete dynFMI;
 192     dynFMI = 0;
 193
 194     makewavelet(bwt);
 195
 196     // Calculate BWT end-marker position (of last inserted text)
 197     ulong i = 0;
 198     while (bwt[i] != '\0')
 199     {
 200         uchar c = bwt[i];
 201         i = C[c]+alphabetrank->rank(c, i)-1;
 202     }
 203     bwtEndPos = i;
 204     //printf("bwtEndPos = %lu\n", bwtEndPos);
 205
 206     delete [] bwt;
 207
 208     // Make sampling tables
 209     maketables();
 210 }
 211
 212 bool CSA::EmptyText(DocId k) const
 213 {
 214   assert(k < (DocId)textLength.size());
 215   return (1 == textLength[k].first);
 216 }
 217
 218 uchar* CSA::GetText(DocId k) const
 219 {
 220     assert(k < (DocId)textLength.size());
 221
 222     uchar* result = new uchar[textLength[k].first];
 223     TextPosition i = k;
 224     TextPosition j = textLength[k].first-1;
 225     result[j] = '\0';
 226
 227     uchar c = alphabetrank->charAtPos(i);
 228     while (c != '\0')
 229     {
 230         --j;
 231         result[j] = c;
 232         i = C[c]+alphabetrank->rank(c,i)-1;
 233
 234         c = alphabetrank->charAtPos(i); // "next" char.
 235     }
 236     assert(j == 0);
 237     return result;
 238 }
 239
 240 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 241 {
 242     assert(k < (DocId)textLength.size());
 243     assert(j < textLength[k].first);
 244     assert(i <= j);
 245
 246     // Start position of k'th text
 247     ulong start = textLength[k].second;
 248
 249     return Substring(i + start, j-i+1);
 250 }
 251
 252 /**
 253  * Existential queries
 254  */
 255 bool CSA::IsPrefix(uchar const * pattern) const
 256 {
 257     TextPosition m = strlen((char *)pattern);
 258     if (m == 0)
 259         return textLength.size();
 260
 261     TextPosition sp = 0, ep = 0;
 262     Search(pattern, m, &sp, &ep);
 263
 264     // Check for end-marker(s) in result interval
 265     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 266     it = endmarkers.lower_bound(sp);
 267     if (it != endmarkers.end() && it->first <= ep)
 268         return true;
 269     return false;
 270 }
 271
 272 bool CSA::IsSuffix(uchar const *pattern) const
 273 {
 274     // Here counting is as fast as isSuffix():
 275     if (CountSuffix(pattern) > 0)
 276         return true;
 277     return false;
 278 }
 279
 280 bool CSA::IsEqual(uchar const *pattern) const
 281 {
 282     TextPosition m = std::strlen((char *)pattern);
 283     if (m == 0)
 284         return textLength.size();
 285
 286     TextPosition sp = 0, ep = 0;
 287     // Match including end-marker
 288     Search(pattern, m+1, &sp, &ep);
 289
 290     // Check for end-marker(s) in result interval
 291     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 292     it = endmarkers.lower_bound(sp);
 293     if (it != endmarkers.end() && it->first <= ep)
 294         return true;
 295     return false;
 296 }
 297
 298 bool CSA::IsContains(uchar const * pattern) const
 299 {
 300     TextPosition m = strlen((char *)pattern);
 301     if (m == 0)
 302         return true;
 303
 304     TextPosition sp = 0, ep = 0;
 305     // Just check if pattern exists somewhere
 306     ulong count = Search(pattern, m, &sp, &ep);
 307
 308     if (count > 0)
 309         return true;
 310     return false;
 311 }
 312
 313 bool CSA::IsLessThan(uchar const*) const
 314 {
 315     // TODO
 316     return false;
 317 }
 318
 319 /**
 320  * Counting queries
 321  */
 322 unsigned CSA::CountPrefix(uchar const * pattern) const
 323 {
 324     TextPosition m = strlen((char *)pattern);
 325     if (m == 0)
 326         return textLength.size();
 327
 328     TextPosition sp = 0, ep = 0;
 329     Search(pattern, m, &sp, &ep);
 330
 331     // Count end-markers in result interval
 332     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 333     it = endmarkers.lower_bound(sp);
 334     unsigned count = 0;
 335     while (it != endmarkers.end() && it->first <= ep)
 336     {
 337         ++ count;
 338         ++ it;
 339     }
 340
 341     return count;
 342 }
 343
 344 unsigned CSA::CountSuffix(uchar const * pattern) const
 345 {
 346     TextPosition m = strlen((char *)pattern);
 347     if (m == 0)
 348         return textLength.size();
 349
 350     TextPosition sp = 0, ep = 0;
 351     // Search with end-marker
 352     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 353
 354     return count;
 355 }
 356
 357 unsigned CSA::CountEqual(uchar const *pattern) const
 358 {
 359     TextPosition m = strlen((char const *)pattern);
 360     if (m == 0)
 361         return textLength.size();
 362
 363     TextPosition sp = 0, ep = 0;
 364     // Match including end-marker
 365     Search(pattern, m+1, &sp, &ep);
 366
 367     // Count end-markers in result interval
 368     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 369     it = endmarkers.lower_bound(sp);
 370     unsigned count = 0;
 371     while (it != endmarkers.end() && it->first <= ep)
 372     {
 373         ++ count;
 374         ++ it;
 375     }
 376
 377     return count;
 378 }
 379
 380 unsigned CSA::CountContains(uchar const * pattern) const
 381 {
 382     // Here counting is as slow as fetching the result set
 383     // because we would have to filter out occ's that fall within same document.
 384     TextCollection::document_result result = Contains(pattern);
 385     return result.size();
 386 }
 387
 388 unsigned CSA::CountLessThan(uchar const * pattern) const
 389 {
 390     // TODO
 391     return 0;
 392 }
 393
 394 /**
 395  * Document reporting queries
 396  */
 397 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 398 {
 399     TextPosition m = strlen((char *)pattern);
 400     if (m == 0)
 401         return TextCollection::document_result();
 402
 403     TextPosition sp = 0, ep = 0;
 404     Search(pattern, m, &sp, &ep);
 405
 406     TextCollection::document_result result;
 407
 408     // Report end-markers in result interval
 409     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 410     it = endmarkers.lower_bound(sp);
 411     unsigned count = 0;
 412     while (it != endmarkers.end() && it->first <= ep)
 413     {
 414         // End-marker we found belongs to the "previous" doc in the collection
 415         DocId docId = ((it->second).first + 1) % textLength.size();
 416         result.push_back(docId);
 417         ++ count;
 418         ++ it;
 419     }
 420
 421     return result;
 422 }
 423
 424 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 425 {
 426     TextPosition m = strlen((char *)pattern);
 427     if (m == 0)
 428         return TextCollection::document_result();
 429
 430     TextPosition sp = 0, ep = 0;
 431     // Search with end-marker
 432     Search(pattern, m+1, &sp, &ep);
 433
 434     TextCollection::document_result result;
 435
 436     // Check each occurrence
 437     for (; sp <= ep; ++sp)
 438     {
 439         TextPosition i = sp;
 440         uchar c = alphabetrank->charAtPos(i);
 441         while (c != '\0' && !sampled->IsBitSet(i))
 442         {
 443             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 444             c = alphabetrank->charAtPos(i);
 445         }
 446         if (c == '\0')
 447         {
 448             // map::operator[] is not const, using find() instead:
 449             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 450             // End-marker that we found belongs to the "previous" doc in collection:
 451             DocId docId = (endm.first + 1) % textLength.size();
 452             result.push_back(docId);
 453         }
 454         else
 455         {
 456             TextPosition textPos = suffixes[sampled->rank(i)-1];
 457             result.push_back(DocIdAtTextPos(textPos));
 458         }
 459     }
 460
 461     return result;
 462 }
 463
 464 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 465 {
 466     TextPosition m = strlen((char const *)pattern);
 467     if (m == 0)
 468         return TextCollection::document_result();
 469
 470     TextPosition sp = 0, ep = 0;
 471     // Match including end-marker
 472     Search(pattern, m+1, &sp, &ep);
 473
 474     TextCollection::document_result result;
 475
 476     // Report end-markers in result interval
 477     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 478     it = endmarkers.lower_bound(sp);
 479     unsigned count = 0;
 480     while (it != endmarkers.end() && it->first <= ep)
 481     {
 482         // End-marker that we found belongs to the "previous" doc in collection:
 483         DocId docId = ((it->second).first + 1) % textLength.size();
 484         result.push_back(docId);
 485         ++ count;
 486         ++ it;
 487     }
 488
 489     return result;
 490 }
 491
 492
 493 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 494 {
 495     TextPosition m = strlen((char *)pattern);
 496     if (m == 0)
 497         return TextCollection::document_result();
 498
 499     TextPosition sp = 0, ep = 0;
 500     // Search all occurrences
 501     Search(pattern, m, &sp, &ep);
 502
 503     // We want unique document indentifiers, using std::set to collect them
 504     std::set<DocId> resultSet;
 505
 506     // Check each occurrence
 507     for (; sp <= ep; ++sp)
 508     {
 509         TextPosition i = sp;
 510         uchar c = alphabetrank->charAtPos(i);
 511         while (c != '\0' && !sampled->IsBitSet(i))
 512         {
 513             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 514             c = alphabetrank->charAtPos(i);
 515         }
 516         if (c == '\0')
 517         {
 518             // map::operator[] is not const, using find() instead:
 519             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 520             // End-marker we found belongs to the "previous" doc in collection
 521             DocId docId = (endm.first + 1) % textLength.size();
 522             resultSet.insert(docId);
 523         }
 524         else
 525         {
 526             TextPosition textPos = suffixes[sampled->rank(i)-1];
 527             resultSet.insert(DocIdAtTextPos(textPos));
 528         }
 529     }
 530
 531     // Convert std::set to std::vector (TODO better way to construct result vector?)
 532     TextCollection::document_result result;
 533     for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
 534         result.push_back(*it);
 535     return result;
 536 }
 537
 538 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 539 {
 540     // TODO
 541     return document_result();
 542 }
 543
 544 /**
 545  * Full result set queries
 546  */
 547 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 548 {
 549     TextPosition m = strlen((char *)pattern);
 550     if (m == 0)
 551         return full_result();
 552
 553     TextPosition sp = 0, ep = 0;
 554     // Search all occurrences
 555     Search(pattern, m, &sp, &ep);
 556
 557     full_result result;
 558
 559     // Report each occurrence
 560     for (; sp <= ep; ++sp)
 561     {
 562         TextPosition i = sp;
 563         TextPosition dist = 0;
 564         uchar c = alphabetrank->charAtPos(i);
 565         while (c != '\0' && !sampled->IsBitSet(i))
 566         {
 567             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 568             c = alphabetrank->charAtPos(i);
 569             ++ dist;
 570         }
 571         if (c == '\0')
 572         {
 573             // map::operator[] is not const, using find() instead:
 574             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 575             // End-marker we found belongs to the "previous" doc in the collection:
 576             DocId docId = (endm.first+1)%textLength.size();
 577             result.push_back(make_pair(docId, dist));
 578         }
 579         else
 580         {
 581             TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
 582             // Read document identifier and its starting position in text order
 583             DocId docId = DocIdAtTextPos(textPos);
 584             result.push_back(make_pair(docId, textPos - textLength[docId].second));
 585         }
 586     }
 587
 588     return result;
 589 }
 590
 591
 592 /**
 593  * Finds document identifier for given text position
 594  *
 595  * Starting text position of the document is stored into second parameter.
 596  * Binary searching on text starting positions.
 597  */
 598 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
 599 {
 600     assert(i < n);
 601
 602     DocId a = 0;
 603     DocId b = textLength.size() - 1;
 604     while (a < b)
 605     {
 606         DocId c = a + (b - a)/2;
 607         if (textLength[c].second > i)
 608             b = c - 1;
 609         else if (textLength[c+1].second > i)
 610             return c;
 611         else
 612             a = c + 1;
 613     }
 614
 615     assert(a < (DocId)textLength.size());
 616     assert(i > textLength[a].second);
 617     assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
 618     return a;
 619 }
 620
 621 /**
 622  * Save index to a file handle
 623  *
 624  * Throws a std::runtime_error exception on i/o error.
 625  * First byte that is saved represents the version number of the save file.
 626  * In version 1 files, the data fields are:
 627  *        <uchar version> version info;
 628  *        <unsigned s>    samplerate;
 629  *        <ulong n>       length of the BWT;
 630  *        <ulong bwt>     end marker position in BWT;
 631  *        <uchar *>       BWT string of length n;
 632  *        <unsigned r>    number of texts;
 633  *        <vector textLength>
 634  *                        array of <ulong, ulong> pairs.
 635  *
 636  * TODO: Save the data structures instead of BWT sequence?
 637  */
 638 void CSA::Save(FILE *file) const
 639 {
 640     // Saving version 1 data:
 641     uchar versionFlag = 1;
 642     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 643         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 644
 645     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 646         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 647
 648     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 649         throw std::runtime_error("CSA::Save(): file write error (n).");
 650
 651     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 652         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 653
 654     for (ulong offset = 0; offset < n; offset ++)
 655     {
 656         uchar c = alphabetrank->charAtPos(offset);
 657         if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
 658             throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
 659     }
 660
 661     unsigned r = textLength.size();
 662     if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
 663         throw std::runtime_error("CSA::Save(): file write error (r).");
 664
 665     for (r = 0; r < textLength.size(); ++ r)
 666     {
 667         if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1)
 668             throw std::runtime_error("CSA::Save(): file write error (text length).");
 669         if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1)
 670             throw std::runtime_error("CSA::Save(): file write error (text start).");
 671     }
 672 }
 673
 674
 675 /**
 676  * Load index from a file handle
 677  *
 678  * Throws a std::runtime_error exception on i/o error.
 679  * For more info, see CSA::Save().
 680  */
 681 void CSA::Load(FILE *file, unsigned samplerate)
 682 {
 683     // Delete everything
 684     delete dynFMI;       dynFMI = 0;
 685     delete alphabetrank; alphabetrank = 0;
 686     delete sampled;      sampled = 0;
 687     delete [] suffixes;  suffixes = 0;
 688     delete [] positions; positions = 0;
 689     delete [] codetable; codetable = 0;
 690
 691     endmarkers.clear();
 692     textLength.clear();
 693
 694     this->samplerate = samplerate;
 695     this->n = 0;
 696
 697     uchar versionFlag = 0;
 698     if (std::fread(&versionFlag, 1, 1, file) != 1)
 699         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 700     if (versionFlag != 1)
 701         throw std::runtime_error("CSA::Load(): invalid start byte.");
 702
 703     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 704         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 705     if (this->samplerate == 0)
 706         this->samplerate = samplerate;
 707
 708     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 709         throw std::runtime_error("CSA::Load(): file read error (n).");
 710
 711     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 712         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 713
 714     uchar *bwt = new uchar[n];
 715     for (ulong offset = 0; offset < n; offset ++)
 716         if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
 717             throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
 718
 719     unsigned r = 0;
 720     if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
 721         throw std::runtime_error("CSA::Load(): file read error (r).");
 722
 723     while (r > 0)
 724     {
 725         TextPosition length = 0, start = 0;
 726         if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
 727             throw std::runtime_error("CSA::Load(): file read error (text length).");
 728         if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
 729             throw std::runtime_error("CSA::Load(): file read error (text start).");
 730
 731         textLength.push_back(make_pair(length, start));
 732         --r;
 733     }
 734
 735     // Construct data structures
 736     makewavelet(bwt);
 737     delete [] bwt;
 738     maketables();
 739 }
 740
 741
 742
 743 /**
 744  * Rest of the functions follow...
 745  */
 746
 747 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 748 {
 749     uchar *result = new uchar[l + 1];
 750     if (l == 0)
 751     {
 752         result[0] = 0u;
 753         return result;
 754     }
 755
 756     TextPosition dist;
 757     TextPosition k = i + l - 1;
 758     // Check for end of the string
 759     if (k > n - 1)
 760     {
 761         l -= k - n + 1;
 762         k = n - 1;
 763     }
 764
 765     TextPosition skip = samplerate - k % samplerate - 1;
 766     TextPosition j;
 767     if (k / samplerate + 1 >= n / samplerate)
 768     {
 769         j = bwtEndPos;
 770         skip = n - k - 1;
 771     }
 772     else
 773     {
 774         j = positions[k/samplerate+1];
 775         //cout << samplerate << ' ' << j << '\n';
 776     }
 777
 778     for (dist = 0; dist < skip + l; dist++)
 779     {
 780         int c = alphabetrank->charAtPos(j);
 781         if (c == '\0')
 782         {
 783             // map::operator[] is not const, using find() instead:
 784             pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
 785             j = endm.first; // LF-mapping for end-marker
 786         }
 787         else
 788             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 789         if (dist >= skip)
 790             result[l + skip - dist - 1] = c;
 791     }
 792     result[l] = 0u;
 793     return result;
 794 }
 795
 796   /*TextPosition CSA::inverse(TextPosition i)
 797 {
 798     TextPosition skip = samplerate - i % samplerate;
 799     TextPosition j;
 800     if (i / samplerate + 1 >= n / samplerate)
 801     {
 802         j = bwtEndPos;
 803         skip = n - i;
 804     }
 805     else
 806     {
 807         j = positions[i/samplerate+1];
 808         //cout << samplerate << ' ' << j << '\n';
 809     }
 810
 811     while (skip > 0)
 812     {
 813         int c = alphabetrank->charAtPos(j);
 814         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 815         skip --;
 816     }
 817     return j;
 818     }*/
 819
 820 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
 821     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 822     int c = (int)pattern[m-1];
 823     TextPosition i=m-1;
 824     TextPosition sp = C[c];
 825     TextPosition ep = C[c+1]-1;
 826     while (sp<=ep && i>=1)
 827     {
 828 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 829         c = (int)pattern[--i];
 830         sp = C[c]+alphabetrank->rank(c,sp-1);
 831         ep = C[c]+alphabetrank->rank(c,ep)-1;
 832     }
 833     *spResult = sp;
 834     *epResult = ep;
 835     if (sp<=ep)
 836         return ep - sp + 1;
 837     else
 838         return 0;
 839  }
 840
 841    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 842 {
 843     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 844     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 845
 846     if (sp<=ep)
 847         return ep - sp + 1;
 848     else
 849         return 0;
 850         }*/
 851
 852 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
 853 {
 854     TextPosition dist=0;
 855     while (!sampled->IsBitSet(i))
 856     {
 857         int c = alphabetrank->charAtPos(i);
 858         if (c == '\0')
 859         {
 860             // map::operator[] is not const, using find() instead:
 861             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 862             return endm.second + dist; // returns text position.
 863         }
 864         i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 865         ++dist;
 866     }
 867
 868     return suffixes[sampled->rank(i)-1]+dist;
 869 }
 870
 871 CSA::~CSA() {
 872     delete dynFMI;
 873     delete alphabetrank;
 874     delete sampled;
 875     delete [] suffixes;
 876     delete [] positions;
 877     delete [] codetable;
 878 }
 879
 880 void CSA::makewavelet(uchar *bwt)
 881 {
 882     ulong i, min = 0,
 883              max;
 884     for (i=0;i<256;i++)
 885         C[i]=0;
 886     for (i=0;i<n;++i)
 887         C[(int)bwt[i]]++;
 888     for (i=0;i<256;i++)
 889         if (C[i]>0) {min = i; break;}
 890     for (i=255;i>=min;--i)
 891         if (C[i]>0) {max = i; break;}
 892
 893     // Print frequencies
 894 /*    for(i = 0; i < 256; i++)
 895         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
 896         fflush(stdout);*/
 897
 898     ulong prev=C[0], temp;
 899     C[0]=0;
 900     for (i=1;i<256;i++) {
 901         temp = C[i];
 902         C[i]=C[i-1]+prev;
 903         prev = temp;
 904     }
 905     this->codetable = node::makecodetable(bwt,n);
 906     alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 907     //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
 908
 909 /*    for (i = 0; i < n; ++i)
 910     {
 911         uchar c = alphabetrank->charAtPos(i);
 912         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
 913         printf("LF[%lu] = %lu\n", i, j);
 914         }*/
 915 }
 916
 917 void CSA::maketables()
 918 {
 919     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 920
 921     ulong *sampledpositions = new ulong[n/W+1];
 922     suffixes = new ulong[sampleLength];
 923     positions = new ulong[sampleLength];
 924
 925     ulong i,j=0;
 926     for (i=0;i<n/W+1;i++)
 927         sampledpositions[i]=0lu;
 928
 929     ulong x,p=bwtEndPos;
 930     DocId textId = textLength.size();
 931     ulong ulongmax = 0;
 932     ulongmax--;
 933
 934    //positions:
 935     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
 936       // i substitutes SA->GetPos(i)
 937         x=(i==n-1)?0:i+1;
 938
 939         if (x % samplerate == 0) {
 940             Tools::SetField(sampledpositions,1,p,1);
 941             positions[x/samplerate] = p;
 942         }
 943
 944         uchar c = alphabetrank->charAtPos(p);
 945         if (c == '\0')
 946         {
 947             // Record the order of end-markers in BWT:
 948             --textId;
 949             endmarkers[p] = make_pair(textId, (TextPosition)x);
 950             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
 951             p = textId; // Correct LF-mapping to the last char of the previous text.
 952         }
 953         else
 954             p = C[c]+alphabetrank->rank(c, p)-1;
 955     }
 956     assert(textId == 0);
 957     /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
 958     {
 959         printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
 960         }*/
 961
 962     sampled = new BitRank(sampledpositions,n,true);
 963
 964    //suffixes:
 965     for(i=0; i<sampleLength; i++) {
 966         j = sampled->rank(positions[i]);
 967         if (j==0) j=sampleLength;
 968         suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
 969     }
 970 }
 971
 972 uchar * CSA::LoadFromFile(const char *filename)
 973 {
 974     uchar *s;
 975     std::ifstream file (filename, std::ios::in|std::ios::binary);
 976     if (file.is_open())
 977     {
 978         std::cerr << "Loading CSA from file: " << filename << std::endl;
 979         file.read((char *)&bwtEndPos, sizeof(TextPosition));
 980         s = new uchar[n];
 981         for (ulong offset = 0; offset < n; offset ++)
 982             file.read((char *)(s + offset), sizeof(char));
 983         file.close();
 984     }
 985     else
 986     {
 987         std::cerr << "Unable to open file " << filename << std::endl;
 988         exit(1);
 989     }
 990     return s;
 991 }
 992
 993 void CSA::SaveToFile(const char *filename, uchar *bwt)
 994 {
 995     std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
 996     if (file.is_open())
 997     {
 998         std::cerr << "Writing CSA to file: " << filename << std::endl;
 999         file.write((char *)&bwtEndPos, sizeof(TextPosition));
1000         std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
1001         for (ulong offset = 0; offset < n; offset ++)
1002             file.write((char *)(bwt + offset), sizeof(char));
1003         file.close();
1004     }
1005     else
1006     {
1007         std::cerr << "Unable to open file " << filename << std::endl;
1008         exit(1);
1009     }
1010 }
1011
1012 /*uchar * CSA::BWT(uchar *text)
1013 {
1014     uchar *s;
1015
1016     DynFMI *wt = new DynFMI((uchar *) text, n);
1017     s = wt->getBWT();
1018     for (ulong i=0;i<n;i++)
1019         if (s[i]==0u) {
1020             bwtEndPos = i;  // TODO: better solution ?
1021             i=n;
1022         }
1023
1024     delete wt;
1025     return s;
1026     }*/
1027
1028 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1029 {
1030     TCodeEntry *result = new TCodeEntry[ 256 ];
1031
1032     count_chars( text, n, result );
1033     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1034 //
1035 // First I push all the leaf nodes into the queue
1036 //
1037     for ( unsigned int i = 0 ; i < 256 ; i++ )
1038         if ( result[ i ].count )
1039             q.push(node( i, result[ i ].count ) );
1040 //
1041 // This loop removes the two smallest nodes from the
1042 // queue.  It creates a new internal node that has
1043 // those two nodes as children. The new internal node
1044 // is then inserted into the priority queue.  When there
1045 // is only one node in the priority queue, the tree
1046 // is complete.
1047 //
1048
1049     while ( q.size() > 1 ) {
1050         node *child0 = new node( q.top() );
1051         q.pop();
1052         node *child1 = new node( q.top() );
1053         q.pop();
1054         q.push( node( child0, child1 ) );
1055     }
1056 //
1057 // Now I compute and return the codetable
1058 //
1059     q.top().maketable(0u,0u, result);
1060     q.pop();
1061     return result;
1062 }
1063
1064
1065 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1066 {
1067     if ( child0 )
1068     {
1069         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1070         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1071         delete child0;
1072         delete child1;
1073     }
1074     else
1075     {
1076         codetable[value].code = code;
1077         codetable[value].bits = bits;
1078     }
1079 }
1080
1081 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1082 {
1083     ulong i;
1084     for (i = 0 ; i < 256 ; i++ )
1085         counts[ i ].count = 0;
1086     for (i=0; i<n; i++)
1087         counts[(int)text[i]].count++;
1088 }
1089
1090 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1091       return x | (bit << pos);
1092 }
1093