CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <cassert>
  29 #include <cstring> // For strlen()
  30 using std::vector;
  31 using std::pair;
  32 using std::make_pair;
  33 using std::map;
  34
  35 using SXSI::TextCollection;
  36
  37
  38 ////////////////////////////////////////////////////////////////////////////
  39 // Class CSA::THuffAlphabetRank
  40
  41 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  42     left = NULL;
  43     right = NULL;
  44     bitrank = NULL;
  45     ch = s[0];
  46     leaf = false;
  47     this->codetable = codetable;
  48
  49     bool *B = new bool[n];
  50     TextPosition sum=0,i;
  51     /*
  52     for (i=0; i< n; i++) {
  53        printf("%c:", (char)((int)s[i]-128));
  54        for (r=0;r<codetable[(int)s[i]].bits;r++)
  55           if (codetable[(int)s[i]].code & (1u <<r))
  56              printf("1");
  57           else printf("0");
  58        printf("\n");
  59     }
  60     printf("\n");
  61     if (level > 100) return;*/
  62     for (i=0;i<n;i++)
  63        if (codetable[(int)s[i]].code & (1u << level)) {
  64           B[i] = true;
  65           sum++;
  66        }
  67        else B[i] = false;
  68     if (sum==0 || sum==n) {
  69         delete [] B;
  70         leaf = true;
  71         return;
  72     }
  73     uchar *sfirst, *ssecond;
  74     //if (n-sum > 0)
  75         sfirst = new uchar[n-sum];
  76     //if (sum > 0)
  77         ssecond = new uchar[sum];
  78     unsigned j=0,k=0;
  79    for (i=0;i<n;i++)
  80         if (B[i]) ssecond[k++] = s[i];
  81         else sfirst[j++] = s[i];
  82     ulong *Binbits = new ulong[n/W+1];
  83     for (i=0;i<n;i++)
  84         Tools::SetField(Binbits,1,i,B[i]);
  85     delete [] B;
  86     bitrank = new BitRank(Binbits,n,true);
  87     //if (j > 0) {
  88         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  89         delete [] sfirst;
  90     //}
  91     //if (k>0) {
  92         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  93         delete [] ssecond;
  94     //}
  95 }
  96
  97
  98 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
  99     // testing that the code works correctly
 100     int C[256];
 101     unsigned i,j;
 102     bool correct=true;
 103     for (j=0;j<256;j++)
 104         C[j] = 0;
 105     for (i=0;i<n;i++) {
 106         C[(int)s[i]]++;
 107         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 108         correct = false;
 109         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 110         }
 111     }
 112     return correct;
 113 }
 114
 115 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 116     if (left!=NULL) delete left;
 117     if (right!=NULL) delete right;
 118     if (bitrank!=NULL)
 119         delete bitrank;
 120 }
 121
 122
 123 ////////////////////////////////////////////////////////////////////////////
 124 // Class CSA
 125
 126 /**
 127  * Constructor inits an empty dynamic FM-index.
 128  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 129  */
 130 CSA::CSA(unsigned samplerate)
 131     : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
 132       codetable(0), dynFMI(0)
 133 {
 134     this->samplerate = samplerate;
 135
 136     // FIXME TODO : DynFMI needs distribution of characters before hand
 137     // This will create fully balanced wavelet tree for all chars [0, 255].
 138     uchar temp[256];
 139     for (unsigned i = 0; i < 255; ++i)
 140         temp[i] = i+1;
 141     temp[255] = 0;
 142     dynFMI = new DynFMI(temp, 1, 255, false);
 143 }
 144
 145 /**
 146  * Insert new text
 147  *
 148  * Given text must include end-marker.
 149  * Text identifiers are numbered starting from 0.
 150  */
 151 void CSA::InsertText(uchar const * text)
 152 {
 153     // Sanity check:
 154     assert(dynFMI != 0);
 155
 156     TextPosition m = std::strlen((char *)text) + 1;
 157     // Store text length and starting position
 158     textLength.push_back(make_pair(m, n));
 159     this->n += m;
 160     dynFMI->addText(text, m);
 161 }
 162
 163 void CSA::MakeStatic()
 164 {
 165     // Sanity check:
 166     if (dynFMI == 0)
 167     {
 168         std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
 169         std::exit(0);
 170     }
 171
 172     uchar *bwt = dynFMI->getBWT();
 173     /*printf("1234567890123456789\n");
 174     for (TextPosition i = 0; i < n; i ++)
 175         if (bwt[i] < 50)
 176             printf("%d", (int)bwt[i]);
 177         else
 178             printf("%c", bwt[i]);
 179     printf("\n");
 180     */
 181
 182 /*    for (TextPosition i = 1; i <= n; i ++)
 183     {
 184         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 185     }*/
 186
 187     // Sanity check
 188     assert(textLength.size() == dynFMI->getCollectionSize());
 189
 190     delete dynFMI;
 191     dynFMI = 0;
 192
 193     ulong i, min = 0,
 194              max;
 195     for (i=0;i<256;i++)
 196         C[i]=0;
 197     for (i=0;i<n;++i)
 198         C[(int)bwt[i]]++;
 199     for (i=0;i<256;i++)
 200         if (C[i]>0) {min = i; break;}
 201     for (i=255;i>=min;--i)
 202         if (C[i]>0) {max = i; break;}
 203
 204     // Print frequencies
 205 /*    for(i = 0; i < 256; i++)
 206         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
 207         fflush(stdout);*/
 208
 209     ulong prev=C[0], temp;
 210     C[0]=0;
 211     for (i=1;i<256;i++) {
 212         temp = C[i];
 213         C[i]=C[i-1]+prev;
 214         prev = temp;
 215     }
 216     this->codetable = node::makecodetable(bwt,n);
 217     alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 218     //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
 219
 220 /*    for (i = 0; i < n; ++i)
 221     {
 222         uchar c = alphabetrank->charAtPos(i);
 223         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
 224         printf("LF[%lu] = %lu\n", i, j);
 225         }*/
 226
 227     // Calculate BWT end-marker position (of last inserted text)
 228     i = 0;
 229     while (bwt[i] != '\0')
 230     {
 231         uchar c = alphabetrank->charAtPos(i);
 232         i = C[c]+alphabetrank->rank(c, i)-1;
 233     }
 234     bwtEndPos = i;
 235     //printf("bwtEndPos = %lu\n", bwtEndPos);
 236
 237     delete [] bwt;
 238
 239     // Make sampling tables
 240     maketables();
 241     // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small
 242     // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+...
 243     //printf("FMindex takes %d B\n",
 244     //    6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8);
 245 }
 246
 247
 248 uchar* CSA::GetText(DocId k) const
 249 {
 250     assert(k < (DocId)textLength.size());
 251
 252     uchar* result = new uchar[textLength[k].first];
 253     TextPosition i = k;
 254     TextPosition j = textLength[k].first-1;
 255     result[j] = '\0';
 256
 257     uchar c = alphabetrank->charAtPos(i);
 258     while (c != '\0')
 259     {
 260         --j;
 261         result[j] = c;
 262         i = C[c]+alphabetrank->rank(c,i)-1;
 263
 264         c = alphabetrank->charAtPos(i); // "next" char.
 265     }
 266     assert(j == 0);
 267     return result;
 268 }
 269
 270 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 271 {
 272     assert(k < (DocId)textLength.size());
 273     assert(j < textLength[k].first);
 274     assert(i <= j);
 275
 276     // Start position of k'th text
 277     ulong start = textLength[k].second;
 278
 279     return Substring(i + start, j-i+1);
 280 }
 281
 282 /**
 283  * Existential queries
 284  */
 285 bool CSA::IsPrefix(uchar const * pattern) const
 286 {
 287     TextPosition m = strlen((char *)pattern);
 288     if (m == 0)
 289         return textLength.size();
 290
 291     TextPosition sp = 0, ep = 0;
 292     Search(pattern, m, &sp, &ep);
 293
 294     // Check for end-marker(s) in result interval
 295     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 296     it = endmarkers.lower_bound(sp);
 297     if (it != endmarkers.end() && it->first <= ep)
 298         return true;
 299     return false;
 300 }
 301
 302 bool CSA::IsSuffix(uchar const *pattern) const
 303 {
 304     // Here counting is as fast as isSuffix():
 305     if (CountSuffix(pattern) > 0)
 306         return true;
 307     return false;
 308 }
 309
 310 bool CSA::IsEqual(uchar const *pattern) const
 311 {
 312     TextPosition m = std::strlen((char *)pattern);
 313     if (m == 0)
 314         return textLength.size();
 315
 316     TextPosition sp = 0, ep = 0;
 317     // Match including end-marker
 318     Search(pattern, m+1, &sp, &ep);
 319
 320     // Check for end-marker(s) in result interval
 321     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 322     it = endmarkers.lower_bound(sp);
 323     if (it != endmarkers.end() && it->first <= ep)
 324         return true;
 325     return false;
 326 }
 327
 328 bool CSA::IsContains(uchar const * pattern) const
 329 {
 330     TextPosition m = strlen((char *)pattern);
 331     if (m == 0)
 332         return true;
 333
 334     TextPosition sp = 0, ep = 0;
 335     // Just check if pattern exists somewhere
 336     ulong count = Search(pattern, m, &sp, &ep);
 337
 338     if (count > 0)
 339         return true;
 340     return false;
 341 }
 342
 343 bool CSA::IsLessThan(uchar const*) const
 344 {
 345     // TODO
 346     return false;
 347 }
 348
 349 /**
 350  * Counting queries
 351  */
 352 unsigned CSA::CountPrefix(uchar const * pattern) const
 353 {
 354     TextPosition m = strlen((char *)pattern);
 355     if (m == 0)
 356         return textLength.size();
 357
 358     TextPosition sp = 0, ep = 0;
 359     Search(pattern, m, &sp, &ep);
 360
 361     // Count end-markers in result interval
 362     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 363     it = endmarkers.lower_bound(sp);
 364     unsigned count = 0;
 365     while (it != endmarkers.end() && it->first <= ep)
 366     {
 367         ++ count;
 368         ++ it;
 369     }
 370
 371     return count;
 372 }
 373
 374 unsigned CSA::CountSuffix(uchar const * pattern) const
 375 {
 376     TextPosition m = strlen((char *)pattern);
 377     if (m == 0)
 378         return textLength.size();
 379
 380     TextPosition sp = 0, ep = 0;
 381     // Search with end-marker
 382     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 383
 384     return count;
 385 }
 386
 387 unsigned CSA::CountEqual(uchar const *pattern) const
 388 {
 389     TextPosition m = strlen((char const *)pattern);
 390     if (m == 0)
 391         return textLength.size();
 392
 393     TextPosition sp = 0, ep = 0;
 394     // Match including end-marker
 395     Search(pattern, m+1, &sp, &ep);
 396
 397     // Count end-markers in result interval
 398     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 399     it = endmarkers.lower_bound(sp);
 400     unsigned count = 0;
 401     while (it != endmarkers.end() && it->first <= ep)
 402     {
 403         ++ count;
 404         ++ it;
 405     }
 406
 407     return count;
 408 }
 409
 410 unsigned CSA::CountContains(uchar const * pattern) const
 411 {
 412     // Here counting is as slow as fetching the result set
 413     // because we would have to filter out occ's that fall within same document.
 414     TextCollection::document_result result = Contains(pattern);
 415     return result.size();
 416 }
 417
 418 unsigned CSA::CountLessThan(uchar const * pattern) const
 419 {
 420     // TODO
 421     return 0;
 422 }
 423
 424 /**
 425  * Document reporting queries
 426  */
 427 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 428 {
 429     TextPosition m = strlen((char *)pattern);
 430     if (m == 0)
 431         return TextCollection::document_result();
 432
 433     TextPosition sp = 0, ep = 0;
 434     Search(pattern, m, &sp, &ep);
 435
 436     TextCollection::document_result result;
 437
 438     // Report end-markers in result interval
 439     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 440     it = endmarkers.lower_bound(sp);
 441     unsigned count = 0;
 442     while (it != endmarkers.end() && it->first <= ep)
 443     {
 444         // End-marker we found belongs to the "previous" doc in the collection
 445         DocId docId = ((it->second).first + 1) % textLength.size();
 446         result.push_back(docId);
 447         ++ count;
 448         ++ it;
 449     }
 450
 451     return result;
 452 }
 453
 454 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 455 {
 456     TextPosition m = strlen((char *)pattern);
 457     if (m == 0)
 458         return TextCollection::document_result();
 459
 460     TextPosition sp = 0, ep = 0;
 461     // Search with end-marker
 462     Search(pattern, m+1, &sp, &ep);
 463
 464     TextCollection::document_result result;
 465
 466     // Check each occurrence
 467     for (; sp <= ep; ++sp)
 468     {
 469         TextPosition i = sp;
 470         uchar c = alphabetrank->charAtPos(i);
 471         while (c != '\0' && !sampled->IsBitSet(i))
 472         {
 473             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 474             c = alphabetrank->charAtPos(i);
 475         }
 476         if (c == '\0')
 477         {
 478             // map::operator[] is not const, using find() instead:
 479             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 480             // End-marker that we found belongs to the "previous" doc in collection:
 481             DocId docId = (endm.first + 1) % textLength.size();
 482             result.push_back(docId);
 483         }
 484         else
 485         {
 486             TextPosition textPos = suffixes[sampled->rank(i)-1];
 487             result.push_back(DocIdAtTextPos(textPos));
 488         }
 489     }
 490
 491     return result;
 492 }
 493
 494 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 495 {
 496     TextPosition m = strlen((char const *)pattern);
 497     if (m == 0)
 498         return TextCollection::document_result();
 499
 500     TextPosition sp = 0, ep = 0;
 501     // Match including end-marker
 502     Search(pattern, m+1, &sp, &ep);
 503
 504     TextCollection::document_result result;
 505
 506     // Report end-markers in result interval
 507     map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
 508     it = endmarkers.lower_bound(sp);
 509     unsigned count = 0;
 510     while (it != endmarkers.end() && it->first <= ep)
 511     {
 512         // End-marker that we found belongs to the "previous" doc in collection:
 513         DocId docId = ((it->second).first + 1) % textLength.size();
 514         result.push_back(docId);
 515         ++ count;
 516         ++ it;
 517     }
 518
 519     return result;
 520 }
 521
 522
 523 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 524 {
 525     TextPosition m = strlen((char *)pattern);
 526     if (m == 0)
 527         return TextCollection::document_result();
 528
 529     TextPosition sp = 0, ep = 0;
 530     // Search all occurrences
 531     Search(pattern, m, &sp, &ep);
 532
 533     // We want unique document indentifiers, using std::set to collect them
 534     std::set<DocId> resultSet;
 535
 536     // Check each occurrence
 537     for (; sp <= ep; ++sp)
 538     {
 539         TextPosition i = sp;
 540         uchar c = alphabetrank->charAtPos(i);
 541         while (c != '\0' && !sampled->IsBitSet(i))
 542         {
 543             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 544             c = alphabetrank->charAtPos(i);
 545         }
 546         if (c == '\0')
 547         {
 548             // map::operator[] is not const, using find() instead:
 549             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 550             // End-marker we found belongs to the "previous" doc in collection
 551             DocId docId = (endm.first + 1) % textLength.size();
 552             resultSet.insert(docId);
 553         }
 554         else
 555         {
 556             TextPosition textPos = suffixes[sampled->rank(i)-1];
 557             resultSet.insert(DocIdAtTextPos(textPos));
 558         }
 559     }
 560
 561     // Convert std::set to std::vector (TODO better way to construct result vector?)
 562     TextCollection::document_result result;
 563     for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
 564         result.push_back(*it);
 565     return result;
 566 }
 567
 568 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 569 {
 570     // TODO
 571     return document_result();
 572 }
 573
 574 /**
 575  * Full result set queries
 576  */
 577 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 578 {
 579     TextPosition m = strlen((char *)pattern);
 580     if (m == 0)
 581         return full_result();
 582
 583     TextPosition sp = 0, ep = 0;
 584     // Search all occurrences
 585     Search(pattern, m, &sp, &ep);
 586
 587     full_result result;
 588
 589     // Report each occurrence
 590     for (; sp <= ep; ++sp)
 591     {
 592         TextPosition i = sp;
 593         TextPosition dist = 0;
 594         uchar c = alphabetrank->charAtPos(i);
 595         while (c != '\0' && !sampled->IsBitSet(i))
 596         {
 597             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 598             c = alphabetrank->charAtPos(i);
 599             ++ dist;
 600         }
 601         if (c == '\0')
 602         {
 603             // map::operator[] is not const, using find() instead:
 604             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 605             // End-marker we found belongs to the "previous" doc in the collection:
 606             DocId docId = (endm.first+1)%textLength.size();
 607             result.push_back(make_pair(docId, dist));
 608         }
 609         else
 610         {
 611             TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
 612             // Read document identifier and its starting position in text order
 613             DocId docId = DocIdAtTextPos(textPos);
 614             result.push_back(make_pair(docId, textPos - textLength[docId].second));
 615         }
 616     }
 617
 618     return result;
 619 }
 620
 621
 622 /**
 623  * Finds document identifier for given text position
 624  *
 625  * Starting text position of the document is stored into second parameter.
 626  * Binary searching on text starting positions.
 627  */
 628 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
 629 {
 630     assert(i < n);
 631
 632     DocId a = 0;
 633     DocId b = textLength.size() - 1;
 634     while (a < b)
 635     {
 636         DocId c = a + (b - a)/2;
 637         if (textLength[c].second > i)
 638             b = c - 1;
 639         else if (textLength[c+1].second > i)
 640             return c;
 641         else
 642             a = c + 1;
 643     }
 644
 645     assert(a < (DocId)textLength.size());
 646     assert(i > textLength[a].second);
 647     assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
 648     return a;
 649 }
 650
 651 void CSA::Load(FILE *filename, unsigned samplerate)
 652 {
 653     // TODO
 654
 655 }
 656
 657 void CSA::Save(FILE *filename)
 658 {
 659     // TODO
 660 }
 661
 662
 663 /**
 664  * Rest of the functions follow...
 665  */
 666
 667 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 668 {
 669     uchar *result = new uchar[l + 1];
 670     if (l == 0)
 671     {
 672         result[0] = 0u;
 673         return result;
 674     }
 675
 676     TextPosition dist;
 677     TextPosition k = i + l - 1;
 678     // Check for end of the string
 679     if (k > n - 1)
 680     {
 681         l -= k - n + 1;
 682         k = n - 1;
 683     }
 684
 685     TextPosition skip = samplerate - k % samplerate - 1;
 686     TextPosition j;
 687     if (k / samplerate + 1 >= n / samplerate)
 688     {
 689         j = bwtEndPos;
 690         skip = n - k - 1;
 691     }
 692     else
 693     {
 694         j = positions[k/samplerate+1];
 695         //cout << samplerate << ' ' << j << '\n';
 696     }
 697
 698     for (dist = 0; dist < skip + l; dist++)
 699     {
 700         int c = alphabetrank->charAtPos(j);
 701         if (c == '\0')
 702         {
 703             // map::operator[] is not const, using find() instead:
 704             pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
 705             j = endm.first; // LF-mapping for end-marker
 706         }
 707         else
 708             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 709         if (dist >= skip)
 710             result[l + skip - dist - 1] = c;
 711     }
 712     result[l] = 0u;
 713     return result;
 714 }
 715
 716   /*TextPosition CSA::inverse(TextPosition i)
 717 {
 718     TextPosition skip = samplerate - i % samplerate;
 719     TextPosition j;
 720     if (i / samplerate + 1 >= n / samplerate)
 721     {
 722         j = bwtEndPos;
 723         skip = n - i;
 724     }
 725     else
 726     {
 727         j = positions[i/samplerate+1];
 728         //cout << samplerate << ' ' << j << '\n';
 729     }
 730
 731     while (skip > 0)
 732     {
 733         int c = alphabetrank->charAtPos(j);
 734         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 735         skip --;
 736     }
 737     return j;
 738     }*/
 739
 740 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
 741     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 742     int c = (int)pattern[m-1];
 743     TextPosition i=m-1;
 744     TextPosition sp = C[c];
 745     TextPosition ep = C[c+1]-1;
 746     while (sp<=ep && i>=1)
 747     {
 748 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 749         c = (int)pattern[--i];
 750         sp = C[c]+alphabetrank->rank(c,sp-1);
 751         ep = C[c]+alphabetrank->rank(c,ep)-1;
 752     }
 753     *spResult = sp;
 754     *epResult = ep;
 755     if (sp<=ep)
 756         return ep - sp + 1;
 757     else
 758         return 0;
 759  }
 760
 761    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 762 {
 763     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 764     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 765
 766     if (sp<=ep)
 767         return ep - sp + 1;
 768     else
 769         return 0;
 770         }*/
 771
 772 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
 773 {
 774     TextPosition dist=0;
 775     while (!sampled->IsBitSet(i))
 776     {
 777         int c = alphabetrank->charAtPos(i);
 778         if (c == '\0')
 779         {
 780             // map::operator[] is not const, using find() instead:
 781             pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
 782             return endm.second + dist; // returns text position.
 783         }
 784         i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 785         ++dist;
 786     }
 787
 788     return suffixes[sampled->rank(i)-1]+dist;
 789 }
 790
 791 CSA::~CSA() {
 792     delete alphabetrank;
 793     delete sampled;
 794     delete [] suffixes;
 795     delete [] positions;
 796     delete [] codetable;
 797 }
 798
 799
 800 void CSA::maketables()
 801 {
 802     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 803
 804     ulong *sampledpositions = new ulong[n/W+1];
 805     suffixes = new ulong[sampleLength];
 806     positions = new ulong[sampleLength];
 807
 808     ulong i,j=0;
 809     for (i=0;i<n/W+1;i++)
 810         sampledpositions[i]=0lu;
 811
 812     ulong x,p=bwtEndPos;
 813     DocId textId = textLength.size();
 814     ulong ulongmax = 0;
 815     ulongmax--;
 816
 817    //positions:
 818     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
 819       // i substitutes SA->GetPos(i)
 820         x=(i==n-1)?0:i+1;
 821
 822         if (x % samplerate == 0) {
 823             Tools::SetField(sampledpositions,1,p,1);
 824             positions[x/samplerate] = p;
 825         }
 826
 827         uchar c = alphabetrank->charAtPos(p);
 828         if (c == '\0')
 829         {
 830             // Record the order of end-markers in BWT:
 831             --textId;
 832             endmarkers[p] = make_pair(textId, (TextPosition)x);
 833             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
 834             p = textId; // Correct LF-mapping to the last char of the previous text.
 835         }
 836         else
 837             p = C[c]+alphabetrank->rank(c, p)-1;
 838     }
 839     assert(textId == 0);
 840     /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
 841     {
 842         printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
 843         }*/
 844
 845     sampled = new BitRank(sampledpositions,n,true);
 846
 847    //suffixes:
 848     for(i=0; i<sampleLength; i++) {
 849         j = sampled->rank(positions[i]);
 850         if (j==0) j=sampleLength;
 851         suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
 852     }
 853 }
 854
 855 uchar * CSA::LoadFromFile(const char *filename)
 856 {
 857     uchar *s;
 858     std::ifstream file (filename, std::ios::in|std::ios::binary);
 859     if (file.is_open())
 860     {
 861         std::cerr << "Loading CSA from file: " << filename << std::endl;
 862         file.read((char *)&bwtEndPos, sizeof(TextPosition));
 863         s = new uchar[n];
 864         for (ulong offset = 0; offset < n; offset ++)
 865             file.read((char *)(s + offset), sizeof(char));
 866         file.close();
 867     }
 868     else
 869     {
 870         std::cerr << "Unable to open file " << filename << std::endl;
 871         exit(1);
 872     }
 873     return s;
 874 }
 875
 876 void CSA::SaveToFile(const char *filename, uchar *bwt)
 877 {
 878     std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
 879     if (file.is_open())
 880     {
 881         std::cerr << "Writing CSA to file: " << filename << std::endl;
 882         file.write((char *)&bwtEndPos, sizeof(TextPosition));
 883         std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
 884         for (ulong offset = 0; offset < n; offset ++)
 885             file.write((char *)(bwt + offset), sizeof(char));
 886         file.close();
 887     }
 888     else
 889     {
 890         std::cerr << "Unable to open file " << filename << std::endl;
 891         exit(1);
 892     }
 893 }
 894
 895 /*uchar * CSA::BWT(uchar *text)
 896 {
 897     uchar *s;
 898
 899     DynFMI *wt = new DynFMI((uchar *) text, n);
 900     s = wt->getBWT();
 901     for (ulong i=0;i<n;i++)
 902         if (s[i]==0u) {
 903             bwtEndPos = i;  // TODO: better solution ?
 904             i=n;
 905         }
 906
 907     delete wt;
 908     return s;
 909     }*/
 910
 911 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
 912 {
 913     TCodeEntry *result = new TCodeEntry[ 256 ];
 914
 915     count_chars( text, n, result );
 916     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
 917 //
 918 // First I push all the leaf nodes into the queue
 919 //
 920     for ( unsigned int i = 0 ; i < 256 ; i++ )
 921         if ( result[ i ].count )
 922             q.push(node( i, result[ i ].count ) );
 923 //
 924 // This loop removes the two smallest nodes from the
 925 // queue.  It creates a new internal node that has
 926 // those two nodes as children. The new internal node
 927 // is then inserted into the priority queue.  When there
 928 // is only one node in the priority queue, the tree
 929 // is complete.
 930 //
 931
 932     while ( q.size() > 1 ) {
 933         node *child0 = new node( q.top() );
 934         q.pop();
 935         node *child1 = new node( q.top() );
 936         q.pop();
 937         q.push( node( child0, child1 ) );
 938     }
 939 //
 940 // Now I compute and return the codetable
 941 //
 942     q.top().maketable(0u,0u, result);
 943     q.pop();
 944     return result;
 945 }
 946
 947
 948 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
 949 {
 950     if ( child0 )
 951     {
 952         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
 953         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
 954         delete child0;
 955         delete child1;
 956     }
 957     else
 958     {
 959         codetable[value].code = code;
 960         codetable[value].bits = bits;
 961     }
 962 }
 963
 964 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
 965 {
 966     ulong i;
 967     for (i = 0 ; i < 256 ; i++ )
 968         counts[ i ].count = 0;
 969     for (i=0; i<n; i++)
 970         counts[(int)text[i]].count++;
 971 }
 972
 973 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
 974       return x | (bit << pos);
 975 }
 976