CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22
  23 // Conflicts with std::min and std::max
  24 #undef min
  25 #undef max
  26 #include "dcover/bwt.hpp"
  27
  28 #include "HeapProfiler.h" // FIXME remove
  29
  30 #include <iostream>
  31 #include <queue>
  32 #include <set>
  33 #include <vector>
  34 #include <utility>
  35 #include <stdexcept>
  36 #include <cassert>
  37 #include <cstring> // For strlen()
  38 using std::vector;
  39 using std::pair;
  40 using std::make_pair;
  41 using std::map;
  42
  43 using SXSI::TextCollection;
  44
  45 // Save file version info
  46 const uchar CSA::versionFlag = 2;
  47
  48 ////////////////////////////////////////////////////////////////////////////
  49 // Class CSA::THuffAlphabetRank
  50 // FIXME Unused code
  51 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  52     left = NULL;
  53     right = NULL;
  54     bitrank = NULL;
  55     ch = s[0];
  56     leaf = false;
  57     this->codetable = codetable;
  58
  59     bool *B = new bool[n];
  60     TextPosition sum=0,i;
  61     /*
  62     for (i=0; i< n; i++) {
  63        printf("%c:", (char)((int)s[i]-128));
  64        for (r=0;r<codetable[(int)s[i]].bits;r++)
  65           if (codetable[(int)s[i]].code & (1u <<r))
  66              printf("1");
  67           else printf("0");
  68        printf("\n");
  69     }
  70     printf("\n");
  71     if (level > 100) return;*/
  72     for (i=0;i<n;i++)
  73        if (codetable[(int)s[i]].code & (1u << level)) {
  74           B[i] = true;
  75           sum++;
  76        }
  77        else B[i] = false;
  78     if (sum==0 || sum==n) {
  79         delete [] B;
  80         leaf = true;
  81         return;
  82     }
  83     uchar *sfirst, *ssecond;
  84     //if (n-sum > 0)
  85         sfirst = new uchar[n-sum];
  86     //if (sum > 0)
  87         ssecond = new uchar[sum];
  88     unsigned j=0,k=0;
  89    for (i=0;i<n;i++)
  90         if (B[i]) ssecond[k++] = s[i];
  91         else sfirst[j++] = s[i];
  92     ulong *Binbits = new ulong[n/W+1];
  93     for (i=0;i<n;i++)
  94         Tools::SetField(Binbits,1,i,B[i]);
  95     delete [] B;
  96     bitrank = new BitRank(Binbits,n,true);
  97     //if (j > 0) {
  98         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  99         delete [] sfirst;
 100     //}
 101     //if (k>0) {
 102         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
 103         delete [] ssecond;
 104     //}
 105 }
 106
 107
 108 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 109     // testing that the code works correctly
 110     int C[256];
 111     unsigned i,j;
 112     bool correct=true;
 113     for (j=0;j<256;j++)
 114         C[j] = 0;
 115     for (i=0;i<n;i++) {
 116         C[(int)s[i]]++;
 117         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 118         correct = false;
 119         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 120         }
 121     }
 122     return correct;
 123 }
 124
 125 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 126     if (left!=NULL) delete left;
 127     if (right!=NULL) delete right;
 128     if (bitrank!=NULL)
 129         delete bitrank;
 130 }
 131
 132
 133 ////////////////////////////////////////////////////////////////////////////
 134 // Class CSA
 135
 136 /**
 137  * Constructor inits an empty dynamic FM-index.
 138  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 139  */
 140 CSA::CSA(unsigned samplerate)
 141     : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
 142       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 143       emptyTextRank(0)
 144 {
 145     this->samplerate = samplerate;
 146
 147 #ifdef CSA_TEST_BWT
 148     // FIXME TODO : DynFMI needs distribution of characters before hand
 149     // This will create fully balanced wavelet tree for all chars [0, 255].
 150     uchar temp[256];
 151     for (unsigned i = 0; i < 255; ++i)
 152         temp[i] = i+1;
 153     temp[255] = 0;
 154     dynFMI = new DynFMI(temp, 1, 255, false);
 155
 156     /* Debug code: take char distribution from data.txt.
 157     uchar *temp = Tools::GetFileContents("data.txt", 0);
 158     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 159     delete [] temp;*/
 160 #endif
 161 }
 162
 163 /**
 164  * Insert new text
 165  *
 166  * Given text must include end-marker.
 167  * Text identifiers are numbered starting from 0.
 168  */
 169 void CSA::InsertText(uchar const * text)
 170 {
 171     // Sanity check:
 172 #ifdef CSA_TEST_BWT
 173     assert(dynFMI != 0);
 174 #endif
 175
 176     TextPosition m = std::strlen((char *)text) + 1;
 177     if (m > maxTextLength)
 178         maxTextLength = m; // Store length of the longest text seen so far.
 179
 180     if (m > 1)
 181     {
 182         this->n += m;
 183         this->numberOfTexts ++;
 184         this->numberOfAllTexts ++;
 185 #ifdef CSA_TEST_BWT
 186         dynFMI->addText(text, m);
 187 #endif
 188
 189         texts.append((char *) text);
 190         texts.append(1, '\0');
 191     }
 192     else
 193     {
 194         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 195         this->numberOfAllTexts ++;
 196     }
 197 }
 198
 199 void CSA::MakeStatic()
 200 {
 201     // Sanity check:
 202 #ifdef CSA_TEST_BWT
 203     if (dynFMI == 0)
 204         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 205 #endif
 206
 207     // Bitvector of empty texts
 208     {
 209         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 210         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 211         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 212             tempB[i] = 0;
 213         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 214             Tools::SetField(tempB, 1, (*it), 1);
 215         emptyTextId.clear();
 216         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 217     }
 218
 219 /*    std::cout << "texts: ";
 220     for (ulong i = 0; i < n; i++)
 221         if (texts[i] > 50)
 222             std::cout << texts[i];
 223         else
 224             std::cout << (int)texts[i];
 225             std::cout << std::endl;*/
 226     uchar *bwt = new uchar[n];
 227     {
 228         // FIXME More succinct solution needed
 229         unsigned* maptexts = new unsigned[n+1];
 230         // Map text chars to [0..255+numberOfTexts]
 231         unsigned count = 0;
 232         for (ulong i = 0; i < n; ++i)
 233             if (texts[i] == 0)
 234                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
 235             else {
 236                 uchar c = (uchar)texts[i];
 237                 maptexts[i] = (unsigned)c + numberOfTexts + 1;
 238             }
 239
 240         maptexts[n] = '\0';
 241         texts.erase();
 242         texts.reserve(0); // Release capacity
 243         assert(count == numberOfTexts);
 244
 245         bwtEndPos = (ulong)compute_bwt(&maptexts[0], &maptexts[n],
 246                                        &bwt[0], numberOfTexts + 1);
 247
 248         bwt[--bwtEndPos] = '\0';
 249         delete [] maptexts;
 250     } // End of bw transform
 251
 252 #ifdef CSA_TEST_BWT
 253     {
 254         uchar *bwtTest = dynFMI->getBWT();
 255 /*        printf("123456789012345678901234567890123456789\n");
 256         for (TextPosition i = 0; i < n && i < 100; i ++)
 257             if (bwt[i] < 50)
 258                 printf("%d", (int)bwt[i]);
 259             else
 260                 printf("%c", bwt[i]);
 261         printf("\n");
 262         for (TextPosition i = 0; i < n && i < 100; i ++)
 263             if (bwtTest[i] < 50)
 264                 printf("%d", (int)bwtTest[i]);
 265             else
 266                 printf("%c", bwtTest[i]);
 267                 printf("\n");*/
 268
 269         // Sanity check
 270         assert(numberOfTexts == dynFMI->getCollectionSize());
 271
 272         delete dynFMI;
 273         dynFMI = 0;
 274         for (ulong i = 0; i < n; ++i)
 275             if (bwt[i] != bwtTest[i])
 276             {
 277                 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
 278                 assert(0);
 279             }
 280         delete [] bwtTest;
 281     }
 282 #endif // CSA_TEST_BWT
 283
 284     makewavelet(bwt); // Deletes bwt!
 285     bwt = 0;
 286
 287     // Make sampling tables
 288     maketables();
 289 }
 290
 291 bool CSA::EmptyText(DocId k) const
 292 {
 293     assert(k < (DocId)numberOfTexts);
 294     if (emptyTextRank->IsBitSet(k))
 295         return true;
 296     return false;
 297 }
 298
 299 uchar* CSA::GetText(DocId k) const
 300 {
 301     assert(k < (DocId)numberOfTexts);
 302
 303     ulong textRank = 0;
 304     if (emptyTextRank->IsBitSet(k, &textRank))
 305     {
 306         uchar* result = new uchar[1];
 307         result[0] = 0;
 308         return result;
 309     }
 310     // Map to non-empty text
 311     k -= textRank; //emptyTextRank->rank(k);
 312
 313     TextPosition i = k;
 314
 315     string result;
 316     // Reserve average string length to avoid reallocs
 317     result.reserve(n/numberOfTexts);
 318
 319     uchar c = alphabetrank->access(i);
 320     while (c != '\0')
 321     {
 322         result.push_back(c);
 323         i = C[c]+alphabetrank->rank(c,i)-1;
 324
 325         c = alphabetrank->access(i); // "next" char.
 326     }
 327
 328     // Convert to uchar (FIXME return string?)
 329     i = result.size();
 330     uchar* res = new uchar[i+1];
 331     res[i] = '\0';
 332     for (ulong j = 0; j < i; ++j)
 333         res[i-j-1] = result[j];
 334     return res;
 335 }
 336 /*
 337  * Not supported
 338 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 339 {
 340     assert(k < (DocId)numberOfTexts);
 341     assert(j < (*textLength)[k]);
 342     assert(i <= j);
 343
 344     ulong textRank = 0;
 345     if (emptyTextRank->IsBitSet(k, &textRank))
 346     {
 347         uchar* result = new uchar[1];
 348         result[0] = 0;
 349         return result;
 350     }
 351
 352     // Map to non-empty text
 353     k -= textRank; // emptyTextRank->rank(k);
 354
 355     // Start position of k'th text
 356     ulong start = (*textStartPos)[k];
 357
 358     return Substring(i + start, j-i+1);
 359     }*/
 360
 361 /******************************************************************
 362  * Existential queries
 363  */
 364 bool CSA::IsPrefix(uchar const * pattern) const
 365 {
 366     TextPosition m = strlen((char *)pattern);
 367     if (m == 0)
 368         return true;
 369
 370     TextPosition sp = 0, ep = 0;
 371     Search(pattern, m, &sp, &ep);
 372
 373     // Check for end-marker(s) in result interval
 374     if (CountEndmarkers(sp, ep))
 375         return true;
 376     return false;
 377 }
 378
 379 bool CSA::IsSuffix(uchar const *pattern) const
 380 {
 381     // Here counting is as fast as IsSuffix():
 382     if (CountSuffix(pattern) > 0)
 383         return true;
 384     return false;
 385 }
 386
 387 bool CSA::IsEqual(uchar const *pattern) const
 388 {
 389     TextPosition m = std::strlen((char *)pattern);
 390     if (m == 0)
 391     {
 392         if (numberOfAllTexts - numberOfTexts > 0u)
 393             return true; // Empty texts exists
 394         return false; // No empty texts exists
 395     }
 396
 397     TextPosition sp = 0, ep = 0;
 398     // Match including end-marker
 399     Search(pattern, m+1, &sp, &ep);
 400
 401     // Check for end-marker(s) in result interval
 402     if (CountEndmarkers(sp, ep))
 403         return true;
 404     return false;
 405 }
 406
 407 bool CSA::IsContains(uchar const * pattern) const
 408 {
 409     TextPosition m = strlen((char *)pattern);
 410     if (m == 0)
 411         return true;
 412
 413     TextPosition sp = 0, ep = 0;
 414     // Just check if pattern exists somewhere
 415     ulong count = Search(pattern, m, &sp, &ep);
 416
 417     if (count > 0)
 418         return true;
 419     return false;
 420 }
 421
 422 bool CSA::IsLessThan(uchar const*) const
 423 {
 424     // TODO
 425     assert(0);
 426     return false;
 427 }
 428
 429 /******************************************************************
 430  * Counting queries
 431  */
 432 ulong CSA::Count(uchar const * pattern) const
 433 {
 434     TextPosition m = strlen((char *)pattern);
 435     if (m == 0)
 436         return 0;
 437
 438     TextPosition sp = 0, ep = 0;
 439     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 440     return count;
 441 }
 442
 443 unsigned CSA::CountPrefix(uchar const * pattern) const
 444 {
 445     TextPosition m = strlen((char *)pattern);
 446     if (m == 0)
 447         return numberOfAllTexts;
 448
 449     TextPosition sp = 0, ep = 0;
 450     Search(pattern, m, &sp, &ep);
 451
 452     // Count end-markers in result interval
 453     return CountEndmarkers(sp, ep);
 454 }
 455
 456 unsigned CSA::CountSuffix(uchar const * pattern) const
 457 {
 458     TextPosition m = strlen((char *)pattern);
 459     if (m == 0)
 460         return numberOfAllTexts;
 461
 462     TextPosition sp = 0, ep = 0;
 463     // Search with end-marker
 464     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 465
 466     return count;
 467 }
 468
 469 unsigned CSA::CountEqual(uchar const *pattern) const
 470 {
 471     TextPosition m = strlen((char const *)pattern);
 472     if (m == 0)
 473         return numberOfAllTexts - numberOfTexts; // Empty texts.
 474
 475     TextPosition sp = 0, ep = 0;
 476     // Match including end-marker
 477     Search(pattern, m+1, &sp, &ep);
 478
 479     // Count end-markers in result interval
 480     return CountEndmarkers(sp, ep);
 481 }
 482
 483 unsigned CSA::CountContains(uchar const * pattern) const
 484 {
 485     TextPosition m = strlen((char const *)pattern);
 486     if (m == 0)
 487         return numberOfAllTexts; // Total number of texts.
 488
 489     // Here counting is as slow as fetching the result set
 490     // because we would have to filter out occ's that fall within same document.
 491     TextCollection::document_result result = Contains(pattern);
 492     return result.size();
 493 }
 494
 495 unsigned CSA::CountLessThan(uchar const * pattern) const
 496 {
 497     // TODO
 498     assert(0);
 499     return 0;
 500 }
 501
 502 /**
 503  * Document reporting queries
 504  */
 505 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 506 {
 507     TextPosition m = strlen((char *)pattern);
 508     if (m == 0)
 509         return TextCollection::document_result(); // FIXME Should return all 1...k
 510
 511     TextPosition sp = 0, ep = 0;
 512     Search(pattern, m, &sp, &ep);
 513
 514     TextCollection::document_result result;
 515
 516     // Report end-markers in result interval
 517     unsigned resultSize = CountEndmarkers(sp, ep);
 518     if (resultSize == 0)
 519         return result;
 520
 521     result.reserve(resultSize); // Try to avoid reallocation.
 522
 523     // Iterate through end-markers in [sp,ep]:
 524     unsigned i = 0;
 525     if (sp > 0)
 526         i = alphabetrank->rank(0, sp - 1);
 527     while (resultSize)
 528     {
 529         // End-marker we found belongs to the "preceeding" doc in the collection
 530         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 531         // Map to doc ID:
 532         docId = emptyTextRank->select0(docId+1);
 533         result.push_back(docId);
 534
 535         -- resultSize;
 536         ++ i;
 537     }
 538
 539     return result;
 540 }
 541
 542 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 543 {
 544     TextPosition m = strlen((char *)pattern);
 545     if (m == 0)
 546         return TextCollection::document_result(); // FIXME Should return all 1...k
 547
 548     TextPosition sp = 0, ep = 0;
 549     // Search with end-marker
 550     Search(pattern, m+1, &sp, &ep);
 551
 552     TextCollection::document_result result;
 553     result.reserve(ep-sp+1); // Try to avoid reallocation.
 554
 555     ulong sampled_rank_i = 0;
 556     // Check each occurrence
 557     for (; sp <= ep; ++sp)
 558     {
 559         TextPosition i = sp;
 560
 561         uchar c = alphabetrank->access(i);
 562         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 563         {
 564             i = C[c]+alphabetrank->rank(c,i)-1;
 565             c = alphabetrank->access(i);
 566         }
 567         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 568
 569         if (c == '\0')
 570         {
 571             // Rank among the end-markers in BWT
 572             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 573
 574             // End-marker that we found belongs to the "preceeding" doc in collection:
 575             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 576             // Map to doc ID:
 577             docId = emptyTextRank->select0(docId+1);
 578             result.push_back(docId);
 579         }
 580         else // Sampled position
 581         {
 582             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 583             // Map to doc ID:
 584             docId = emptyTextRank->select0(docId+1);
 585             result.push_back(docId);
 586         }
 587     }
 588
 589     return result;
 590 }
 591
 592 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 593 {
 594     TextPosition m = strlen((char const *)pattern);
 595     if (m == 0)
 596         return TextCollection::document_result(); // FIXME Should return all empty texts
 597
 598     TextPosition sp = 0, ep = 0;
 599     // Match including end-marker
 600     Search(pattern, m+1, &sp, &ep);
 601
 602     TextCollection::document_result result;
 603
 604     // Report end-markers in result interval
 605     unsigned resultSize = CountEndmarkers(sp, ep);
 606     if (resultSize == 0)
 607         return result;
 608
 609     result.reserve(resultSize); // Try to avoid reallocation.
 610
 611     unsigned i = 0;
 612     if (sp > 0)
 613         i = alphabetrank->rank(0, sp - 1);
 614     while (resultSize)
 615     {
 616         // End-marker we found belongs to the "previous" doc in the collection
 617         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 618         // Map to doc ID:
 619         docId = emptyTextRank->select0(docId+1);
 620         result.push_back(docId);
 621
 622         -- resultSize;
 623         ++ i;
 624     }
 625
 626     return result;
 627 }
 628
 629
 630 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 631 {
 632     TextPosition m = strlen((char *)pattern);
 633     if (m == 0)
 634         return TextCollection::document_result();
 635
 636     TextPosition sp = 0, ep = 0;
 637     // Search all occurrences
 638     Search(pattern, m, &sp, &ep);
 639
 640     // We want unique document indentifiers, using std::set to collect them
 641     std::set<DocId> resultSet;
 642
 643     ulong sampled_rank_i = 0;
 644     // Check each occurrence
 645     for (; sp <= ep; ++sp)
 646     {
 647         TextPosition i = sp;
 648         uchar c = alphabetrank->access(i);
 649         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 650         {
 651             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 652             c = alphabetrank->access(i);
 653         }
 654         if (c == '\0')
 655         {
 656             // Rank among the end-markers in BWT
 657             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 658
 659             // End-marker that we found belongs to the "preceeding" doc in collection:
 660             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 661             resultSet.insert(docId);
 662         }
 663         else
 664         {
 665             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 666             assert((unsigned)di < numberOfTexts);
 667             resultSet.insert(di);
 668         }
 669     }
 670
 671     // Convert std::set to std::vector
 672     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 673     // Map to doc ID's
 674     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 675         *it = emptyTextRank->select0(*it+1);
 676     return result;
 677 }
 678
 679 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 680 {
 681     // TODO
 682     assert(0);
 683     return document_result();
 684 }
 685
 686 /**
 687  * Full result set queries
 688  */
 689 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 690 {
 691     TextPosition m = strlen((char *)pattern);
 692     if (m == 0)
 693         return full_result(); // FIXME Throw exception?
 694
 695     TextPosition sp = 0, ep = 0;
 696     // Search all occurrences
 697     Search(pattern, m, &sp, &ep);
 698
 699     full_result result;
 700     result.reserve(ep-sp+1); // Try to avoid reallocation.
 701
 702     ulong sampled_rank_i = 0;
 703     // Report each occurrence
 704     for (; sp <= ep; ++sp)
 705     {
 706         TextPosition i = sp;
 707         TextPosition dist = 0;
 708         uchar c = alphabetrank->access(i);
 709         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 710         {
 711             i = C[c]+alphabetrank->rank(c,i)-1;
 712             c = alphabetrank->access(i);
 713             ++ dist;
 714         }
 715         if (c == '\0')
 716         {
 717             // Rank among the end-markers in BWT
 718             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 719
 720             // End-marker that we found belongs to the "preceeding" doc in collection:
 721             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 722             // Map to doc ID:
 723             docId = emptyTextRank->select0(docId+1);
 724             result.push_back(make_pair(docId, dist));
 725         }
 726         else
 727         {
 728             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 729             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 730 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 731
 732             // Map to doc ID:
 733             docId = emptyTextRank->select0(docId+1);
 734             result.push_back(make_pair(docId, textPos));
 735         }
 736     }
 737
 738     return result;
 739 }
 740
 741
 742 /**
 743  * Save index to a file handle
 744  *
 745  * Throws a std::runtime_error exception on i/o error.
 746  * First byte that is saved represents the version number of the save file.
 747  * In version 2 files, the data fields are:
 748  *  uchar versionFlag;
 749     TextPosition n;
 750     unsigned samplerate;
 751     unsigned C[256];
 752     TextPosition bwtEndPos;
 753     static_sequence * alphabetrank;
 754     BSGAP *sampled;
 755     BlockArray *suffixes;
 756     BlockArray *suffixDocId;
 757     unsigned numberOfTexts;
 758     unsigned numberOfAllTexts;
 759     ulong maxTextLength;
 760     BlockArray *endmarkerDocId;
 761     BSGAP *emptyTextRank;
 762  */
 763 void CSA::Save(FILE *file) const
 764 {
 765     // Saving version info:
 766     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 767         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 768
 769     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 770         throw std::runtime_error("CSA::Save(): file write error (n).");
 771     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 772         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 773
 774     for(ulong i = 0; i < 256; ++i)
 775         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 776             throw std::runtime_error("CSA::Save(): file write error (C table).");
 777
 778     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 779         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 780
 781     alphabetrank->save(file);
 782     sampled->Save(file);
 783     suffixes->Save(file);
 784     suffixDocId->Save(file);
 785
 786     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 787         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
 788     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 789         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
 790     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 791         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 792
 793     endmarkerDocId->Save(file);
 794     emptyTextRank->Save(file);
 795     fflush(file);
 796 }
 797
 798
 799 /**
 800  * Load index from a file handle
 801  *
 802  * Throws a std::runtime_error exception on i/o error.
 803  * For more info, see CSA::Save().
 804  */
 805 void CSA::Load(FILE *file, unsigned samplerate)
 806 {
 807     // Delete everything
 808 #ifdef CSA_TEST_BWT
 809     delete dynFMI;       dynFMI = 0;
 810 #endif
 811     delete alphabetrank; alphabetrank = 0;
 812     delete sampled;      sampled = 0;
 813     delete suffixes;     suffixes = 0;
 814     delete suffixDocId;  suffixDocId = 0;
 815     delete [] codetable; codetable = 0;
 816
 817     delete endmarkerDocId; endmarkerDocId = 0;
 818     delete emptyTextRank;  emptyTextRank = 0;
 819
 820     this->maxTextLength = 0;
 821     this->numberOfTexts = 0;
 822     this->numberOfAllTexts = 0;
 823     this->samplerate = samplerate;
 824     this->n = 0;
 825
 826     uchar verFlag = 0;
 827     if (std::fread(&verFlag, 1, 1, file) != 1)
 828         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 829     if (verFlag != CSA::versionFlag)
 830         throw std::runtime_error("CSA::Load(): invalid save file version.");
 831
 832     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 833         throw std::runtime_error("CSA::Load(): file read error (n).");
 834     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 835         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 836 // FIXME samplerate can not be changed during load.
 837 //    if (this->samplerate == 0)
 838 //        this->samplerate = samplerate;
 839
 840     for(ulong i = 0; i < 256; ++i)
 841         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 842             throw std::runtime_error("CSA::Load(): file read error (C table).");
 843
 844     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 845         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 846
 847     alphabetrank = static_sequence::load(file);
 848     sampled = new BSGAP(file);
 849     suffixes = new BlockArray(file);
 850     suffixDocId = new BlockArray(file);
 851
 852     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 853         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
 854     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 855         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
 856     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 857         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 858
 859     endmarkerDocId = new BlockArray(file);
 860     emptyTextRank = new BSGAP(file);
 861
 862     // FIXME Construct data structures with new samplerate
 863     //maketables();
 864 }
 865
 866
 867
 868 /**
 869  * Rest of the functions follow...
 870  */
 871
 872
 873 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 874 {
 875     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 876     int c = (int)pattern[m-1];
 877     TextPosition i=m-1;
 878     TextPosition sp = C[c];
 879     TextPosition ep = C[c+1]-1;
 880     while (sp<=ep && i>=1)
 881     {
 882 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 883         c = (int)pattern[--i];
 884         sp = C[c]+alphabetrank->rank(c,sp-1);
 885         ep = C[c]+alphabetrank->rank(c,ep)-1;
 886     }
 887     *spResult = sp;
 888     *epResult = ep;
 889     if (sp<=ep)
 890         return ep - sp + 1;
 891     else
 892         return 0;
 893 }
 894
 895 CSA::~CSA() {
 896 #ifdef CSA_TEST_BWT
 897     delete dynFMI;
 898 #endif
 899     delete alphabetrank;
 900     delete sampled;
 901     delete suffixes;
 902     delete suffixDocId;
 903     delete [] codetable; // FIXME remove
 904
 905     delete endmarkerDocId;
 906     delete emptyTextRank;
 907 }
 908
 909 void CSA::makewavelet(uchar *bwt)
 910 {
 911     ulong i, min = 0,
 912              max;
 913     for (i=0;i<256;i++)
 914         C[i]=0;
 915     for (i=0;i<n;++i)
 916         C[(int)bwt[i]]++;
 917     for (i=0;i<256;i++)
 918         if (C[i]>0) {min = i; break;}
 919     for (i=255;i>=min;--i)
 920         if (C[i]>0) {max = i; break;}
 921
 922     ulong prev=C[0], temp;
 923     C[0]=0;
 924     for (i=1;i<256;i++) {
 925         temp = C[i];
 926         C[i]=C[i-1]+prev;
 927         prev = temp;
 928     }
 929 //    this->codetable = node::makecodetable(bwt,n);
 930 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 931 //    delete [] bwt;
 932     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
 933
 934     // FIXME: static_sequence_wvtree accepts only uint arrays
 935     uint *text = new uint[n];
 936     for (i = 0; i < n; ++i) // Silly
 937         text[i] = bwt[i];
 938     delete [] bwt;
 939     bwt = 0;
 940
 941     alphabet_mapper * am = new alphabet_mapper_none();
 942     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
 943     wt_coder * wtc = new wt_coder_huff(text,n,am);
 944     alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
 945     delete bmb;
 946     delete [] text;
 947     text = 0;
 948 }
 949
 950 void CSA::maketables()
 951 {
 952     // Calculate BWT end-marker position (of last inserted text)
 953     // Note: bwtEndPos already set!
 954     // and the length of the first text (counter l):
 955 #ifdef CSA_TEST_BWT
 956     ulong i = 0;
 957     ulong l = 1;
 958
 959     uchar c  = alphabetrank->access(i);
 960     while (c != '\0')
 961     {
 962         i = C[c]+alphabetrank->rank(c, i)-1;
 963         ++l;
 964         c = alphabetrank->access(i);
 965     }
 966     assert(i == bwtEndPos); // compare result
 967 #endif
 968
 969     // Build up arrays for text length and starting positions
 970     // FIXME Temp, remove
 971     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
 972     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 973     //(*textLength)[0] = l;
 974     (*textStartPos)[0] = 0;
 975
 976     // Construct samples
 977     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 978     unsigned ceilLog2n = Tools::CeilLog2(n);
 979     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
 980     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
 981
 982     // Mapping from end-markers to doc ID's:
 983     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
 984
 985     ulong *sampledpositions = new ulong[n/W+1];
 986     for (ulong i=0;i<n/W+1;i++)
 987         sampledpositions[i]=0lu;
 988
 989     ulong x,p=bwtEndPos;
 990     ulong sampleCount = 0;
 991     // Keeping track of text position of prev. end-marker seen
 992     ulong posOfSuccEndmarker = n;
 993     DocId textId = numberOfTexts;
 994     ulong ulongmax = 0;
 995     ulongmax--;
 996
 997     //positions:
 998     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
 999       // i substitutes SA->GetPos(i)
1000         x=(i==n-1)?0:i+1;
1001
1002         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1003             Tools::SetField(sampledpositions,1,p,1);
1004             (*positions)[sampleCount] = p;
1005             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1006             sampleCount ++;
1007         }
1008
1009         uchar c = alphabetrank->access(p);
1010         if (c == '\0')
1011         {
1012             --textId;
1013
1014             // Record the order of end-markers in BWT:
1015             ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1016             (*endmarkerDocId)[endmarkerRank] = textId;
1017
1018             // Store text length and text start position:
1019             if (textId < (DocId)numberOfTexts - 1)
1020             {
1021                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1022                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1023                 posOfSuccEndmarker = x;
1024             }
1025
1026             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1027             p = textId; // Correct LF-mapping to the last char of the previous text.
1028         }
1029         else
1030             p = C[c]+alphabetrank->rank(c, p)-1;
1031     }
1032     assert(textId == 0);
1033
1034     sampled = new BSGAP(sampledpositions,n,true);
1035     sampleLength = sampled->rank(n-1);
1036     assert(sampleCount == sampleLength);
1037
1038     // Suffixes store an offset from the text start position
1039     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1040     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1041
1042     for(ulong i=0; i<sampleLength; i++) {
1043         assert((*positions)[i] < n);
1044         ulong j = sampled->rank((*positions)[i]);
1045         if (j==0) j=sampleLength;
1046         TextPosition textPos = (*tmpSuffix)[i];
1047         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1048
1049         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1050         assert((*suffixDocId)[j-1] < numberOfTexts);
1051         // calculate offset from text start:
1052         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1053     }
1054     // FIXME Temp, remove
1055     delete tmpSuffix;
1056     delete positions;
1057 //    delete textLength;
1058     delete textStartPos;
1059 }
1060
1061
1062 /**
1063  * Finds document identifier for given text position
1064  *
1065  * Starting text position of the document is stored into second parameter.
1066  * Binary searching on text starting positions.
1067  */
1068 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1069 {
1070     assert(i < n);
1071
1072     DocId a = 0;
1073     DocId b = numberOfTexts - 1;
1074     while (a < b)
1075     {
1076         DocId c = a + (b - a)/2;
1077         if ((*textStartPos)[c] > i)
1078             b = c - 1;
1079         else if ((*textStartPos)[c+1] > i)
1080             return c;
1081         else
1082             a = c + 1;
1083     }
1084
1085     assert(a < (DocId)numberOfTexts);
1086     assert(i >= (*textStartPos)[a]);
1087     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1088     return a;
1089 }
1090
1091 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1092 {
1093     TCodeEntry *result = new TCodeEntry[ 256 ];
1094
1095     count_chars( text, n, result );
1096     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1097 //
1098 // First I push all the leaf nodes into the queue
1099 //
1100     for ( unsigned int i = 0 ; i < 256 ; i++ )
1101         if ( result[ i ].count )
1102             q.push(node( i, result[ i ].count ) );
1103 //
1104 // This loop removes the two smallest nodes from the
1105 // queue.  It creates a new internal node that has
1106 // those two nodes as children. The new internal node
1107 // is then inserted into the priority queue.  When there
1108 // is only one node in the priority queue, the tree
1109 // is complete.
1110 //
1111
1112     while ( q.size() > 1 ) {
1113         node *child0 = new node( q.top() );
1114         q.pop();
1115         node *child1 = new node( q.top() );
1116         q.pop();
1117         q.push( node( child0, child1 ) );
1118     }
1119 //
1120 // Now I compute and return the codetable
1121 //
1122     q.top().maketable(0u,0u, result);
1123     q.pop();
1124     return result;
1125 }
1126
1127
1128 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1129 {
1130     if ( child0 )
1131     {
1132         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1133         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1134         delete child0;
1135         delete child1;
1136     }
1137     else
1138     {
1139         codetable[value].code = code;
1140         codetable[value].bits = bits;
1141     }
1142 }
1143
1144 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1145 {
1146     ulong i;
1147     for (i = 0 ; i < 256 ; i++ )
1148         counts[ i ].count = 0;
1149     for (i=0; i<n; i++)
1150         counts[(int)text[i]].count++;
1151 }
1152
1153 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1154       return x | (bit << pos);
1155 }
1156