CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22
  23 // Conflicts with std::min and std::max
  24 #undef min
  25 #undef max
  26 #include "dcover/bwt.hpp"
  27
  28 #include "HeapProfiler.h" // FIXME remove
  29
  30 #include <iostream>
  31 #include <queue>
  32 #include <set>
  33 #include <vector>
  34 #include <utility>
  35 #include <stdexcept>
  36 #include <cassert>
  37 #include <cstring> // For strlen()
  38 using std::vector;
  39 using std::pair;
  40 using std::make_pair;
  41 using std::map;
  42
  43 namespace SXSI
  44 {
  45
  46 // Save file version info
  47 const uchar CSA::versionFlag = 2;
  48
  49 ////////////////////////////////////////////////////////////////////////////
  50 // Class CSA::THuffAlphabetRank
  51 // FIXME Unused code
  52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  53     left = NULL;
  54     right = NULL;
  55     bitrank = NULL;
  56     ch = s[0];
  57     leaf = false;
  58     this->codetable = codetable;
  59
  60     bool *B = new bool[n];
  61     TextPosition sum=0,i;
  62     /*
  63     for (i=0; i< n; i++) {
  64        printf("%c:", (char)((int)s[i]-128));
  65        for (r=0;r<codetable[(int)s[i]].bits;r++)
  66           if (codetable[(int)s[i]].code & (1u <<r))
  67              printf("1");
  68           else printf("0");
  69        printf("\n");
  70     }
  71     printf("\n");
  72     if (level > 100) return;*/
  73     for (i=0;i<n;i++)
  74        if (codetable[(int)s[i]].code & (1u << level)) {
  75           B[i] = true;
  76           sum++;
  77        }
  78        else B[i] = false;
  79     if (sum==0 || sum==n) {
  80         delete [] B;
  81         leaf = true;
  82         return;
  83     }
  84     uchar *sfirst, *ssecond;
  85     //if (n-sum > 0)
  86         sfirst = new uchar[n-sum];
  87     //if (sum > 0)
  88         ssecond = new uchar[sum];
  89     unsigned j=0,k=0;
  90    for (i=0;i<n;i++)
  91         if (B[i]) ssecond[k++] = s[i];
  92         else sfirst[j++] = s[i];
  93     ulong *Binbits = new ulong[n/W+1];
  94     for (i=0;i<n;i++)
  95         Tools::SetField(Binbits,1,i,B[i]);
  96     delete [] B;
  97     bitrank = new BitRank(Binbits,n,true);
  98     //if (j > 0) {
  99         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
 100         delete [] sfirst;
 101     //}
 102     //if (k>0) {
 103         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
 104         delete [] ssecond;
 105     //}
 106 }
 107
 108
 109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 110     // testing that the code works correctly
 111     int C[256];
 112     unsigned i,j;
 113     bool correct=true;
 114     for (j=0;j<256;j++)
 115         C[j] = 0;
 116     for (i=0;i<n;i++) {
 117         C[(int)s[i]]++;
 118         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 119         correct = false;
 120         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 121         }
 122     }
 123     return correct;
 124 }
 125
 126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 127     if (left!=NULL) delete left;
 128     if (right!=NULL) delete right;
 129     if (bitrank!=NULL)
 130         delete bitrank;
 131 }
 132
 133
 134 ////////////////////////////////////////////////////////////////////////////
 135 // Class CSA
 136
 137 /**
 138  * Constructor inits an empty dynamic FM-index.
 139  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 140  */
 141 CSA::CSA(unsigned samplerate)
 142     : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
 143       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 144       emptyTextRank(0)
 145 {
 146     this->samplerate = samplerate;
 147
 148 #ifdef CSA_TEST_BWT
 149     // FIXME TODO : DynFMI needs distribution of characters before hand
 150     // This will create fully balanced wavelet tree for all chars [0, 255].
 151     uchar temp[256];
 152     for (unsigned i = 0; i < 255; ++i)
 153         temp[i] = i+1;
 154     temp[255] = 0;
 155     dynFMI = new DynFMI(temp, 1, 255, false);
 156
 157     /* Debug code: take char distribution from data.txt.
 158     uchar *temp = Tools::GetFileContents("data.txt", 0);
 159     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 160     delete [] temp;*/
 161 #endif
 162 }
 163
 164 /**
 165  * Insert new text
 166  *
 167  * Given text must include end-marker.
 168  * Text identifiers are numbered starting from 0.
 169  */
 170 void CSA::InsertText(uchar const * text)
 171 {
 172     // Sanity check:
 173 #ifdef CSA_TEST_BWT
 174     assert(dynFMI != 0);
 175 #endif
 176
 177     TextPosition m = std::strlen((char *)text) + 1;
 178     if (m > maxTextLength)
 179         maxTextLength = m; // Store length of the longest text seen so far.
 180
 181     if (m > 1)
 182     {
 183         this->n += m;
 184         this->numberOfTexts ++;
 185         this->numberOfAllTexts ++;
 186 #ifdef CSA_TEST_BWT
 187         dynFMI->addText(text, m);
 188 #endif
 189
 190         texts.append((char *) text);
 191         texts.append(1, '\0');
 192     }
 193     else
 194     {
 195         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 196         this->numberOfAllTexts ++;
 197     }
 198 }
 199
 200 void CSA::MakeStatic()
 201 {
 202     // Sanity check:
 203 #ifdef CSA_TEST_BWT
 204     if (dynFMI == 0)
 205         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 206 #endif
 207
 208     // Bitvector of empty texts, FIXME Remove?
 209     {
 210         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 211         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 212         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 213             tempB[i] = 0;
 214         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 215             Tools::SetField(tempB, 1, (*it), 1);
 216         emptyTextId.clear();
 217         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 218     }
 219
 220     uchar *bwt = new uchar[n];
 221     {
 222         // More succinct solution via StringIterator, see below.
 223 /*        unsigned* maptexts = new unsigned[n+1];
 224         // Map text chars to [0..255+numberOfTexts]
 225         unsigned count = 0;
 226         for (ulong i = 0; i < n; ++i)
 227             if (texts[i] == 0)
 228                 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
 229             else {
 230                 uchar c = (uchar)texts[i];
 231                 maptexts[i] = (unsigned)c + numberOfTexts;
 232             }
 233         maptexts[n] = '\0';
 234         assert(count == numberOfTexts);
 235
 236         std::cout << "maptext: ";
 237         for (ulong i = 0; i <= n; ++i)
 238             std::cout << maptexts[i] << ", ";
 239             std::cout << std::endl;*/
 240
 241         // Mark endmarker positions into bitvector
 242         ulong * endmarkers = new ulong[n/W+1];
 243         for (ulong i = 0; i < n/W+1; ++i)
 244             endmarkers[i] = 0;
 245         for (ulong i = 0; i < n; ++i)
 246             if (texts[i] == 0)
 247                 Tools::SetField(endmarkers, 1, i, 1);
 248         BitRank* br = new BitRank(endmarkers, n, true);
 249         assert(numberOfTexts == br->rank(n-1));
 250
 251         // Build iterators, FIXME clean up iterator construction.
 252         StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
 253         StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
 254
 255         bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
 256                                        &bwt[0], numberOfTexts);
 257
 258         bwt[--bwtEndPos] = '\0';
 259         texts.erase();
 260         texts.reserve(0); // Release capacity
 261         delete br; // bitvector of endmarkers
 262     } // End of bw transform
 263
 264 #ifdef CSA_TEST_BWT
 265     {
 266         uchar *bwtTest = dynFMI->getBWT();
 267         printf("123456789012345678901234567890123456789\n");
 268         for (TextPosition i = 0; i < n && i < 100; i ++)
 269             if (bwt[i] < 50)
 270                 printf("%d", (int)bwt[i]);
 271             else
 272                 printf("%c", bwt[i]);
 273         printf("\n");
 274         for (TextPosition i = 0; i < n && i < 100; i ++)
 275             if (bwtTest[i] < 50)
 276                 printf("%d", (int)bwtTest[i]);
 277             else
 278                 printf("%c", bwtTest[i]);
 279                 printf("\n");
 280
 281         // Sanity check
 282         assert(numberOfTexts == dynFMI->getCollectionSize());
 283
 284         delete dynFMI;
 285         dynFMI = 0;
 286         for (ulong i = 0; i < n; ++i)
 287             if (bwt[i] != bwtTest[i])
 288             {
 289                 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
 290                 assert(0);
 291             }
 292         delete [] bwtTest;
 293     }
 294 #endif // CSA_TEST_BWT
 295
 296     makewavelet(bwt); // Deletes bwt!
 297     bwt = 0;
 298
 299     // Make sampling tables
 300     maketables();
 301 }
 302
 303 bool CSA::EmptyText(DocId k) const
 304 {
 305     assert(k < (DocId)numberOfTexts);
 306     if (emptyTextRank->IsBitSet(k))
 307         return true;
 308     return false;
 309 }
 310
 311 uchar* CSA::GetText(DocId k) const
 312 {
 313     assert(k < (DocId)numberOfTexts);
 314
 315     ulong textRank = 0;
 316     if (emptyTextRank->IsBitSet(k, &textRank))
 317     {
 318         uchar* result = new uchar[1];
 319         result[0] = 0;
 320         return result;
 321     }
 322     // Map to non-empty text
 323     k -= textRank; //emptyTextRank->rank(k);
 324
 325     TextPosition i = k;
 326
 327     string result;
 328     // Reserve average string length to avoid reallocs
 329     result.reserve(n/numberOfTexts);
 330
 331     uchar c = alphabetrank->access(i);
 332     while (c != '\0')
 333     {
 334         result.push_back(c);
 335         i = C[c]+alphabetrank->rank(c,i)-1;
 336
 337         c = alphabetrank->access(i); // "next" char.
 338     }
 339
 340     // Convert to uchar (FIXME return string?)
 341     i = result.size();
 342     uchar* res = new uchar[i+1];
 343     res[i] = '\0';
 344     for (ulong j = 0; j < i; ++j)
 345         res[i-j-1] = result[j];
 346     return res;
 347 }
 348 /*
 349  * Not supported
 350 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 351 {
 352     assert(k < (DocId)numberOfTexts);
 353     assert(j < (*textLength)[k]);
 354     assert(i <= j);
 355
 356     ulong textRank = 0;
 357     if (emptyTextRank->IsBitSet(k, &textRank))
 358     {
 359         uchar* result = new uchar[1];
 360         result[0] = 0;
 361         return result;
 362     }
 363
 364     // Map to non-empty text
 365     k -= textRank; // emptyTextRank->rank(k);
 366
 367     // Start position of k'th text
 368     ulong start = (*textStartPos)[k];
 369
 370     return Substring(i + start, j-i+1);
 371     }*/
 372
 373 /******************************************************************
 374  * Existential queries
 375  */
 376 bool CSA::IsPrefix(uchar const * pattern) const
 377 {
 378     TextPosition m = strlen((char *)pattern);
 379     if (m == 0)
 380         return true;
 381
 382     TextPosition sp = 0, ep = 0;
 383     Search(pattern, m, &sp, &ep);
 384
 385     // Check for end-marker(s) in result interval
 386     if (CountEndmarkers(sp, ep))
 387         return true;
 388     return false;
 389 }
 390
 391 bool CSA::IsSuffix(uchar const *pattern) const
 392 {
 393     // Here counting is as fast as IsSuffix():
 394     if (CountSuffix(pattern) > 0)
 395         return true;
 396     return false;
 397 }
 398
 399 bool CSA::IsEqual(uchar const *pattern) const
 400 {
 401     TextPosition m = std::strlen((char *)pattern);
 402     if (m == 0)
 403     {
 404         if (numberOfAllTexts - numberOfTexts > 0u)
 405             return true; // Empty texts exists
 406         return false; // No empty texts exists
 407     }
 408
 409     TextPosition sp = 0, ep = 0;
 410     // Match including end-marker
 411     Search(pattern, m+1, &sp, &ep);
 412
 413     // Check for end-marker(s) in result interval
 414     if (CountEndmarkers(sp, ep))
 415         return true;
 416     return false;
 417 }
 418
 419 bool CSA::IsContains(uchar const * pattern) const
 420 {
 421     TextPosition m = strlen((char *)pattern);
 422     if (m == 0)
 423         return true;
 424
 425     TextPosition sp = 0, ep = 0;
 426     // Just check if pattern exists somewhere
 427     ulong count = Search(pattern, m, &sp, &ep);
 428
 429     if (count > 0)
 430         return true;
 431     return false;
 432 }
 433
 434 bool CSA::IsLessThan(uchar const*) const
 435 {
 436     // TODO
 437     assert(0);
 438     return false;
 439 }
 440
 441 /******************************************************************
 442  * Counting queries
 443  */
 444 ulong CSA::Count(uchar const * pattern) const
 445 {
 446     TextPosition m = strlen((char *)pattern);
 447     if (m == 0)
 448         return 0;
 449
 450     TextPosition sp = 0, ep = 0;
 451     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 452     return count;
 453 }
 454
 455 unsigned CSA::CountPrefix(uchar const * pattern) const
 456 {
 457     TextPosition m = strlen((char *)pattern);
 458     if (m == 0)
 459         return numberOfAllTexts;
 460
 461     TextPosition sp = 0, ep = 0;
 462     Search(pattern, m, &sp, &ep);
 463
 464     // Count end-markers in result interval
 465     return CountEndmarkers(sp, ep);
 466 }
 467
 468 unsigned CSA::CountSuffix(uchar const * pattern) const
 469 {
 470     TextPosition m = strlen((char *)pattern);
 471     if (m == 0)
 472         return numberOfAllTexts;
 473
 474     TextPosition sp = 0, ep = 0;
 475     // Search with end-marker
 476     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 477
 478     return count;
 479 }
 480
 481 unsigned CSA::CountEqual(uchar const *pattern) const
 482 {
 483     TextPosition m = strlen((char const *)pattern);
 484     if (m == 0)
 485         return numberOfAllTexts - numberOfTexts; // Empty texts.
 486
 487     TextPosition sp = 0, ep = 0;
 488     // Match including end-marker
 489     Search(pattern, m+1, &sp, &ep);
 490
 491     // Count end-markers in result interval
 492     return CountEndmarkers(sp, ep);
 493 }
 494
 495 unsigned CSA::CountContains(uchar const * pattern) const
 496 {
 497     TextPosition m = strlen((char const *)pattern);
 498     if (m == 0)
 499         return numberOfAllTexts; // Total number of texts.
 500
 501     // Here counting is as slow as fetching the result set
 502     // because we would have to filter out occ's that fall within same document.
 503     TextCollection::document_result result = Contains(pattern);
 504     return result.size();
 505 }
 506
 507 unsigned CSA::CountLessThan(uchar const * pattern) const
 508 {
 509     // TODO
 510     assert(0);
 511     return 0;
 512 }
 513
 514 /**
 515  * Document reporting queries
 516  */
 517 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 518 {
 519     TextPosition m = strlen((char *)pattern);
 520     if (m == 0)
 521         return TextCollection::document_result(); // FIXME Should return all 1...k
 522
 523     TextPosition sp = 0, ep = 0;
 524     Search(pattern, m, &sp, &ep);
 525
 526     TextCollection::document_result result;
 527
 528     // Report end-markers in result interval
 529     unsigned resultSize = CountEndmarkers(sp, ep);
 530     if (resultSize == 0)
 531         return result;
 532
 533     result.reserve(resultSize); // Try to avoid reallocation.
 534
 535     // Iterate through end-markers in [sp,ep]:
 536     unsigned i = 0;
 537     if (sp > 0)
 538         i = alphabetrank->rank(0, sp - 1);
 539     while (resultSize)
 540     {
 541         // End-marker we found belongs to the "preceeding" doc in the collection
 542         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 543         // Map to doc ID:
 544         docId = emptyTextRank->select0(docId+1);
 545         result.push_back(docId);
 546
 547         -- resultSize;
 548         ++ i;
 549     }
 550
 551     return result;
 552 }
 553
 554 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 555 {
 556     TextPosition m = strlen((char *)pattern);
 557     if (m == 0)
 558         return TextCollection::document_result(); // FIXME Should return all 1...k
 559
 560     TextPosition sp = 0, ep = 0;
 561     // Search with end-marker
 562     Search(pattern, m+1, &sp, &ep);
 563
 564     TextCollection::document_result result;
 565     result.reserve(ep-sp+1); // Try to avoid reallocation.
 566
 567     ulong sampled_rank_i = 0;
 568     // Check each occurrence
 569     for (; sp <= ep; ++sp)
 570     {
 571         TextPosition i = sp;
 572
 573         uchar c = alphabetrank->access(i);
 574         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 575         {
 576             i = C[c]+alphabetrank->rank(c,i)-1;
 577             c = alphabetrank->access(i);
 578         }
 579         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 580
 581         if (c == '\0')
 582         {
 583             // Rank among the end-markers in BWT
 584             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 585
 586             // End-marker that we found belongs to the "preceeding" doc in collection:
 587             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 588             // Map to doc ID:
 589             docId = emptyTextRank->select0(docId+1);
 590             result.push_back(docId);
 591         }
 592         else // Sampled position
 593         {
 594             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 595             // Map to doc ID:
 596             docId = emptyTextRank->select0(docId+1);
 597             result.push_back(docId);
 598         }
 599     }
 600
 601     return result;
 602 }
 603
 604 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 605 {
 606     TextPosition m = strlen((char const *)pattern);
 607     if (m == 0)
 608         return TextCollection::document_result(); // FIXME Should return all empty texts
 609
 610     TextPosition sp = 0, ep = 0;
 611     // Match including end-marker
 612     Search(pattern, m+1, &sp, &ep);
 613
 614     TextCollection::document_result result;
 615
 616     // Report end-markers in result interval
 617     unsigned resultSize = CountEndmarkers(sp, ep);
 618     if (resultSize == 0)
 619         return result;
 620
 621     result.reserve(resultSize); // Try to avoid reallocation.
 622
 623     unsigned i = 0;
 624     if (sp > 0)
 625         i = alphabetrank->rank(0, sp - 1);
 626     while (resultSize)
 627     {
 628         // End-marker we found belongs to the "previous" doc in the collection
 629         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 630         // Map to doc ID:
 631         docId = emptyTextRank->select0(docId+1);
 632         result.push_back(docId);
 633
 634         -- resultSize;
 635         ++ i;
 636     }
 637
 638     return result;
 639 }
 640
 641
 642 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 643 {
 644     TextPosition m = strlen((char *)pattern);
 645     if (m == 0)
 646         return TextCollection::document_result();
 647
 648     TextPosition sp = 0, ep = 0;
 649     // Search all occurrences
 650     Search(pattern, m, &sp, &ep);
 651
 652     // We want unique document indentifiers, using std::set to collect them
 653     std::set<DocId> resultSet;
 654
 655     ulong sampled_rank_i = 0;
 656     // Check each occurrence
 657     for (; sp <= ep; ++sp)
 658     {
 659         TextPosition i = sp;
 660         uchar c = alphabetrank->access(i);
 661         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 662         {
 663             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 664             c = alphabetrank->access(i);
 665         }
 666         if (c == '\0')
 667         {
 668             // Rank among the end-markers in BWT
 669             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 670
 671             // End-marker that we found belongs to the "preceeding" doc in collection:
 672             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 673             resultSet.insert(docId);
 674         }
 675         else
 676         {
 677             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 678             assert((unsigned)di < numberOfTexts);
 679             resultSet.insert(di);
 680         }
 681     }
 682
 683     // Convert std::set to std::vector
 684     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 685     // Map to doc ID's
 686     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 687         *it = emptyTextRank->select0(*it+1);
 688     return result;
 689 }
 690
 691 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 692 {
 693     // TODO
 694     assert(0);
 695     return document_result();
 696 }
 697
 698 /**
 699  * Full result set queries
 700  */
 701 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 702 {
 703     TextPosition m = strlen((char *)pattern);
 704     if (m == 0)
 705         return full_result(); // FIXME Throw exception?
 706
 707     TextPosition sp = 0, ep = 0;
 708     // Search all occurrences
 709     Search(pattern, m, &sp, &ep);
 710
 711     full_result result;
 712     result.reserve(ep-sp+1); // Try to avoid reallocation.
 713
 714     ulong sampled_rank_i = 0;
 715     // Report each occurrence
 716     for (; sp <= ep; ++sp)
 717     {
 718         TextPosition i = sp;
 719         TextPosition dist = 0;
 720         uchar c = alphabetrank->access(i);
 721         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 722         {
 723             i = C[c]+alphabetrank->rank(c,i)-1;
 724             c = alphabetrank->access(i);
 725             ++ dist;
 726         }
 727         if (c == '\0')
 728         {
 729             // Rank among the end-markers in BWT
 730             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 731
 732             // End-marker that we found belongs to the "preceeding" doc in collection:
 733             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 734             // Map to doc ID:
 735             docId = emptyTextRank->select0(docId+1);
 736             result.push_back(make_pair(docId, dist));
 737         }
 738         else
 739         {
 740             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 741             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 742 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 743
 744             // Map to doc ID:
 745             docId = emptyTextRank->select0(docId+1);
 746             result.push_back(make_pair(docId, textPos));
 747         }
 748     }
 749
 750     return result;
 751 }
 752
 753
 754 /**
 755  * Save index to a file handle
 756  *
 757  * Throws a std::runtime_error exception on i/o error.
 758  * First byte that is saved represents the version number of the save file.
 759  * In version 2 files, the data fields are:
 760  *  uchar versionFlag;
 761     TextPosition n;
 762     unsigned samplerate;
 763     unsigned C[256];
 764     TextPosition bwtEndPos;
 765     static_sequence * alphabetrank;
 766     BSGAP *sampled;
 767     BlockArray *suffixes;
 768     BlockArray *suffixDocId;
 769     unsigned numberOfTexts;
 770     unsigned numberOfAllTexts;
 771     ulong maxTextLength;
 772     BlockArray *endmarkerDocId;
 773     BSGAP *emptyTextRank;
 774  */
 775 void CSA::Save(FILE *file) const
 776 {
 777     // Saving version info:
 778     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 779         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 780
 781     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 782         throw std::runtime_error("CSA::Save(): file write error (n).");
 783     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 784         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 785
 786     for(ulong i = 0; i < 256; ++i)
 787         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 788             throw std::runtime_error("CSA::Save(): file write error (C table).");
 789
 790     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 791         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 792
 793     alphabetrank->save(file);
 794     sampled->Save(file);
 795     suffixes->Save(file);
 796     suffixDocId->Save(file);
 797
 798     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 799         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
 800     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 801         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
 802     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 803         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 804
 805     endmarkerDocId->Save(file);
 806     emptyTextRank->Save(file);
 807     fflush(file);
 808 }
 809
 810
 811 /**
 812  * Load index from a file handle
 813  *
 814  * Throws a std::runtime_error exception on i/o error.
 815  * For more info, see CSA::Save().
 816  */
 817 void CSA::Load(FILE *file, unsigned samplerate)
 818 {
 819     // Delete everything
 820 #ifdef CSA_TEST_BWT
 821     delete dynFMI;       dynFMI = 0;
 822 #endif
 823     delete alphabetrank; alphabetrank = 0;
 824     delete sampled;      sampled = 0;
 825     delete suffixes;     suffixes = 0;
 826     delete suffixDocId;  suffixDocId = 0;
 827     delete [] codetable; codetable = 0;
 828
 829     delete endmarkerDocId; endmarkerDocId = 0;
 830     delete emptyTextRank;  emptyTextRank = 0;
 831
 832     this->maxTextLength = 0;
 833     this->numberOfTexts = 0;
 834     this->numberOfAllTexts = 0;
 835     this->samplerate = samplerate;
 836     this->n = 0;
 837
 838     uchar verFlag = 0;
 839     if (std::fread(&verFlag, 1, 1, file) != 1)
 840         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 841     if (verFlag != CSA::versionFlag)
 842         throw std::runtime_error("CSA::Load(): invalid save file version.");
 843
 844     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 845         throw std::runtime_error("CSA::Load(): file read error (n).");
 846     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 847         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 848 // FIXME samplerate can not be changed during load.
 849 //    if (this->samplerate == 0)
 850 //        this->samplerate = samplerate;
 851
 852     for(ulong i = 0; i < 256; ++i)
 853         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 854             throw std::runtime_error("CSA::Load(): file read error (C table).");
 855
 856     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 857         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 858
 859     alphabetrank = static_sequence::load(file);
 860     sampled = new BSGAP(file);
 861     suffixes = new BlockArray(file);
 862     suffixDocId = new BlockArray(file);
 863
 864     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 865         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
 866     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 867         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
 868     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 869         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 870
 871     endmarkerDocId = new BlockArray(file);
 872     emptyTextRank = new BSGAP(file);
 873
 874     // FIXME Construct data structures with new samplerate
 875     //maketables();
 876 }
 877
 878
 879
 880 /**
 881  * Rest of the functions follow...
 882  */
 883
 884
 885 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 886 {
 887     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 888     int c = (int)pattern[m-1];
 889     TextPosition i=m-1;
 890     TextPosition sp = C[c];
 891     TextPosition ep = C[c+1]-1;
 892     while (sp<=ep && i>=1)
 893     {
 894 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 895         c = (int)pattern[--i];
 896         sp = C[c]+alphabetrank->rank(c,sp-1);
 897         ep = C[c]+alphabetrank->rank(c,ep)-1;
 898     }
 899     *spResult = sp;
 900     *epResult = ep;
 901     if (sp<=ep)
 902         return ep - sp + 1;
 903     else
 904         return 0;
 905 }
 906
 907 CSA::~CSA() {
 908 #ifdef CSA_TEST_BWT
 909     delete dynFMI;
 910 #endif
 911     delete alphabetrank;
 912     delete sampled;
 913     delete suffixes;
 914     delete suffixDocId;
 915     delete [] codetable; // FIXME remove
 916
 917     delete endmarkerDocId;
 918     delete emptyTextRank;
 919 }
 920
 921 void CSA::makewavelet(uchar *bwt)
 922 {
 923     ulong i, min = 0,
 924              max;
 925     for (i=0;i<256;i++)
 926         C[i]=0;
 927     for (i=0;i<n;++i)
 928         C[(int)bwt[i]]++;
 929     for (i=0;i<256;i++)
 930         if (C[i]>0) {min = i; break;}
 931     for (i=255;i>=min;--i)
 932         if (C[i]>0) {max = i; break;}
 933
 934     ulong prev=C[0], temp;
 935     C[0]=0;
 936     for (i=1;i<256;i++) {
 937         temp = C[i];
 938         C[i]=C[i-1]+prev;
 939         prev = temp;
 940     }
 941 //    this->codetable = node::makecodetable(bwt,n);
 942 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
 943 //    delete [] bwt;
 944     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
 945
 946     alphabet_mapper * am = new alphabet_mapper_none();
 947     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
 948     wt_coder * wtc = new wt_coder_huff(bwt,n,am);
 949     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
 950     delete bmb;
 951     delete [] bwt;
 952 }
 953
 954 void CSA::maketables()
 955 {
 956     // Calculate BWT end-marker position (of last inserted text)
 957     // Note: bwtEndPos already set!
 958     // and the length of the first text (counter l):
 959 #ifdef CSA_TEST_BWT
 960     ulong i = 0;
 961     ulong l = 1;
 962
 963     uchar c  = alphabetrank->access(i);
 964     while (c != '\0')
 965     {
 966         i = C[c]+alphabetrank->rank(c, i)-1;
 967         ++l;
 968         c = alphabetrank->access(i);
 969     }
 970     assert(i == bwtEndPos); // compare result
 971 #endif
 972
 973     // Build up arrays for text length and starting positions
 974     // FIXME Temp, remove
 975     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
 976     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 977     //(*textLength)[0] = l;
 978     (*textStartPos)[0] = 0;
 979
 980     // Construct samples
 981     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
 982     unsigned ceilLog2n = Tools::CeilLog2(n);
 983     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
 984     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
 985
 986     // Mapping from end-markers to doc ID's:
 987     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
 988
 989     ulong *sampledpositions = new ulong[n/W+1];
 990     for (ulong i=0;i<n/W+1;i++)
 991         sampledpositions[i]=0lu;
 992
 993     ulong x,p=bwtEndPos;
 994     ulong sampleCount = 0;
 995     // Keeping track of text position of prev. end-marker seen
 996     ulong posOfSuccEndmarker = n;
 997     DocId textId = numberOfTexts;
 998     ulong ulongmax = 0;
 999     ulongmax--;
1000
1001     //positions:
1002     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1003       // i substitutes SA->GetPos(i)
1004         x=(i==n-1)?0:i+1;
1005
1006         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1007             Tools::SetField(sampledpositions,1,p,1);
1008             (*positions)[sampleCount] = p;
1009             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1010             sampleCount ++;
1011         }
1012
1013         uchar c = alphabetrank->access(p);
1014         if (c == '\0')
1015         {
1016             --textId;
1017
1018             // Record the order of end-markers in BWT:
1019             ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1020             (*endmarkerDocId)[endmarkerRank] = textId;
1021
1022             // Store text length and text start position:
1023             if (textId < (DocId)numberOfTexts - 1)
1024             {
1025                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1026                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1027                 posOfSuccEndmarker = x;
1028             }
1029
1030             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1031             p = textId; // Correct LF-mapping to the last char of the previous text.
1032         }
1033         else
1034             p = C[c]+alphabetrank->rank(c, p)-1;
1035     }
1036     assert(textId == 0);
1037
1038     sampled = new BSGAP(sampledpositions,n,true);
1039     sampleLength = sampled->rank(n-1);
1040     assert(sampleCount == sampleLength);
1041
1042     // Suffixes store an offset from the text start position
1043     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1044     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1045
1046     for(ulong i=0; i<sampleLength; i++) {
1047         assert((*positions)[i] < n);
1048         ulong j = sampled->rank((*positions)[i]);
1049         if (j==0) j=sampleLength;
1050         TextPosition textPos = (*tmpSuffix)[i];
1051         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1052
1053         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1054         assert((*suffixDocId)[j-1] < numberOfTexts);
1055         // calculate offset from text start:
1056         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1057     }
1058     // FIXME Temp, remove
1059     delete tmpSuffix;
1060     delete positions;
1061 //    delete textLength;
1062     delete textStartPos;
1063 }
1064
1065
1066 /**
1067  * Finds document identifier for given text position
1068  *
1069  * Starting text position of the document is stored into second parameter.
1070  * Binary searching on text starting positions.
1071  */
1072 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1073 {
1074     assert(i < n);
1075
1076     DocId a = 0;
1077     DocId b = numberOfTexts - 1;
1078     while (a < b)
1079     {
1080         DocId c = a + (b - a)/2;
1081         if ((*textStartPos)[c] > i)
1082             b = c - 1;
1083         else if ((*textStartPos)[c+1] > i)
1084             return c;
1085         else
1086             a = c + 1;
1087     }
1088
1089     assert(a < (DocId)numberOfTexts);
1090     assert(i >= (*textStartPos)[a]);
1091     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1092     return a;
1093 }
1094
1095 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1096 {
1097     TCodeEntry *result = new TCodeEntry[ 256 ];
1098
1099     count_chars( text, n, result );
1100     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1101 //
1102 // First I push all the leaf nodes into the queue
1103 //
1104     for ( unsigned int i = 0 ; i < 256 ; i++ )
1105         if ( result[ i ].count )
1106             q.push(node( i, result[ i ].count ) );
1107 //
1108 // This loop removes the two smallest nodes from the
1109 // queue.  It creates a new internal node that has
1110 // those two nodes as children. The new internal node
1111 // is then inserted into the priority queue.  When there
1112 // is only one node in the priority queue, the tree
1113 // is complete.
1114 //
1115
1116     while ( q.size() > 1 ) {
1117         node *child0 = new node( q.top() );
1118         q.pop();
1119         node *child1 = new node( q.top() );
1120         q.pop();
1121         q.push( node( child0, child1 ) );
1122     }
1123 //
1124 // Now I compute and return the codetable
1125 //
1126     q.top().maketable(0u,0u, result);
1127     q.pop();
1128     return result;
1129 }
1130
1131
1132 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1133 {
1134     if ( child0 )
1135     {
1136         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1137         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1138         delete child0;
1139         delete child1;
1140     }
1141     else
1142     {
1143         codetable[value].code = code;
1144         codetable[value].bits = bits;
1145     }
1146 }
1147
1148 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1149 {
1150     ulong i;
1151     for (i = 0 ; i < 256 ; i++ )
1152         counts[ i ].count = 0;
1153     for (i=0; i<n; i++)
1154         counts[(int)text[i]].count++;
1155 }
1156
1157 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1158       return x | (bit << pos);
1159 }
1160
1161 } // namespace SXSI
1162