CSA.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "CSA.h"
  21 #include "TextCollection.h"
  22 #include <iostream>
  23 #include <queue>
  24 #include <iomanip>
  25 #include <set>
  26 #include <vector>
  27 #include <utility>
  28 #include <stdexcept>
  29 #include <cassert>
  30 #include <cstring> // For strlen()
  31 using std::vector;
  32 using std::pair;
  33 using std::make_pair;
  34 using std::map;
  35
  36 using SXSI::TextCollection;
  37
  38 // Save file version info
  39 const uchar CSA::versionFlag = 2;
  40
  41 ////////////////////////////////////////////////////////////////////////////
  42 // Class CSA::THuffAlphabetRank
  43
  44 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
  45     left = NULL;
  46     right = NULL;
  47     bitrank = NULL;
  48     ch = s[0];
  49     leaf = false;
  50     this->codetable = codetable;
  51
  52     bool *B = new bool[n];
  53     TextPosition sum=0,i;
  54     /*
  55     for (i=0; i< n; i++) {
  56        printf("%c:", (char)((int)s[i]-128));
  57        for (r=0;r<codetable[(int)s[i]].bits;r++)
  58           if (codetable[(int)s[i]].code & (1u <<r))
  59              printf("1");
  60           else printf("0");
  61        printf("\n");
  62     }
  63     printf("\n");
  64     if (level > 100) return;*/
  65     for (i=0;i<n;i++)
  66        if (codetable[(int)s[i]].code & (1u << level)) {
  67           B[i] = true;
  68           sum++;
  69        }
  70        else B[i] = false;
  71     if (sum==0 || sum==n) {
  72         delete [] B;
  73         leaf = true;
  74         return;
  75     }
  76     uchar *sfirst, *ssecond;
  77     //if (n-sum > 0)
  78         sfirst = new uchar[n-sum];
  79     //if (sum > 0)
  80         ssecond = new uchar[sum];
  81     unsigned j=0,k=0;
  82    for (i=0;i<n;i++)
  83         if (B[i]) ssecond[k++] = s[i];
  84         else sfirst[j++] = s[i];
  85     ulong *Binbits = new ulong[n/W+1];
  86     for (i=0;i<n;i++)
  87         Tools::SetField(Binbits,1,i,B[i]);
  88     delete [] B;
  89     bitrank = new BitRank(Binbits,n,true);
  90     //if (j > 0) {
  91         left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
  92         delete [] sfirst;
  93     //}
  94     //if (k>0) {
  95         right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
  96         delete [] ssecond;
  97     //}
  98 }
  99
 100
 101 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
 102     // testing that the code works correctly
 103     int C[256];
 104     unsigned i,j;
 105     bool correct=true;
 106     for (j=0;j<256;j++)
 107         C[j] = 0;
 108     for (i=0;i<n;i++) {
 109         C[(int)s[i]]++;
 110         if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
 111         correct = false;
 112         printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
 113         }
 114     }
 115     return correct;
 116 }
 117
 118 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
 119     if (left!=NULL) delete left;
 120     if (right!=NULL) delete right;
 121     if (bitrank!=NULL)
 122         delete bitrank;
 123 }
 124
 125 /**
 126  * Saving data fields:
 127         BitRank *bitrank;
 128         bool leaf;
 129         uchar ch;
 130         left child;
 131         right child;
 132 */
 133 void CSA::THuffAlphabetRank::Save(FILE *file)
 134 {
 135
 136 }
 137
 138 CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file)
 139 {
 140
 141
 142 }
 143
 144 /////////////////////////////////////////////f///////////////////////////////
 145 // Class CSA
 146
 147 /**
 148  * Constructor inits an empty dynamic FM-index.
 149  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 150  */
 151 CSA::CSA(unsigned samplerate)
 152     : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0),
 153       codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
 154       textLength(0), textStartPos(0), emptyTextRank(0)
 155 {
 156     this->samplerate = samplerate;
 157
 158     // FIXME TODO : DynFMI needs distribution of characters before hand
 159     // This will create fully balanced wavelet tree for all chars [0, 255].
 160     uchar temp[256];
 161     for (unsigned i = 0; i < 255; ++i)
 162         temp[i] = i+1;
 163     temp[255] = 0;
 164     dynFMI = new DynFMI(temp, 1, 255, false);
 165
 166     /* Debug code: take char distribution from data.txt.
 167     uchar *temp = Tools::GetFileContents("data.txt", 0);
 168     dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
 169     delete [] temp;*/
 170 }
 171
 172 /**
 173  * Insert new text
 174  *
 175  * Given text must include end-marker.
 176  * Text identifiers are numbered starting from 0.
 177  */
 178 void CSA::InsertText(uchar const * text)
 179 {
 180     // Sanity check:
 181     assert(dynFMI != 0);
 182
 183     TextPosition m = std::strlen((char *)text) + 1;
 184     if (m > maxTextLength)
 185         maxTextLength = m; // Store length of the longest text seen so far.
 186
 187     if (m > 1)
 188     {
 189         this->n += m;
 190         this->numberOfTexts ++;
 191         this->numberOfAllTexts ++;
 192         dynFMI->addText(text, m);
 193     }
 194     else
 195     {
 196         emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
 197         this->numberOfAllTexts ++;            // Replace with dynamic bitvector
 198     }
 199 }
 200
 201 void CSA::MakeStatic()
 202 {
 203     // Sanity check:
 204     if (dynFMI == 0)
 205         throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
 206
 207     // Bitvector of empty texts
 208     {
 209         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
 210         ulong *tempB = new ulong[numberOfAllTexts/W + 1];
 211         for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
 212             tempB[i] = 0;
 213         for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
 214             Tools::SetField(tempB, 1, (*it), 1);
 215         emptyTextId.clear();
 216         emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
 217     }
 218
 219     uchar *bwt = dynFMI->getBWT();
 220     /*printf("123456789012345678901234567890123456789\n");
 221     for (TextPosition i = 0; i < n; i ++)
 222         if (bwt[i] < 50)
 223             printf("%d", (int)bwt[i]);
 224         else
 225             printf("%c", bwt[i]);
 226             printf("\n");*/
 227
 228
 229 /*    for (TextPosition i = 1; i <= n; i ++)
 230     {
 231         printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
 232     }*/
 233
 234     // Sanity check
 235     assert(numberOfTexts == dynFMI->getCollectionSize());
 236
 237     delete dynFMI;
 238     dynFMI = 0;
 239
 240     makewavelet(bwt); // Deletes bwt!
 241     bwt = 0;
 242
 243     // Calculate BWT end-marker position (of last inserted text)
 244     // and the length of the first text (counter l):
 245     ulong i = 0;
 246     ulong l = 1;
 247
 248     uchar c  = alphabetrank->access(i);
 249     while (c != '\0')
 250     {
 251         i = C[c]+alphabetrank->rank(c, i)-1;
 252         ++l;
 253         c = alphabetrank->access(i);
 254     }
 255     bwtEndPos = i;
 256     assert(bwtEndPos < n);
 257     //printf("bwtEndPos = %lu\n", bwtEndPos);
 258
 259     // Build up arrays for text length and starting positions
 260     // FIXME Remove, temp data
 261     textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
 262     textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
 263     (*textLength)[0] = l;
 264     (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
 265
 266     // Make sampling tables
 267     maketables();
 268 }
 269
 270 bool CSA::EmptyText(DocId k) const
 271 {
 272     assert(k < (DocId)numberOfTexts);
 273     if (emptyTextRank->IsBitSet(k))
 274         return true;
 275     return false;
 276 }
 277
 278 uchar* CSA::GetText(DocId k) const
 279 {
 280     assert(k < (DocId)numberOfTexts);
 281
 282     ulong textRank = 0;
 283     if (emptyTextRank->IsBitSet(k, &textRank))
 284     {
 285         uchar* result = new uchar[1];
 286         result[0] = 0;
 287         return result;
 288     }
 289     // Map to non-empty text
 290     k -= textRank; //emptyTextRank->rank(k);
 291
 292     TextPosition i = k;
 293
 294     string result;
 295     // Reserve average string length to avoid reallocs
 296     result.reserve(n/numberOfTexts);
 297
 298     uchar c = alphabetrank->access(i);
 299     while (c != '\0')
 300     {
 301         result.push_back(c);
 302         i = C[c]+alphabetrank->rank(c,i)-1;
 303
 304         c = alphabetrank->access(i); // "next" char.
 305     }
 306
 307     // Convert to uchar (FIXME return string?)
 308     i = result.size();
 309     uchar* res = new uchar[i+1];
 310     res[i] = '\0';
 311     for (ulong j = 0; j < i; ++j)
 312         res[i-j-1] = result[j];
 313     return res;
 314 }
 315 /*
 316  * Not supported
 317 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
 318 {
 319     assert(k < (DocId)numberOfTexts);
 320     assert(j < (*textLength)[k]);
 321     assert(i <= j);
 322
 323     ulong textRank = 0;
 324     if (emptyTextRank->IsBitSet(k, &textRank))
 325     {
 326         uchar* result = new uchar[1];
 327         result[0] = 0;
 328         return result;
 329     }
 330
 331     // Map to non-empty text
 332     k -= textRank; // emptyTextRank->rank(k);
 333
 334     // Start position of k'th text
 335     ulong start = (*textStartPos)[k];
 336
 337     return Substring(i + start, j-i+1);
 338     }*/
 339
 340 /******************************************************************
 341  * Existential queries
 342  */
 343 bool CSA::IsPrefix(uchar const * pattern) const
 344 {
 345     TextPosition m = strlen((char *)pattern);
 346     if (m == 0)
 347         return true;
 348
 349     TextPosition sp = 0, ep = 0;
 350     Search(pattern, m, &sp, &ep);
 351
 352     // Check for end-marker(s) in result interval
 353     if (CountEndmarkers(sp, ep))
 354         return true;
 355     return false;
 356 }
 357
 358 bool CSA::IsSuffix(uchar const *pattern) const
 359 {
 360     // Here counting is as fast as IsSuffix():
 361     if (CountSuffix(pattern) > 0)
 362         return true;
 363     return false;
 364 }
 365
 366 bool CSA::IsEqual(uchar const *pattern) const
 367 {
 368     TextPosition m = std::strlen((char *)pattern);
 369     if (m == 0)
 370     {
 371         if (numberOfAllTexts - numberOfTexts > 0u)
 372             return true; // Empty texts exists
 373         return false; // No empty texts exists
 374     }
 375
 376     TextPosition sp = 0, ep = 0;
 377     // Match including end-marker
 378     Search(pattern, m+1, &sp, &ep);
 379
 380     // Check for end-marker(s) in result interval
 381     if (CountEndmarkers(sp, ep))
 382         return true;
 383     return false;
 384 }
 385
 386 bool CSA::IsContains(uchar const * pattern) const
 387 {
 388     TextPosition m = strlen((char *)pattern);
 389     if (m == 0)
 390         return true;
 391
 392     TextPosition sp = 0, ep = 0;
 393     // Just check if pattern exists somewhere
 394     ulong count = Search(pattern, m, &sp, &ep);
 395
 396     if (count > 0)
 397         return true;
 398     return false;
 399 }
 400
 401 bool CSA::IsLessThan(uchar const*) const
 402 {
 403     // TODO
 404     assert(0);
 405     return false;
 406 }
 407
 408 /******************************************************************
 409  * Counting queries
 410  */
 411 unsigned CSA::CountPrefix(uchar const * pattern) const
 412 {
 413     TextPosition m = strlen((char *)pattern);
 414     if (m == 0)
 415         return numberOfAllTexts;
 416
 417     TextPosition sp = 0, ep = 0;
 418     Search(pattern, m, &sp, &ep);
 419
 420     // Count end-markers in result interval
 421     return CountEndmarkers(sp, ep);
 422 }
 423
 424 unsigned CSA::CountSuffix(uchar const * pattern) const
 425 {
 426     TextPosition m = strlen((char *)pattern);
 427     if (m == 0)
 428         return numberOfAllTexts;
 429
 430     TextPosition sp = 0, ep = 0;
 431     // Search with end-marker
 432     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 433
 434     return count;
 435 }
 436
 437 unsigned CSA::CountEqual(uchar const *pattern) const
 438 {
 439     TextPosition m = strlen((char const *)pattern);
 440     if (m == 0)
 441         return numberOfAllTexts - numberOfTexts; // Empty texts.
 442
 443     TextPosition sp = 0, ep = 0;
 444     // Match including end-marker
 445     Search(pattern, m+1, &sp, &ep);
 446
 447     // Count end-markers in result interval
 448     return CountEndmarkers(sp, ep);
 449 }
 450
 451 unsigned CSA::CountContains(uchar const * pattern) const
 452 {
 453     TextPosition m = strlen((char const *)pattern);
 454     if (m == 0)
 455         return numberOfAllTexts; // Total number of texts.
 456
 457     // Here counting is as slow as fetching the result set
 458     // because we would have to filter out occ's that fall within same document.
 459     TextCollection::document_result result = Contains(pattern);
 460     return result.size();
 461 }
 462
 463 unsigned CSA::CountLessThan(uchar const * pattern) const
 464 {
 465     // TODO
 466     assert(0);
 467     return 0;
 468 }
 469
 470 /**
 471  * Document reporting queries
 472  */
 473 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
 474 {
 475     TextPosition m = strlen((char *)pattern);
 476     if (m == 0)
 477         return TextCollection::document_result(); // FIXME Should return all 1...k
 478
 479     TextPosition sp = 0, ep = 0;
 480     Search(pattern, m, &sp, &ep);
 481
 482     TextCollection::document_result result;
 483
 484     // Report end-markers in result interval
 485     unsigned resultSize = CountEndmarkers(sp, ep);
 486     if (resultSize == 0)
 487         return result;
 488
 489     result.reserve(resultSize); // Try to avoid reallocation.
 490
 491     // Iterate through end-markers in [sp,ep]:
 492     unsigned i = 0;
 493     if (sp > 0)
 494         i = alphabetrank->rank(0, sp - 1);
 495     while (resultSize)
 496     {
 497         // End-marker we found belongs to the "preceeding" doc in the collection
 498         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 499         // Map to doc ID:
 500         docId = emptyTextRank->select0(docId+1);
 501         result.push_back(docId);
 502
 503         -- resultSize;
 504         ++ i;
 505     }
 506
 507     return result;
 508 }
 509
 510 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
 511 {
 512     TextPosition m = strlen((char *)pattern);
 513     if (m == 0)
 514         return TextCollection::document_result(); // FIXME Should return all 1...k
 515
 516     TextPosition sp = 0, ep = 0;
 517     // Search with end-marker
 518     Search(pattern, m+1, &sp, &ep);
 519
 520     TextCollection::document_result result;
 521     result.reserve(ep-sp+1); // Try to avoid reallocation.
 522
 523     ulong sampled_rank_i = 0;
 524     // Check each occurrence
 525     for (; sp <= ep; ++sp)
 526     {
 527         TextPosition i = sp;
 528
 529         uchar c = alphabetrank->access(i);
 530         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 531         {
 532             i = C[c]+alphabetrank->rank(c,i)-1;
 533             c = alphabetrank->access(i);
 534         }
 535         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 536
 537         if (c == '\0')
 538         {
 539             // Rank among the end-markers in BWT
 540             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 541
 542             // End-marker that we found belongs to the "preceeding" doc in collection:
 543             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 544             // Map to doc ID:
 545             docId = emptyTextRank->select0(docId+1);
 546             result.push_back(docId);
 547         }
 548         else // Sampled position
 549         {
 550             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 551             // Map to doc ID:
 552             docId = emptyTextRank->select0(docId+1);
 553             result.push_back(docId);
 554         }
 555     }
 556
 557     return result;
 558 }
 559
 560 TextCollection::document_result CSA::Equal(uchar const *pattern) const
 561 {
 562     TextPosition m = strlen((char const *)pattern);
 563     if (m == 0)
 564         return TextCollection::document_result(); // FIXME Should return all empty texts
 565
 566     TextPosition sp = 0, ep = 0;
 567     // Match including end-marker
 568     Search(pattern, m+1, &sp, &ep);
 569
 570     TextCollection::document_result result;
 571
 572     // Report end-markers in result interval
 573     unsigned resultSize = CountEndmarkers(sp, ep);
 574     if (resultSize == 0)
 575         return result;
 576
 577     result.reserve(resultSize); // Try to avoid reallocation.
 578
 579     unsigned i = 0;
 580     if (sp > 0)
 581         i = alphabetrank->rank(0, sp - 1);
 582     while (resultSize)
 583     {
 584         // End-marker we found belongs to the "previous" doc in the collection
 585         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 586         // Map to doc ID:
 587         docId = emptyTextRank->select0(docId+1);
 588         result.push_back(docId);
 589
 590         -- resultSize;
 591         ++ i;
 592     }
 593
 594     return result;
 595 }
 596
 597
 598 TextCollection::document_result CSA::Contains(uchar const * pattern) const
 599 {
 600     TextPosition m = strlen((char *)pattern);
 601     if (m == 0)
 602         return TextCollection::document_result();
 603
 604     TextPosition sp = 0, ep = 0;
 605     // Search all occurrences
 606     Search(pattern, m, &sp, &ep);
 607
 608     // We want unique document indentifiers, using std::set to collect them
 609     std::set<DocId> resultSet;
 610
 611     ulong sampled_rank_i = 0;
 612     // Check each occurrence
 613     for (; sp <= ep; ++sp)
 614     {
 615         TextPosition i = sp;
 616         uchar c = alphabetrank->access(i);
 617         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 618         {
 619             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 620             c = alphabetrank->access(i);
 621         }
 622         if (c == '\0')
 623         {
 624             // Rank among the end-markers in BWT
 625             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 626
 627             // End-marker that we found belongs to the "preceeding" doc in collection:
 628             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 629             resultSet.insert(docId);
 630         }
 631         else
 632         {
 633             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 634             assert((unsigned)di < numberOfTexts);
 635             resultSet.insert(di);
 636         }
 637     }
 638
 639     // Convert std::set to std::vector
 640     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 641     // Map to doc ID's
 642     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 643         *it = emptyTextRank->select0(*it+1);
 644     return result;
 645 }
 646
 647 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
 648 {
 649     // TODO
 650     assert(0);
 651     return document_result();
 652 }
 653
 654 /**
 655  * Full result set queries
 656  */
 657 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
 658 {
 659     TextPosition m = strlen((char *)pattern);
 660     if (m == 0)
 661         return full_result(); // FIXME Throw exception?
 662
 663     TextPosition sp = 0, ep = 0;
 664     // Search all occurrences
 665     Search(pattern, m, &sp, &ep);
 666
 667     full_result result;
 668     result.reserve(ep-sp+1); // Try to avoid reallocation.
 669
 670     ulong sampled_rank_i = 0;
 671     // Report each occurrence
 672     for (; sp <= ep; ++sp)
 673     {
 674         TextPosition i = sp;
 675         TextPosition dist = 0;
 676         uchar c = alphabetrank->access(i);
 677         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 678         {
 679             i = C[c]+alphabetrank->rank(c,i)-1;
 680             c = alphabetrank->access(i);
 681             ++ dist;
 682         }
 683         if (c == '\0')
 684         {
 685             // Rank among the end-markers in BWT
 686             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 687
 688             // End-marker that we found belongs to the "preceeding" doc in collection:
 689             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 690             // Map to doc ID:
 691             docId = emptyTextRank->select0(docId+1);
 692             result.push_back(make_pair(docId, dist));
 693         }
 694         else
 695         {
 696             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 697             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 698 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 699
 700             // Map to doc ID:
 701             docId = emptyTextRank->select0(docId+1);
 702             result.push_back(make_pair(docId, textPos));
 703         }
 704     }
 705
 706     return result;
 707 }
 708
 709
 710 /**
 711  * Save index to a file handle
 712  *
 713  * Throws a std::runtime_error exception on i/o error.
 714  * First byte that is saved represents the version number of the save file.
 715  * In version 2 files, the data fields are:
 716  *  uchar versionFlag;
 717     TextPosition n;
 718     unsigned samplerate;
 719     unsigned C[256];
 720     TextPosition bwtEndPos;
 721     static_sequence * alphabetrank;
 722     BSGAP *sampled;
 723     BlockArray *suffixes;
 724     BlockArray *suffixDocId;
 725     unsigned numberOfTexts;
 726     unsigned numberOfAllTexts;
 727     ulong maxTextLength;
 728     BlockArray *endmarkerDocId;
 729     BSGAP *emptyTextRank;
 730  */
 731 void CSA::Save(FILE *file) const
 732 {
 733     // Saving version info:
 734     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 735         throw std::runtime_error("CSA::Save(): file write error (version flag).");
 736
 737     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 738         throw std::runtime_error("CSA::Save(): file write error (n).");
 739     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 740         throw std::runtime_error("CSA::Save(): file write error (samplerate).");
 741
 742     for(ulong i = 0; i < 256; ++i)
 743         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 744             throw std::runtime_error("CSA::Save(): file write error (C table).");
 745
 746     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 747         throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
 748
 749     alphabetrank->save(file);
 750     sampled->Save(file);
 751     suffixes->Save(file);
 752     suffixDocId->Save(file);
 753
 754     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 755         throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
 756     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 757         throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
 758     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 759         throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
 760
 761     endmarkerDocId->Save(file);
 762     emptyTextRank->Save(file);
 763     fflush(file);
 764 }
 765
 766
 767 /**
 768  * Load index from a file handle
 769  *
 770  * Throws a std::runtime_error exception on i/o error.
 771  * For more info, see CSA::Save().
 772  */
 773 void CSA::Load(FILE *file, unsigned samplerate)
 774 {
 775     // Delete everything
 776     delete dynFMI;       dynFMI = 0;
 777     delete alphabetrank; alphabetrank = 0;
 778     delete sampled;      sampled = 0;
 779     delete suffixes;     suffixes = 0;
 780     delete suffixDocId;  suffixDocId = 0;
 781     delete positions; positions = 0;
 782     delete [] codetable; codetable = 0;
 783
 784     delete endmarkerDocId; endmarkerDocId = 0;
 785     delete emptyTextRank;  emptyTextRank = 0;
 786     // FIXME Remove following:
 787     delete textLength;     textLength = 0;
 788     delete textStartPos;   textStartPos = 0;
 789
 790     this->maxTextLength = 0;
 791     this->numberOfTexts = 0;
 792     this->numberOfAllTexts = 0;
 793     this->samplerate = samplerate;
 794     this->n = 0;
 795
 796     uchar verFlag = 0;
 797     if (std::fread(&verFlag, 1, 1, file) != 1)
 798         throw std::runtime_error("CSA::Load(): file read error (version flag).");
 799     if (verFlag != CSA::versionFlag)
 800         throw std::runtime_error("CSA::Load(): invalid save file version.");
 801
 802     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
 803         throw std::runtime_error("CSA::Load(): file read error (n).");
 804     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
 805         throw std::runtime_error("CSA::Load(): file read error (samplerate).");
 806 // FIXME samplerate can not be changed during load.
 807 //    if (this->samplerate == 0)
 808 //        this->samplerate = samplerate;
 809
 810     for(ulong i = 0; i < 256; ++i)
 811         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
 812             throw std::runtime_error("CSA::Load(): file read error (C table).");
 813
 814     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 815         throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
 816
 817     alphabetrank = static_sequence::load(file);
 818     sampled = new BSGAP(file);
 819     suffixes = new BlockArray(file);
 820     suffixDocId = new BlockArray(file);
 821
 822     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 823         throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
 824     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 825         throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
 826     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
 827         throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
 828
 829     endmarkerDocId = new BlockArray(file);
 830     emptyTextRank = new BSGAP(file);
 831
 832     // FIXME Construct data structures with new samplerate
 833     //maketables();  // FIXME: this will redo text length tables
 834 }
 835
 836
 837
 838 /**
 839  * Rest of the functions follow...
 840  */
 841
 842 /*
 843  * Not supported
 844 uchar * CSA::Substring(TextPosition i, TextPosition l) const
 845 {
 846     uchar *result = new uchar[l + 1];
 847     if (l == 0)
 848     {
 849         result[0] = 0u;
 850         return result;
 851     }
 852
 853     TextPosition dist;
 854     TextPosition k = i + l - 1;
 855     // Check for end of the string
 856     if (k > n - 1)
 857     {
 858         l -= k - n + 1;
 859         k = n - 1;
 860     }
 861
 862     TextPosition skip = samplerate - k % samplerate - 1;
 863     TextPosition j;
 864     if (k / samplerate + 1 >= n / samplerate)
 865     {
 866         j = bwtEndPos;
 867         skip = n - k - 1;
 868     }
 869     else
 870     {
 871         j = (*positions)[k/samplerate+1];
 872         //cout << samplerate << ' ' << j << '\n';
 873     }
 874
 875     for (dist = 0; dist < skip + l; dist++)
 876     {
 877         int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp);
 878         if (c == '\0')
 879         {
 880             // Rank among the end-markers in BWT
 881             unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
 882             j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
 883         }
 884         else
 885             j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 886         if (dist >= skip)
 887             result[l + skip - dist - 1] = c;
 888     }
 889     result[l] = 0u;
 890     return result;
 891 }
 892 */
 893   /*TextPosition CSA::inverse(TextPosition i)
 894 {
 895     TextPosition skip = samplerate - i % samplerate;
 896     TextPosition j;
 897     if (i / samplerate + 1 >= n / samplerate)
 898     {
 899         j = bwtEndPos;
 900         skip = n - i;
 901     }
 902     else
 903     {
 904         j = (*positions)[i/samplerate+1];
 905         //cout << samplerate << ' ' << j << '\n';
 906     }
 907
 908     while (skip > 0)
 909     {
 910         int c = alphabetrank->charAtPos(j);
 911         j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
 912         skip --;
 913     }
 914     return j;
 915     }*/
 916
 917 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
 918 {
 919     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
 920     int c = (int)pattern[m-1];
 921     TextPosition i=m-1;
 922     TextPosition sp = C[c];
 923     TextPosition ep = C[c+1]-1;
 924     while (sp<=ep && i>=1)
 925     {
 926 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
 927         c = (int)pattern[--i];
 928         sp = C[c]+alphabetrank->rank(c,sp-1);
 929         ep = C[c]+alphabetrank->rank(c,ep)-1;
 930     }
 931     *spResult = sp;
 932     *epResult = ep;
 933     if (sp<=ep)
 934         return ep - sp + 1;
 935     else
 936         return 0;
 937 }
 938
 939    /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
 940 {
 941     sp = C[(int)c]+alphabetrank->rank(c,sp-1);
 942     ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
 943
 944     if (sp<=ep)
 945         return ep - sp + 1;
 946     else
 947         return 0;
 948         }*/
 949
 950 CSA::TextPosition CSA::Lookup(TextPosition i) const
 951 {
 952     TextPosition dist=0;
 953     while (!sampled->IsBitSet(i))
 954     {
 955
 956         int c = alphabetrank->access(i);
 957         if (c == '\0')
 958         {
 959             // End-markers are sampled
 960             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 961             DocId docId = (*endmarkerDocId)[endmarkerRank];
 962             return (*textStartPos)[docId] + dist;
 963         }
 964         i = C[c]+alphabetrank->rank(c,i)-1;
 965         ++dist;
 966     }
 967
 968     return (*suffixes)[sampled->rank(i)-1]+dist;
 969 }
 970
 971 CSA::~CSA() {
 972     delete dynFMI;
 973     delete alphabetrank;
 974     delete sampled;
 975     delete suffixes;
 976     delete suffixDocId;
 977     delete positions;
 978     delete [] codetable;
 979
 980     delete endmarkerDocId;
 981     delete textLength;
 982     delete textStartPos;
 983     delete emptyTextRank;
 984 }
 985
 986 void CSA::makewavelet(uchar *bwt)
 987 {
 988     ulong i, min = 0,
 989              max;
 990     for (i=0;i<256;i++)
 991         C[i]=0;
 992     for (i=0;i<n;++i)
 993         C[(int)bwt[i]]++;
 994     for (i=0;i<256;i++)
 995         if (C[i]>0) {min = i; break;}
 996     for (i=255;i>=min;--i)
 997         if (C[i]>0) {max = i; break;}
 998
 999     // Print frequencies
1000     /*for(i = 0; i < 256; i++)
1001         if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
1002         fflush(stdout);*/
1003
1004     ulong prev=C[0], temp;
1005     C[0]=0;
1006     for (i=1;i<256;i++) {
1007         temp = C[i];
1008         C[i]=C[i-1]+prev;
1009         prev = temp;
1010     }
1011 //    this->codetable = node::makecodetable(bwt,n);
1012 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1013 //    delete [] bwt;
1014     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1015
1016     uint *text = new uint[n];
1017     for (i = 0; i < n; ++i) // Silly
1018         text[i] = bwt[i];
1019     delete [] bwt;
1020     bwt = 0;
1021
1022     alphabet_mapper * am = new alphabet_mapper_none();
1023     static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1024
1025     //cout << "Building Huffman table..."; cout.flush();
1026
1027     wt_coder * wtc = new wt_coder_huff(text,n,am);
1028
1029     //cout << "done" << endl; cout.flush();
1030     //cout << "Building static_sequence..."; cout.flush();
1031
1032     alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
1033     delete [] text;
1034     text = 0;
1035
1036 /*    for (i = 0; i < n; ++i)
1037     {
1038         uchar c = alphabetrank->charAtPos(i);
1039         TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
1040         printf("LF[%lu] = %lu\n", i, j);
1041         }*/
1042 }
1043
1044 void CSA::maketables()
1045 {
1046     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1047
1048     ulong *sampledpositions = new ulong[n/W+1];
1049     unsigned ceilLog2n = Tools::CeilLog2(n);
1050     positions = new BlockArray(sampleLength, ceilLog2n);
1051     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1052
1053     // Mapping from end-markers to doc ID's:
1054     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1055
1056     ulong i,j=0;
1057     for (i=0;i<n/W+1;i++)
1058         sampledpositions[i]=0lu;
1059
1060     ulong x,p=bwtEndPos;
1061     ulong sampleCount = 0;
1062     // Keeping track of text position of end-markers seen
1063     ulong posOfSuccEndmarker = n;
1064     DocId textId = numberOfTexts;
1065     ulong ulongmax = 0;
1066     ulongmax--;
1067
1068     //positions:
1069     for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1070       // i substitutes SA->GetPos(i)
1071         x=(i==n-1)?0:i+1;
1072
1073         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1074             Tools::SetField(sampledpositions,1,p,1);
1075             (*positions)[sampleCount] = p;
1076             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1077             sampleCount ++;
1078         }
1079
1080         uchar c = alphabetrank->access(p);
1081         if (c == '\0')
1082         {
1083             --textId;
1084
1085             // Record the order of end-markers in BWT:
1086             ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1087             (*endmarkerDocId)[endmarkerRank] = textId;
1088
1089             // Store text length and text start position:
1090             if (textId < (DocId)numberOfTexts - 1)
1091             {
1092                 (*textLength)[textId + 1] = posOfSuccEndmarker - x;
1093                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1094                 posOfSuccEndmarker = x;
1095             }
1096
1097             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1098             p = textId; // Correct LF-mapping to the last char of the previous text.
1099         }
1100         else
1101             p = C[c]+alphabetrank->rank(c, p)-1;
1102     }
1103     assert(textId == 0);
1104
1105     /*i = 0;
1106     for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
1107     {
1108         int docc = (*endmarkerDocId)[i];
1109         ulong poss = (*endmarkerPos)[i];
1110         printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
1111         }*/
1112 /*
1113     for (i = 0; i < numberOfTexts; ++ i)
1114     {
1115         //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
1116         assert(dynTextLength[i].first == (*textLength)[i]);
1117         assert(dynTextLength[i].second == (*textStartPos)[i]);
1118     }
1119 */
1120     sampled = new BSGAP(sampledpositions,n,true);
1121     sampleLength = sampled->rank(n-1);
1122     assert(sampleCount == sampleLength);
1123 //    std::cout << ";sampleLength;" << sampleLength << std::endl;
1124     // Suffixes == offset from text start position
1125     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1126     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1127
1128    //suffixes:
1129     for(i=0; i<sampleLength; i++) {
1130         assert((*positions)[i] < n);
1131         j = sampled->rank((*positions)[i]);
1132         if (j==0) j=sampleLength;
1133         TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate;
1134         (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]);
1135
1136         assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts);
1137         assert((*suffixDocId)[j-1] < numberOfTexts);
1138         // calculate offset from text start:
1139         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1140     }
1141     // FIXME Temp, remove
1142     delete tmpSuffix;
1143     tmpSuffix=0;
1144     delete positions;
1145     positions =0;
1146     delete textLength;
1147     textLength = 0;
1148     delete textStartPos;
1149     textStartPos = 0;
1150 }
1151
1152
1153 /**
1154  * Finds document identifier for given text position
1155  *
1156  * Starting text position of the document is stored into second parameter.
1157  * Binary searching on text starting positions.
1158  */
1159 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
1160 {
1161     assert(i < n);
1162
1163     DocId a = 0;
1164     DocId b = numberOfTexts - 1;
1165     while (a < b)
1166     {
1167         DocId c = a + (b - a)/2;
1168         if ((*textStartPos)[c] > i)
1169             b = c - 1;
1170         else if ((*textStartPos)[c+1] > i)
1171             return c;
1172         else
1173             a = c + 1;
1174     }
1175
1176     assert(a < (DocId)numberOfTexts);
1177     assert(i >= (*textStartPos)[a]);
1178     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1179     return a;
1180 }
1181
1182 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1183 {
1184     TCodeEntry *result = new TCodeEntry[ 256 ];
1185
1186     count_chars( text, n, result );
1187     std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1188 //
1189 // First I push all the leaf nodes into the queue
1190 //
1191     for ( unsigned int i = 0 ; i < 256 ; i++ )
1192         if ( result[ i ].count )
1193             q.push(node( i, result[ i ].count ) );
1194 //
1195 // This loop removes the two smallest nodes from the
1196 // queue.  It creates a new internal node that has
1197 // those two nodes as children. The new internal node
1198 // is then inserted into the priority queue.  When there
1199 // is only one node in the priority queue, the tree
1200 // is complete.
1201 //
1202
1203     while ( q.size() > 1 ) {
1204         node *child0 = new node( q.top() );
1205         q.pop();
1206         node *child1 = new node( q.top() );
1207         q.pop();
1208         q.push( node( child0, child1 ) );
1209     }
1210 //
1211 // Now I compute and return the codetable
1212 //
1213     q.top().maketable(0u,0u, result);
1214     q.pop();
1215     return result;
1216 }
1217
1218
1219 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1220 {
1221     if ( child0 )
1222     {
1223         child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1224         child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1225         delete child0;
1226         delete child1;
1227     }
1228     else
1229     {
1230         codetable[value].code = code;
1231         codetable[value].bits = bits;
1232     }
1233 }
1234
1235 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1236 {
1237     ulong i;
1238     for (i = 0 ; i < 256 ; i++ )
1239         counts[ i ].count = 0;
1240     for (i=0; i<n; i++)
1241         counts[(int)text[i]].count++;
1242 }
1243
1244 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1245       return x | (bit << pos);
1246 }
1247