TCImplementation.cpp

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20 #include "TCImplementation.h"
  21
  22 //#include "HeapProfiler.h" // FIXME remove
  23
  24 #include <iostream>
  25 #include <map>
  26 #include <set>
  27 #include <vector>
  28 #include <utility>
  29 #include <stdexcept>
  30 #include <cassert>
  31 #include <cstring> // For strlen()
  32 using std::vector;
  33 using std::pair;
  34 using std::make_pair;
  35 using std::map;
  36
  37 namespace SXSI
  38 {
  39
  40 // Save file version info
  41 const uchar TCImplementation::versionFlag = 3;
  42
  43 /**
  44  * Constructor inits an empty dynamic FM-index.
  45  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  46  */
  47 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
  48     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
  49       suffixDocId(0), numberOfTexts(numberOfTexts_), numberOfAllTexts(numberOfTexts_), maxTextLength(maxTextLength_), endmarkerDocId(0),
  50       emptyTextRank(0)
  51 {
  52
  53     // Bitvector of empty texts, FIXME Remove?
  54     {
  55         //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
  56         ulong *tempB = new ulong[numberOfTexts/W + 1];
  57         for (ulong i = 0; i < numberOfTexts/W + 1; ++i)
  58             tempB[i] = 0;
  59 //        for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
  60 //            Tools::SetField(tempB, 1, (*it), 1);
  61         emptyTextRank = new BSGAP(tempB, numberOfTexts, true);
  62    }
  63     makewavelet(bwt); // Deletes bwt!
  64     bwt = 0;
  65
  66     // Make sampling tables
  67     maketables();
  68 }
  69
  70 bool TCImplementation::EmptyText(DocId k) const
  71 {
  72     assert(k < (DocId)numberOfTexts);
  73     if (emptyTextRank->IsBitSet(k))
  74         return true;
  75     return false;
  76 }
  77
  78 uchar* TCImplementation::GetText(DocId k) const
  79 {
  80     assert(k < (DocId)numberOfTexts);
  81
  82     ulong textRank = 0;
  83     if (emptyTextRank->IsBitSet(k, &textRank))
  84     {
  85         uchar* result = new uchar[1];
  86         result[0] = 0;
  87         return result;
  88     }
  89     // Map to non-empty text
  90     k -= textRank; //emptyTextRank->rank(k);
  91
  92     TextPosition i = k;
  93
  94     string result;
  95     // Reserve average string length to avoid reallocs
  96     result.reserve(n/numberOfTexts);
  97
  98     uchar c = alphabetrank->access(i);
  99     while (c != '\0')
 100     {
 101         result.push_back(c);
 102         i = C[c]+alphabetrank->rank(c,i)-1;
 103
 104         c = alphabetrank->access(i); // "next" char.
 105     }
 106
 107     // Convert to uchar (FIXME return string?)
 108     i = result.size();
 109     uchar* res = new uchar[i+1];
 110     res[i] = '\0';
 111     for (ulong j = 0; j < i; ++j)
 112         res[i-j-1] = result[j];
 113     return res;
 114 }
 115 /*
 116  * Not supported
 117 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
 118 {
 119     assert(k < (DocId)numberOfTexts);
 120     assert(j < (*textLength)[k]);
 121     assert(i <= j);
 122
 123     ulong textRank = 0;
 124     if (emptyTextRank->IsBitSet(k, &textRank))
 125     {
 126         uchar* result = new uchar[1];
 127         result[0] = 0;
 128         return result;
 129     }
 130
 131     // Map to non-empty text
 132     k -= textRank; // emptyTextRank->rank(k);
 133
 134     // Start position of k'th text
 135     ulong start = (*textStartPos)[k];
 136
 137     return Substring(i + start, j-i+1);
 138     }*/
 139
 140 /******************************************************************
 141  * Existential queries
 142  */
 143 bool TCImplementation::IsPrefix(uchar const * pattern) const
 144 {
 145     TextPosition m = strlen((char *)pattern);
 146     if (m == 0)
 147         return true;
 148
 149     TextPosition sp = 0, ep = 0;
 150     Search(pattern, m, &sp, &ep);
 151
 152     // Check for end-marker(s) in result interval
 153     if (CountEndmarkers(sp, ep))
 154         return true;
 155     return false;
 156 }
 157
 158 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
 159 {
 160     TextPosition m = strlen((char *)pattern);
 161     if (m == 0)
 162         return true;
 163
 164     TextPosition sp = 0, ep = 0;
 165     Search(pattern, m, &sp, &ep);
 166
 167     // Check for end-marker(s) in result interval
 168     if (CountEndmarkers(sp, ep, begin, end))
 169         return true;
 170     return false;
 171 }
 172
 173
 174 bool TCImplementation::IsSuffix(uchar const *pattern) const
 175 {
 176     // Here counting is as fast as IsSuffix():
 177     if (CountSuffix(pattern) > 0)
 178         return true;
 179     return false;
 180 }
 181
 182 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
 183 {
 184     // Here counting is as fast as IsSuffix():
 185     if (CountSuffix(pattern, begin, end) > 0)
 186         return true;
 187     return false;
 188 }
 189
 190 bool TCImplementation::IsEqual(uchar const *pattern) const
 191 {
 192     TextPosition m = std::strlen((char *)pattern);
 193     if (m == 0)
 194     {
 195         if (numberOfAllTexts - numberOfTexts > 0u)
 196             return true; // Empty texts exists
 197         return false; // No empty texts exists
 198     }
 199
 200     TextPosition sp = 0, ep = 0;
 201     // Match including end-marker
 202     Search(pattern, m+1, &sp, &ep);
 203
 204     // Check for end-marker(s) in result interval
 205     if (CountEndmarkers(sp, ep))
 206         return true;
 207     return false;
 208 }
 209
 210 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
 211 {
 212     TextPosition m = std::strlen((char *)pattern);
 213     if (m == 0)
 214     {
 215         if (numberOfAllTexts - numberOfTexts > 0u)
 216             return true; // Empty texts exists
 217         return false; // No empty texts exists
 218     }
 219
 220     TextPosition sp = 0, ep = 0;
 221     // Match including end-marker
 222     Search(pattern, m+1, &sp, &ep, begin, end);
 223
 224     // Check for end-marker(s) in result interval
 225     if (CountEndmarkers(sp, ep))
 226         return true;
 227     return false;
 228 }
 229
 230 bool TCImplementation::IsContains(uchar const * pattern) const
 231 {
 232     TextPosition m = strlen((char *)pattern);
 233     if (m == 0)
 234         return true;
 235
 236     TextPosition sp = 0, ep = 0;
 237     // Just check if pattern exists somewhere
 238     ulong count = Search(pattern, m, &sp, &ep);
 239
 240     if (count > 0)
 241         return true;
 242     return false;
 243 }
 244
 245 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
 246 {
 247     // Here counting is as fast as existential querying
 248     if (CountContains(pattern, begin, end) > 0)
 249         return true; // FIXME No need to filter result set
 250     return false;
 251 }
 252
 253 bool TCImplementation::IsLessThan(uchar const * pattern) const
 254 {
 255     if (CountLessThan(pattern) > 0)
 256         return true;
 257     return false;
 258 }
 259
 260 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
 261 {
 262     if (CountLessThan(pattern, begin, end) > 0)
 263         return true;
 264     return false;
 265 }
 266
 267 /******************************************************************
 268  * Counting queries
 269  */
 270 ulong TCImplementation::Count(uchar const * pattern) const
 271 {
 272     TextPosition m = strlen((char *)pattern);
 273     if (m == 0)
 274         return 0;
 275
 276     TextPosition sp = 0, ep = 0;
 277     unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
 278     return count;
 279 }
 280
 281 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
 282 {
 283     TextPosition m = strlen((char *)pattern);
 284     if (m == 0)
 285         return numberOfAllTexts;
 286
 287     TextPosition sp = 0, ep = 0;
 288     Search(pattern, m, &sp, &ep);
 289
 290     // Count end-markers in result interval
 291     return CountEndmarkers(sp, ep);
 292 }
 293
 294 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
 295 {
 296     TextPosition m = strlen((char *)pattern);
 297     if (m == 0)
 298         return numberOfAllTexts;
 299
 300     TextPosition sp = 0, ep = 0;
 301     Search(pattern, m, &sp, &ep);
 302
 303     // Count end-markers in result interval
 304     return CountEndmarkers(sp, ep, begin, end);
 305 }
 306
 307 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
 308 {
 309     TextPosition m = strlen((char *)pattern);
 310     if (m == 0)
 311         return numberOfAllTexts;
 312
 313     TextPosition sp = 0, ep = 0;
 314     // Search with end-marker
 315     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
 316
 317     return count;
 318 }
 319
 320 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
 321 {
 322     TextPosition m = strlen((char *)pattern);
 323     if (m == 0)
 324         return numberOfAllTexts;
 325
 326     TextPosition sp = 0, ep = 0;
 327     // Search with end-marker
 328     unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
 329
 330     return count;
 331 }
 332
 333 unsigned TCImplementation::CountEqual(uchar const *pattern) const
 334 {
 335     TextPosition m = strlen((char const *)pattern);
 336     if (m == 0)
 337         return numberOfAllTexts - numberOfTexts; // Empty texts.
 338
 339     TextPosition sp = 0, ep = 0;
 340     // Match including end-marker
 341     Search(pattern, m+1, &sp, &ep);
 342
 343     // Count end-markers in result interval
 344     return CountEndmarkers(sp, ep);
 345 }
 346
 347 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
 348 {
 349     TextPosition m = strlen((char const *)pattern);
 350     if (m == 0)
 351         return numberOfAllTexts - numberOfTexts; // Empty texts.
 352
 353     TextPosition sp = 0, ep = 0;
 354     // Match including end-marker
 355     Search(pattern, m+1, &sp, &ep, begin, end);
 356
 357     // Count end-markers in result interval
 358     return CountEndmarkers(sp, ep); // Already within [begin, end]
 359 }
 360
 361 unsigned TCImplementation::CountContains(uchar const * pattern) const
 362 {
 363     TextPosition m = strlen((char const *)pattern);
 364     if (m == 0)
 365         return numberOfAllTexts; // Total number of texts.
 366
 367     // Here counting is as slow as fetching the result set
 368     // because we have to filter out occ's that fall within same document.
 369     TextCollection::document_result result = Contains(pattern);
 370     return result.size();
 371 }
 372
 373 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
 374 {
 375     TextPosition m = strlen((char const *)pattern);
 376     if (m == 0)
 377         return numberOfAllTexts; // Total number of texts.
 378
 379     // Here counting is as slow as fetching the result set
 380     // because we have to filter out occ's that fall within same document.
 381     TextCollection::document_result result = Contains(pattern, begin, end);
 382     return result.size();
 383 }
 384
 385 // Less than or equal
 386 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
 387 {
 388     TextPosition m = strlen((char const *)pattern);
 389     if (m == 0)
 390         return numberOfAllTexts - numberOfTexts; // Empty texts.
 391
 392     TextPosition sp = 0, ep = 0;
 393     SearchLessThan(pattern, m, &sp, &ep);
 394
 395     // Count end-markers in result interval
 396     return CountEndmarkers(sp, ep);
 397 }
 398
 399 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
 400 {
 401     TextPosition m = strlen((char const *)pattern);
 402     if (m == 0)
 403         return numberOfAllTexts - numberOfTexts; // Empty texts.
 404
 405     TextPosition sp = 0, ep = 0;
 406     SearchLessThan(pattern, m, &sp, &ep);
 407
 408     // Count end-markers in result interval
 409     return CountEndmarkers(sp, ep, begin, end);
 410 }
 411
 412 /**
 413  * Document reporting queries
 414  */
 415 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
 416 {
 417     TextPosition m = strlen((char *)pattern);
 418     if (m == 0)
 419         return TextCollection::document_result(); // FIXME Should return all 1...k
 420
 421     TextPosition sp = 0, ep = 0;
 422     Search(pattern, m, &sp, &ep);
 423
 424     TextCollection::document_result result;
 425
 426     // Report end-markers in result interval
 427     unsigned resultSize = CountEndmarkers(sp, ep);
 428     if (resultSize == 0)
 429         return result;
 430
 431     result.reserve(resultSize); // Try to avoid reallocation.
 432
 433     // Iterate through end-markers in [sp,ep]:
 434     unsigned i = 0;
 435     if (sp > 0)
 436         i = alphabetrank->rank(0, sp - 1);
 437     while (resultSize)
 438     {
 439         // End-marker we found belongs to the "preceeding" doc in the collection
 440         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 441         // Map to doc ID:
 442         docId = emptyTextRank->select0(docId+1);
 443         result.push_back(docId);
 444
 445         -- resultSize;
 446         ++ i;
 447     }
 448
 449     return result;
 450 }
 451
 452 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
 453 {
 454     TextPosition m = strlen((char *)pattern);
 455     if (m == 0)
 456         return TextCollection::document_result(); // FIXME Should return all 1...k
 457
 458     TextPosition sp = 0, ep = 0;
 459     Search(pattern, m, &sp, &ep);
 460
 461     TextCollection::document_result result;
 462
 463     // Report end-markers in result interval
 464     unsigned resultSize = CountEndmarkers(sp, ep);
 465     if (resultSize == 0)
 466         return result;
 467
 468     result.reserve(resultSize); // Try to avoid reallocation.
 469
 470     // Iterate through end-markers in [sp,ep] and [begin, end]:
 471     unsigned i = 0;
 472     if (sp > 0)
 473         i = alphabetrank->rank(0, sp - 1);
 474     while (resultSize)
 475     {
 476         // End-marker we found belongs to the "preceeding" doc in the collection
 477         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 478         // Map to doc ID:
 479         docId = emptyTextRank->select0(docId+1);
 480         if (docId >= begin && docId <= end)
 481             result.push_back(docId);
 482
 483         -- resultSize;
 484         ++ i;
 485     }
 486
 487     return result;
 488 }
 489
 490 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
 491 {
 492     TextPosition m = strlen((char *)pattern);
 493     if (m == 0)
 494         return TextCollection::document_result(); // FIXME Should return all 1...k
 495
 496     TextPosition sp = 0, ep = 0;
 497     // Search with end-marker
 498     Search(pattern, m+1, &sp, &ep);
 499
 500     TextCollection::document_result result;
 501     result.reserve(ep-sp+1); // Try to avoid reallocation.
 502
 503     ulong sampled_rank_i = 0;
 504     // Check each occurrence
 505     for (; sp <= ep; ++sp)
 506     {
 507         TextPosition i = sp;
 508
 509         uchar c = alphabetrank->access(i);
 510         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 511         {
 512             i = C[c]+alphabetrank->rank(c,i)-1;
 513             c = alphabetrank->access(i);
 514         }
 515         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 516
 517         if (c == '\0')
 518         {
 519             // Rank among the end-markers in BWT
 520             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 521
 522             // End-marker that we found belongs to the "preceeding" doc in collection:
 523             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 524             // Map to doc ID:
 525             docId = emptyTextRank->select0(docId+1);
 526             result.push_back(docId);
 527         }
 528         else // Sampled position
 529         {
 530             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 531             // Map to doc ID:
 532             docId = emptyTextRank->select0(docId+1);
 533             result.push_back(docId);
 534         }
 535     }
 536
 537     return result;
 538 }
 539
 540 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
 541 {
 542     TextPosition m = strlen((char *)pattern);
 543     if (m == 0)
 544         return TextCollection::document_result(); // FIXME Should return all 1...k
 545
 546     TextPosition sp = 0, ep = 0;
 547     // Search with end-marker
 548     Search(pattern, m+1, &sp, &ep, begin, end);
 549
 550     TextCollection::document_result result;
 551     result.reserve(ep-sp+1); // Try to avoid reallocation.
 552
 553     ulong sampled_rank_i = 0;
 554     // Check each occurrence, already within [begin, end]
 555     for (; sp <= ep; ++sp)
 556     {
 557         TextPosition i = sp;
 558
 559         uchar c = alphabetrank->access(i);
 560         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 561         {
 562             i = C[c]+alphabetrank->rank(c,i)-1;
 563             c = alphabetrank->access(i);
 564         }
 565         // Assert: c == '\0'  OR  sampled->IsBitSet(i)
 566
 567         if (c == '\0')
 568         {
 569             // Rank among the end-markers in BWT
 570             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 571
 572             // End-marker that we found belongs to the "preceeding" doc in collection:
 573             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 574             // Map to doc ID:
 575             docId = emptyTextRank->select0(docId+1);
 576             result.push_back(docId);
 577         }
 578         else // Sampled position
 579         {
 580             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 581             // Map to doc ID:
 582             docId = emptyTextRank->select0(docId+1);
 583             result.push_back(docId);
 584         }
 585     }
 586
 587     return result;
 588 }
 589
 590
 591 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
 592 {
 593     TextPosition m = strlen((char const *)pattern);
 594     if (m == 0)
 595         return TextCollection::document_result(); // FIXME Should return all empty texts
 596
 597     TextPosition sp = 0, ep = 0;
 598     // Match including end-marker
 599     Search(pattern, m+1, &sp, &ep);
 600
 601     TextCollection::document_result result;
 602
 603     // Report end-markers in result interval
 604     unsigned resultSize = CountEndmarkers(sp, ep);
 605     if (resultSize == 0)
 606         return result;
 607
 608     result.reserve(resultSize); // Try to avoid reallocation.
 609
 610     unsigned i = 0;
 611     if (sp > 0)
 612         i = alphabetrank->rank(0, sp - 1);
 613     while (resultSize)
 614     {
 615         // End-marker we found belongs to the "previous" doc in the collection
 616         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 617         // Map to doc ID:
 618         docId = emptyTextRank->select0(docId+1);
 619         result.push_back(docId);
 620
 621         -- resultSize;
 622         ++ i;
 623     }
 624
 625     return result;
 626 }
 627
 628 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
 629 {
 630     TextPosition m = strlen((char const *)pattern);
 631     if (m == 0)
 632         return TextCollection::document_result(); // FIXME Should return all empty texts
 633
 634     TextPosition sp = 0, ep = 0;
 635     // Match including end-marker
 636     Search(pattern, m+1, &sp, &ep, begin, end);
 637
 638     TextCollection::document_result result;
 639
 640     // Report end-markers in result interval
 641     unsigned resultSize = CountEndmarkers(sp, ep);
 642     if (resultSize == 0)
 643         return result;
 644
 645     result.reserve(resultSize); // Try to avoid reallocation.
 646
 647     unsigned i = 0;
 648     if (sp > 0)
 649         i = alphabetrank->rank(0, sp - 1);
 650     while (resultSize)
 651     {
 652         // End-marker we found belongs to the "previous" doc in the collection
 653         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 654         // Map to doc ID:
 655         docId = emptyTextRank->select0(docId+1);
 656         result.push_back(docId); // already within [begin, end]
 657
 658         -- resultSize;
 659         ++ i;
 660     }
 661
 662     return result;
 663 }
 664
 665
 666 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
 667 {
 668     TextPosition m = strlen((char *)pattern);
 669     if (m == 0)
 670         return TextCollection::document_result();
 671
 672     TextPosition sp = 0, ep = 0;
 673     // Search all occurrences
 674     Search(pattern, m, &sp, &ep);
 675
 676     // We want unique document indentifiers, using std::set to collect them
 677     std::set<DocId> resultSet;
 678
 679     ulong sampled_rank_i = 0;
 680     // Check each occurrence
 681     for (; sp <= ep; ++sp)
 682     {
 683         TextPosition i = sp;
 684         uchar c = alphabetrank->access(i);
 685         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 686         {
 687             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 688             c = alphabetrank->access(i);
 689         }
 690         if (c == '\0')
 691         {
 692             // Rank among the end-markers in BWT
 693             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 694
 695             // End-marker that we found belongs to the "preceeding" doc in collection:
 696             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 697             resultSet.insert(docId);
 698         }
 699         else
 700         {
 701             DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 702             assert((unsigned)di < numberOfTexts);
 703             resultSet.insert(di);
 704         }
 705     }
 706
 707     // Convert std::set to std::vector
 708     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 709     // Map to doc ID's
 710     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 711         *it = emptyTextRank->select0(*it+1);
 712     return result;
 713 }
 714
 715 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
 716 {
 717     TextPosition m = strlen((char *)pattern);
 718     if (m == 0)
 719         return TextCollection::document_result();
 720
 721     TextPosition sp = 0, ep = 0;
 722     // Search all occurrences
 723     Search(pattern, m, &sp, &ep);
 724
 725     // We want unique document indentifiers, using std::set to collect them
 726     std::set<DocId> resultSet;
 727
 728     ulong sampled_rank_i = 0;
 729     // Check each occurrence
 730     for (; sp <= ep; ++sp)
 731     {
 732         TextPosition i = sp;
 733         uchar c = alphabetrank->access(i);
 734         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 735         {
 736             i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
 737             c = alphabetrank->access(i);
 738         }
 739         if (c == '\0')
 740         {
 741             // Rank among the end-markers in BWT
 742             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 743
 744             // End-marker that we found belongs to the "preceeding" doc in collection:
 745             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 746             if (docId >= begin && docId <= end)
 747                 resultSet.insert(docId);
 748         }
 749         else
 750         {
 751             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 752             assert((unsigned)docId < numberOfTexts);
 753             if (docId >= begin && docId <= end)
 754                 resultSet.insert(docId);
 755         }
 756     }
 757
 758     // Convert std::set to std::vector
 759     TextCollection::document_result result(resultSet.begin(), resultSet.end());
 760     // Map to doc ID's
 761     for (document_result::iterator it = result.begin(); it != result.end(); ++it)
 762         *it = emptyTextRank->select0(*it+1);
 763     return result;
 764 }
 765
 766 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
 767 {
 768     TextPosition m = strlen((char *)pattern);
 769     if (m == 0)
 770         return TextCollection::document_result(); // empty result set
 771
 772     TextPosition sp = 0, ep = 0;
 773     SearchLessThan(pattern, m, &sp, &ep);
 774
 775     TextCollection::document_result result;
 776
 777     // Report end-markers in result interval
 778     unsigned resultSize = CountEndmarkers(sp, ep);
 779     if (resultSize == 0)
 780         return result;
 781
 782     result.reserve(resultSize); // Try to avoid reallocation.
 783
 784     // Iterate through end-markers in [sp,ep]:
 785     unsigned i = 0;
 786     if (sp > 0)
 787         i = alphabetrank->rank(0, sp - 1);
 788     while (resultSize)
 789     {
 790         // End-marker we found belongs to the "preceeding" doc in the collection
 791         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 792         // Map to doc ID:
 793         docId = emptyTextRank->select0(docId+1);
 794         result.push_back(docId);
 795
 796         -- resultSize;
 797         ++ i;
 798     }
 799
 800     return result;
 801 }
 802
 803 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
 804 {
 805     TextPosition m = strlen((char *)pattern);
 806     if (m == 0)
 807         return TextCollection::document_result(); // empty result set
 808
 809     TextPosition sp = 0, ep = 0;
 810     SearchLessThan(pattern, m, &sp, &ep);
 811
 812     TextCollection::document_result result;
 813
 814     // Report end-markers in result interval
 815     unsigned resultSize = CountEndmarkers(sp, ep);
 816     if (resultSize == 0)
 817         return result;
 818
 819     result.reserve(resultSize); // Try to avoid reallocation.
 820
 821     // Iterate through end-markers in [sp,ep] and [begin, end]:
 822     unsigned i = 0;
 823     if (sp > 0)
 824         i = alphabetrank->rank(0, sp - 1);
 825     while (resultSize)
 826     {
 827         // End-marker we found belongs to the "preceeding" doc in the collection
 828         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
 829         // Map to doc ID:
 830         docId = emptyTextRank->select0(docId+1);
 831         if (docId >= begin && docId <= end)
 832             result.push_back(docId);
 833
 834         -- resultSize;
 835         ++ i;
 836     }
 837
 838     return result;
 839 }
 840
 841 /**
 842  * Full result set queries
 843  */
 844 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
 845 {
 846     TextPosition m = strlen((char *)pattern);
 847     if (m == 0)
 848         return full_result(); // FIXME Throw exception?
 849
 850     TextPosition sp = 0, ep = 0;
 851     // Search all occurrences
 852     Search(pattern, m, &sp, &ep);
 853
 854     full_result result;
 855     result.reserve(ep-sp+1); // Try to avoid reallocation.
 856
 857     ulong sampled_rank_i = 0;
 858     // Report each occurrence
 859     for (; sp <= ep; ++sp)
 860     {
 861         TextPosition i = sp;
 862         TextPosition dist = 0;
 863         uchar c = alphabetrank->access(i);
 864         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 865         {
 866             i = C[c]+alphabetrank->rank(c,i)-1;
 867             c = alphabetrank->access(i);
 868             ++ dist;
 869         }
 870         if (c == '\0')
 871         {
 872             // Rank among the end-markers in BWT
 873             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 874
 875             // End-marker that we found belongs to the "preceeding" doc in collection:
 876             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 877             // Map to doc ID:
 878             docId = emptyTextRank->select0(docId+1);
 879             result.push_back(make_pair(docId, dist));
 880         }
 881         else
 882         {
 883             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 884             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 885 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 886
 887             // Map to doc ID:
 888             docId = emptyTextRank->select0(docId+1);
 889             result.push_back(make_pair(docId, textPos));
 890         }
 891     }
 892
 893     return result;
 894 }
 895
 896 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
 897 {
 898     TextPosition m = strlen((char *)pattern);
 899     if (m == 0)
 900         return full_result(); // FIXME Throw exception?
 901
 902     TextPosition sp = 0, ep = 0;
 903     // Search all occurrences
 904     Search(pattern, m, &sp, &ep);
 905
 906     full_result result;
 907     result.reserve(ep-sp+1); // Try to avoid reallocation.
 908
 909     ulong sampled_rank_i = 0;
 910     // Report each occurrence
 911     for (; sp <= ep; ++sp)
 912     {
 913         TextPosition i = sp;
 914         TextPosition dist = 0;
 915         uchar c = alphabetrank->access(i);
 916         while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
 917         {
 918             i = C[c]+alphabetrank->rank(c,i)-1;
 919             c = alphabetrank->access(i);
 920             ++ dist;
 921         }
 922         if (c == '\0')
 923         {
 924             // Rank among the end-markers in BWT
 925             unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
 926
 927             // End-marker that we found belongs to the "preceeding" doc in collection:
 928             DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
 929             // Map to doc ID:
 930             docId = emptyTextRank->select0(docId+1);
 931             if (docId >= begin && docId <= end)
 932                 result.push_back(make_pair(docId, dist));
 933         }
 934         else
 935         {
 936             TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
 937             DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
 938 //            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
 939
 940             // Map to doc ID:
 941             docId = emptyTextRank->select0(docId+1);
 942             if (docId >= begin && docId <= end)
 943                 result.push_back(make_pair(docId, textPos));
 944         }
 945     }
 946
 947     return result;
 948 }
 949
 950
 951 /**
 952  * Save index to a file handle
 953  *
 954  * Throws a std::runtime_error exception on i/o error.
 955  * First byte that is saved represents the version number of the save file.
 956  * In version 2 files, the data fields are:
 957  *  uchar versionFlag;
 958     TextPosition n;
 959     unsigned samplerate;
 960     unsigned C[256];
 961     TextPosition bwtEndPos;
 962     static_sequence * alphabetrank;
 963     BSGAP *sampled;
 964     BlockArray *suffixes;
 965     BlockArray *suffixDocId;
 966     unsigned numberOfTexts;
 967     unsigned numberOfAllTexts;
 968     ulong maxTextLength;
 969     BlockArray *endmarkerDocId;
 970     BSGAP *emptyTextRank;
 971  */
 972 void TCImplementation::Save(FILE *file) const
 973 {
 974     // Saving version info:
 975     if (std::fwrite(&versionFlag, 1, 1, file) != 1)
 976         throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
 977
 978     if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
 979         throw std::runtime_error("TCImplementation::Save(): file write error (n).");
 980     if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
 981         throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
 982
 983     for(ulong i = 0; i < 256; ++i)
 984         if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
 985             throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
 986
 987     if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
 988         throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
 989
 990     alphabetrank->save(file);
 991     sampled->Save(file);
 992     suffixes->Save(file);
 993     suffixDocId->Save(file);
 994
 995     if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
 996         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
 997     if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
 998         throw std::runtime_error("TCImplementation::Save(): file write error (numberOfAllTexts).");
 999     if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1000         throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
1001
1002     endmarkerDocId->Save(file);
1003     emptyTextRank->Save(file);
1004     fflush(file);
1005 }
1006
1007
1008 /**
1009  * Load index from a file handle
1010  *
1011  * Throws a std::runtime_error exception on i/o error.
1012  * For more info, see TCImplementation::Save().
1013  */
1014 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
1015     : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
1016       suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
1017       emptyTextRank(0)
1018 {
1019     uchar verFlag = 0;
1020     if (std::fread(&verFlag, 1, 1, file) != 1)
1021         throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
1022     if (verFlag != TCImplementation::versionFlag)
1023         throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
1024
1025     if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1026         throw std::runtime_error("TCImplementation::Load(): file read error (n).");
1027     if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1028         throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
1029 // FIXME samplerate can not be changed during load.
1030 //    if (this->samplerate == 0)
1031 //        this->samplerate = samplerate;
1032
1033     for(ulong i = 0; i < 256; ++i)
1034         if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1035             throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
1036
1037     if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1038         throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
1039
1040     alphabetrank = static_sequence::load(file);
1041     sampled = new BSGAP(file);
1042     suffixes = new BlockArray(file);
1043     suffixDocId = new BlockArray(file);
1044
1045     if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1046         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
1047     if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1048         throw std::runtime_error("TCImplementation::Load(): file read error (numberOfAllTexts).");
1049     if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1050         throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
1051
1052     endmarkerDocId = new BlockArray(file);
1053     emptyTextRank = new BSGAP(file);
1054
1055     // FIXME Construct data structures with new samplerate
1056     //maketables();
1057 }
1058
1059
1060
1061 /**
1062  * Rest of the functions follow...
1063  */
1064
1065
1066 // FIXME Use 2D-search structure
1067 unsigned TCImplementation::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1068 {
1069     if (sp > ep)
1070         return 0;
1071
1072     ulong ranksp = 0;
1073     if (sp != 0)
1074         ranksp = alphabetrank->rank(0, sp - 1);
1075
1076     unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1077     if (resultSize == 0)
1078         return 0;
1079
1080     // Count end-markers in result interval and within [begin, end]
1081     unsigned count = 0;
1082     unsigned i = ranksp;
1083     while (resultSize)
1084     {
1085         // End-marker we found belongs to the "previous" doc in the collection
1086         DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1087         // Map to doc ID:
1088         docId = emptyTextRank->select0(docId+1);
1089         if (docId >= begin && docId <= end)
1090             ++ count;
1091
1092         -- resultSize;
1093         ++ i;
1094     }
1095     return count;
1096 }
1097
1098 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1099 {
1100     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1101     int c = (int)pattern[m-1];
1102     TextPosition i=m-1;
1103     TextPosition sp = C[c];
1104     TextPosition ep = C[c+1]-1;
1105     while (sp<=ep && i>=1)
1106     {
1107 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1108         c = (int)pattern[--i];
1109         sp = C[c]+alphabetrank->rank(c,sp-1);
1110         ep = C[c]+alphabetrank->rank(c,ep)-1;
1111     }
1112     *spResult = sp;
1113     *epResult = ep;
1114     if (sp<=ep)
1115         return ep - sp + 1;
1116     else
1117         return 0;
1118 }
1119
1120 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1121 {
1122     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1123     int c = (int)pattern[m-1];
1124     assert(c == 0); // Start from endmarkers
1125     TextPosition i=m-1;
1126     TextPosition sp = begin;
1127     TextPosition ep = end;
1128     while (sp<=ep && i>=1)
1129     {
1130 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1131         c = (int)pattern[--i];
1132         sp = C[c]+alphabetrank->rank(c,sp-1);
1133         ep = C[c]+alphabetrank->rank(c,ep)-1;
1134     }
1135     *spResult = sp;
1136     *epResult = ep;
1137     if (sp<=ep)
1138         return ep - sp + 1;
1139     else
1140         return 0;
1141 }
1142
1143
1144 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1145 {
1146     // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1147     uint c = (int)pattern[m-1];
1148     TextPosition i=m-1;
1149     TextPosition sp = 1;
1150     TextPosition ep = C[c+1]-1;
1151     while (sp<=ep && i>=1)
1152     {
1153 //         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1154         c = (int)pattern[--i];
1155         uint result = alphabetrank->rank(c,ep);
1156         if (result == ~0u)
1157             ep = 0;
1158         else
1159             ep = C[c]+result-1;
1160     }
1161     *spResult = sp;
1162     *epResult = ep;
1163     if (sp<=ep)
1164         return ep - sp + 1;
1165     else
1166         return 0;
1167 }
1168
1169
1170 TCImplementation::~TCImplementation() {
1171 #ifdef CSA_TEST_BWT
1172     delete dynFMI;
1173 #endif
1174     delete alphabetrank;
1175     delete sampled;
1176     delete suffixes;
1177     delete suffixDocId;
1178
1179     delete endmarkerDocId;
1180     delete emptyTextRank;
1181 }
1182
1183 void TCImplementation::makewavelet(uchar *bwt)
1184 {
1185     ulong i, min = 0,
1186              max;
1187     for (i=0;i<256;i++)
1188         C[i]=0;
1189     for (i=0;i<n;++i)
1190         C[(int)bwt[i]]++;
1191     for (i=0;i<256;i++)
1192         if (C[i]>0) {min = i; break;}
1193     for (i=255;i>=min;--i)
1194         if (C[i]>0) {max = i; break;}
1195
1196     ulong prev=C[0], temp;
1197     C[0]=0;
1198     for (i=1;i<256;i++) {
1199         temp = C[i];
1200         C[i]=C[i-1]+prev;
1201         prev = temp;
1202     }
1203 //    this->codetable = node::makecodetable(bwt,n);
1204 //    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1205 //    delete [] bwt;
1206     //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1207 //  std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1208 //    std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1209
1210 //    HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1211
1212     alphabet_mapper * am = new alphabet_mapper_none();
1213     static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1214 //    static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1215     wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1216     alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1217     delete bmb;
1218     bwt = 0; // already deleted
1219
1220 //    std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1221 //    std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1222 }
1223
1224 void TCImplementation::maketables()
1225 {
1226     // Calculate BWT end-marker position (of last inserted text)
1227     {
1228         ulong i = 0;
1229         uint alphabetrank_i_tmp = 0;
1230         uchar c  = alphabetrank->access(i, alphabetrank_i_tmp);
1231         while (c != '\0')
1232         {
1233             i = C[c]+alphabetrank_i_tmp-1;
1234             c = alphabetrank->access(i, alphabetrank_i_tmp);
1235         }
1236
1237         this->bwtEndPos = i;
1238     }
1239
1240     // Build up arrays for text length and starting positions
1241     // FIXME Temp, remove
1242     //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1243     BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1244     //(*textLength)[0] = l;
1245     (*textStartPos)[0] = 0;
1246
1247     // Construct samples
1248     ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1249     unsigned ceilLog2n = Tools::CeilLog2(n);
1250     BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1251     BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1252
1253     // Mapping from end-markers to doc ID's:
1254     endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1255
1256     ulong *sampledpositions = new ulong[n/W+1];
1257     for (ulong i=0;i<n/W+1;i++)
1258         sampledpositions[i]=0lu;
1259
1260     ulong x,p=bwtEndPos;
1261     ulong sampleCount = 0;
1262     // Keeping track of text position of prev. end-marker seen
1263     ulong posOfSuccEndmarker = n;
1264     DocId textId = numberOfTexts;
1265     ulong ulongmax = 0;
1266     ulongmax--;
1267     uint alphabetrank_i_tmp =0;
1268
1269     //positions:
1270     for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1271       // i substitutes SA->GetPos(i)
1272         x=(i==n-1)?0:i+1;
1273
1274         if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1275             Tools::SetField(sampledpositions,1,p,1);
1276             (*positions)[sampleCount] = p;
1277             (*tmpSuffix)[sampleCount] = x; // FIXME remove
1278             sampleCount ++;
1279         }
1280
1281         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1282         if (c == '\0')
1283         {
1284             --textId;
1285
1286             // Record the order of end-markers in BWT:
1287             ulong endmarkerRank = alphabetrank_i_tmp - 1;
1288             (*endmarkerDocId)[endmarkerRank] = textId;
1289
1290             // Store text length and text start position:
1291             if (textId < (DocId)numberOfTexts - 1)
1292             {
1293                 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1294                 (*textStartPos)[textId + 1] = x;  // x is the position of end-marker.
1295                 posOfSuccEndmarker = x;
1296             }
1297
1298             // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1299             p = textId; // Correct LF-mapping to the last char of the previous text.
1300         }
1301         else
1302             p = C[c]+alphabetrank_i_tmp-1;
1303     }
1304     assert(textId == 0);
1305
1306     sampled = new BSGAP(sampledpositions,n,true);
1307     sampleLength = sampled->rank(n-1);
1308     assert(sampleCount == sampleLength);
1309
1310     // Suffixes store an offset from the text start position
1311     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1312     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1313
1314     for(ulong i=0; i<sampleLength; i++) {
1315         assert((*positions)[i] < n);
1316         ulong j = sampled->rank((*positions)[i]);
1317         if (j==0) j=sampleLength;
1318         TextPosition textPos = (*tmpSuffix)[i];
1319         (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1320
1321         assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1322         assert((*suffixDocId)[j-1] < numberOfTexts);
1323         // calculate offset from text start:
1324         (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1325     }
1326     // FIXME Temp, remove
1327     delete tmpSuffix;
1328     delete positions;
1329 //    delete textLength;
1330     delete textStartPos;
1331 }
1332
1333
1334 /**
1335  * Finds document identifier for given text position
1336  *
1337  * Starting text position of the document is stored into second parameter.
1338  * Binary searching on text starting positions.
1339  */
1340 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1341 {
1342     assert(i < n);
1343
1344     DocId a = 0;
1345     DocId b = numberOfTexts - 1;
1346     while (a < b)
1347     {
1348         DocId c = a + (b - a)/2;
1349         if ((*textStartPos)[c] > i)
1350             b = c - 1;
1351         else if ((*textStartPos)[c+1] > i)
1352             return c;
1353         else
1354             a = c + 1;
1355     }
1356
1357     assert(a < (DocId)numberOfTexts);
1358     assert(i >= (*textStartPos)[a]);
1359     assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1360     return a;
1361 }
1362
1363
1364 } // namespace SXSI
1365