Added RLCSA index option
[SXSI/TextCollection.git] / incbwt / rlcsa.cpp
index 8e3d2d9..47d2661 100644 (file)
@@ -1,12 +1,20 @@
 #include <algorithm>
 #include <cstdlib>
-#include <ctime>
+#include <cstring>
 #include <fstream>
 #include <iostream>
+#include <list>
 
 #include "rlcsa.h"
 #include "misc/utils.h"
 #include "qsufsort/qsufsort.h"
+#include "bits/vectors.h"
+
+
+#ifdef MULTITHREAD_SUPPORT
+#include <omp.h>
+#endif
+
 
 
 namespace CSA
@@ -32,11 +40,11 @@ RLCSA::RLCSA(const std::string& base_name, bool print) :
   array_file.read((char*)distribution, CHARS * sizeof(usint));
   this->buildCharIndexes(distribution);
 
-  Parameters parameters;
-  parameters.read(base_name + PARAMETERS_EXTENSION);
+//  Parameters parameters;
+//  parameters.read(base_name + PARAMETERS_EXTENSION);
   for(usint c = 0; c < CHARS; c++)
   {
-    if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); }
+    if(!isEmpty(this->index_ranges[c])) { this->array[c] = new PsiVector(array_file); }
   }
 
   this->end_points = new DeltaVector(array_file);
@@ -45,8 +53,8 @@ RLCSA::RLCSA(const std::string& base_name, bool print) :
   array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate));
   array_file.close();
 
-  this->support_locate = parameters.get(SUPPORT_LOCATE);
-  this->support_display = parameters.get(SUPPORT_DISPLAY);
+  this->support_locate = true;// parameters.get(SUPPORT_LOCATE);
+  this->support_display = true; //parameters.get(SUPPORT_DISPLAY);
 
   if(this->support_locate || this->support_display)
   {
@@ -61,7 +69,7 @@ RLCSA::RLCSA(const std::string& base_name, bool print) :
     sa_sample_file.close();
   }
 
-  if(print) { parameters.print(); }
+//  if(print) { parameters.print(); }
 
   this->ok = true;
 }
@@ -165,10 +173,10 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b
 
 
   // Build suffix array.
-  sint* inverse = new sint[bytes + 1];
+  int* inverse = new int[bytes + 1];
   if(multiple_sequences)
   {
-    sint zeros = 0;
+    int zeros = 0;
     for(usint i = 0; i < bytes; i++)
     {
       if(data[i] == 0)
@@ -178,35 +186,29 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b
       }
       else
       {
-        inverse[i] = (sint)data[i] + this->number_of_sequences;
+        inverse[i] = (int)(data[i] + this->number_of_sequences);
       }
     }
   }
   else
   {
-    for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; }
+    for(usint i = 0; i < bytes; i++) { inverse[i] = (int)data[i]; }
   }
   if(delete_data) { delete[] data; }
-  sint* sa = new sint[bytes + 1];
+  int* sa = new int[bytes + 1];
   suffixsort(inverse, sa, bytes, high + 1, low);
 
 
   // Sample SA.
-  usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1);  // No e_of_s markers in SA.
   if(this->support_locate || this->support_display)
   {
     if(multiple_sequences)
     {
-      std::cout << "We shouldn't be here!" << std::endl;
-      // Real SA starts at sa + incr.
-      // for each sequence
-      //   find starting position
-      //   sample relative to starting position
-      // sort samples to SA order
+      this->sa_samples = new SASamples((uint*)inverse, this->end_points, this->data_size, this->sample_rate);
     }
     else
     {
-      this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate);
+      this->sa_samples = new SASamples((uint*)(sa + 1), this->data_size, this->sample_rate);
     }
   }
 
@@ -220,14 +222,15 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b
 
 
   // Build RLCSA.
+  usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1);  // No e_of_s markers in SA.
   usint decr = (multiple_sequences ? 1 : 0);  // Ignore the last e_of_s marker if multiple sequences.
   for(usint c = 0; c < CHARS; c++)
   {
     if(distribution[c] == 0) { this->array[c] = 0; continue; }
 
-    usint* curr = (usint*)(sa + index_ranges[c].first + incr);
-    usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1);
-    RLEEncoder encoder(block_size);
+    uint* curr = (uint*)(sa + index_ranges[c].first + incr);
+    uint* limit = (uint*)(sa + index_ranges[c].second + incr + 1);
+    PsiVector::Encoder encoder(block_size);
     pair_type run(*curr, 1);
     curr++;
 
@@ -242,7 +245,7 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b
     }
     encoder.setRun(run.first - decr, run.second);
 
-    this->array[c] = new RLEVector(encoder, this->data_size + incr - decr);
+    this->array[c] = new PsiVector(encoder, this->data_size + incr - decr);
   }
   delete[] sa;
 
@@ -250,7 +253,7 @@ RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, b
   this->ok = true;
 }
 
-RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) :
+RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size, usint threads) :
   ok(false),
   sa_samples(0),
   end_points(0)
@@ -269,7 +272,6 @@ RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size)
   if(index.sample_rate != increment.sample_rate)
   {
     std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl;
-    std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl;
     return;
   }
 
@@ -291,61 +293,40 @@ RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size)
   }
   this->buildCharIndexes(distribution);
   this->sample_rate = index.sample_rate;
-
-
-  // Combine end points of sequences.
   this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences;
-  DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
-
-  endings->setBit(index.end_points->select(0));
-  for(usint i = 1; i < index.number_of_sequences; i++)
-  {
-    endings->setBit(index.end_points->selectNext());
-  }
-  usint sum = index.end_points->getSize();
-  delete index.end_points; index.end_points = 0;
-
-  endings->setBit(sum + increment.end_points->select(0));
-  for(usint i = 1; i < increment.number_of_sequences; i++)
-  {
-    endings->setBit(sum + increment.end_points->selectNext());
-  }
-  sum += increment.end_points->getSize();
-  delete increment.end_points; increment.end_points = 0;
-
-  this->end_points = new DeltaVector(*endings, sum);
-  delete endings;
-
 
-  // Combine Psi.
+  // Merge end points, SA samples, and Psi.
   usint psi_size = this->data_size + this->number_of_sequences;
-  for(usint c = 0; c < CHARS; c++)
-  {
-    if(distribution[c] == 0) { this->array[c] = 0; continue; }
-    this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
-    index.array[c] = 0;
-    increment.array[c] = 0;
-
-    if(this->array[c] == 0)
-    {
-      std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
-      return;
-    }
-  }
-
+  bool should_be_ok = true;
 
-  // Combine suffix array samples.
-  if(this->support_locate || this->support_display)
+  #ifdef MULTITHREAD_SUPPORT
+  omp_set_num_threads(threads);
+  #pragma omp parallel
   {
-    positions += increment.number_of_sequences;
-    for(usint i = 0; i < increment.data_size; i++)
+    #pragma omp for schedule(dynamic, 1)
+  #endif
+    for(int c = -2; c < (int)CHARS; c++)
     {
-      positions[i] -= this->number_of_sequences;
+      if(c == -2)      { this->mergeEndPoints(index, increment); }
+      else if(c == -1) { this->mergeSamples(index, increment, positions);  }
+      else if(distribution[c] != 0)
+      {
+        this->array[c] = mergeVectors<PsiVector, PsiVector::Encoder, PsiVector::Iterator>(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size);
+        index.array[c] = 0;
+        increment.array[c] = 0;
+
+        if(this->array[c] == 0)
+        {
+          std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl;
+          should_be_ok = false;
+        }
+      }
     }
-    this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size);
+  #ifdef MULTITHREAD_SUPPORT
   }
+  #endif
 
-  this->ok = true;
+  this->ok = should_be_ok;
 }
 
 RLCSA::~RLCSA()
@@ -355,8 +336,10 @@ RLCSA::~RLCSA()
   delete this->end_points;
 }
 
+//--------------------------------------------------------------------------
+
 void
-RLCSA::writeTo(const std::string& base_name)
+RLCSA::writeTo(const std::string& base_name) const
 {
   std::string array_name = base_name + ARRAY_EXTENSION;
   std::ofstream array_file(array_name.c_str(), std::ios_base::binary);
@@ -400,7 +383,7 @@ RLCSA::writeTo(const std::string& base_name)
 }
 
 bool
-RLCSA::isOk()
+RLCSA::isOk() const
 {
   return this->ok;
 }
@@ -408,24 +391,21 @@ RLCSA::isOk()
 //--------------------------------------------------------------------------
 
 pair_type
-RLCSA::count(const std::string& pattern)
+RLCSA::count(const std::string& pattern) const
 {
   if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); }
 
-  pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())];
-  if(isEmpty(index_range)) { return index_range; }
+  std::string::const_reverse_iterator iter = pattern.rbegin();
+
+  pair_type index_range = this->index_ranges[(uchar)*iter];
+  if(isEmpty(index_range)) { return EMPTY_PAIR; }
   index_range.first += this->number_of_sequences;
   index_range.second += this->number_of_sequences;
 
-  for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++)
+  for(++iter; iter != pattern.rend(); ++iter)
   {
-    RLEVector* vector = this->array[(usint)*iter];
-    usint start = this->index_ranges[(usint)*iter].first;
-
-    index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences;
-    index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences;
-
-    if(isEmpty(index_range)) { return index_range; }
+    index_range = this->backwardSearchStep(index_range, (uchar)*iter);
+    if(isEmpty(index_range)) { return EMPTY_PAIR; }
   }
 
   // Suffix array indexes are 0-based.
@@ -435,20 +415,23 @@ RLCSA::count(const std::string& pattern)
   return index_range;
 }
 
+//--------------------------------------------------------------------------
+
 void
-RLCSA::reportPositions(uchar* data, usint length, usint* positions)
+RLCSA::reportPositions(uchar* data, usint length, usint* positions) const
 {
   if(data == 0 || length == 0 || positions == 0) { return; }
 
+  PsiVector::Iterator** iters = this->getIterators();
+
   usint current = this->number_of_sequences - 1;
   positions[length] = current; // "immediately after current"
   for(sint i = (sint)(length - 1); i >= 0; i--)
   {
-//     positions[i] = current; // "immediately after current"
     usint c = (usint)data[i];
-    if(array[c])
+    if(this->array[c] != 0)
     {
-      current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences;
+      current = this->LF(current, c, *(iters[c]));
     }
     else
     {
@@ -463,24 +446,25 @@ RLCSA::reportPositions(uchar* data, usint length, usint* positions)
     }
     positions[i] = current; // "immediately after current"
   }
+
+  this->deleteIterators(iters);
 }
 
 //--------------------------------------------------------------------------
 
-LocateItem*
-RLCSA::locate(pair_type range)
+usint*
+RLCSA::locate(pair_type range) const
 {
   if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; }
 
-  LocateItem* data = new LocateItem[range.second + 1 - range.first];
-  if(!data) { return 0; }
+  usint* data = new usint[length(range)];
   this->locateUnsafe(range, data);
 
   return data;
 }
 
-LocateItem*
-RLCSA::locate(pair_type range, LocateItem* data)
+usint*
+RLCSA::locate(pair_type range, usint* data) const
 {
   if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; }
   this->locateUnsafe(range, data);
@@ -488,14 +472,19 @@ RLCSA::locate(pair_type range, LocateItem* data)
 }
 
 void
-RLCSA::locateUnsafe(pair_type range, LocateItem* data)
+RLCSA::locateUnsafe(pair_type range, usint* data) const
 {
-  usint items = range.second + 1 - range.first;
+  usint items = length(range);
+  usint* offsets = new usint[items];
+  bool* finished = new bool[items];  // FIXME This could be more space efficient...
+
+  PsiVector::Iterator** iters = this->getIterators();
+
   for(usint i = 0, j = range.first; i < items; i++, j++)
   {
-    data[i].value = j + this->number_of_sequences;
-    data[i].offset = 0;
-    data[i].found = false;
+    data[i] = j + this->number_of_sequences;
+    offsets[i] = 0;
+    finished[i] = false;
   }
 
   bool found = false;
@@ -505,7 +494,7 @@ RLCSA::locateUnsafe(pair_type range, LocateItem* data)
     pair_type run = EMPTY_PAIR;
     for(usint i = 0; i < items; i++)
     {
-      if(data[i].found)
+      if(finished[i])
       {
         continue; // The run might continue after this.
       }
@@ -513,22 +502,26 @@ RLCSA::locateUnsafe(pair_type range, LocateItem* data)
       {
         run = pair_type(i, i);
       }
-      else if(data[i].value - data[run.first].value == i - run.first)
+      else if(data[i] - data[run.first] == i - run.first)
       {
         run.second = i;
       }
       else
       {
-        found &= this->processRun(run, data);
+        found &= this->processRun(run, data, offsets, finished, iters);
         run = pair_type(i, i);
       }
     }
-    if(!isEmpty(run)) { found &= this->processRun(run, data); }
+    if(!isEmpty(run)) { found &= this->processRun(run, data, offsets, finished, iters); }
   }
+
+  this->deleteIterators(iters);
+  delete[] offsets;
+  delete[] finished;
 }
 
 bool
-RLCSA::processRun(pair_type run, LocateItem* data)
+RLCSA::processRun(pair_type run, usint* data, usint* offsets, bool* finished, PsiVector::Iterator** iters) const
 {
   bool found = true;
   usint run_start = 0, run_left = 0;
@@ -536,44 +529,45 @@ RLCSA::processRun(pair_type run, LocateItem* data)
 
   for(usint i = run.first; i <= run.second; i++)
   {
-    if(data[i].found)
+    if(finished[i])
     {
       if(run_left > 0) { run_left--; }
       continue;
     }
-    if(data[i].value < this->number_of_sequences) // Implicit sample here.
+    if(data[i] < this->number_of_sequences) // Implicit sample here.
     {
-      data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset;
-      data[i].found = true;
+      DeltaVector::Iterator iter(*(this->end_points));
+      data[i] = iter.select(data[i]) + 1 - offsets[i];
+      finished[i] = true;
       if(run_left > 0) { run_left--; }
       continue;
     }
-    if(next_sample.first < data[i].value) // Need another sample.
+    if(next_sample.first < data[i]) // Need another sample.
     {
-      next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences);
+      next_sample = this->sa_samples->getFirstSampleAfter(data[i] - this->number_of_sequences);
       next_sample.first += this->number_of_sequences;
     }
-    if(data[i].value < next_sample.first) // No sample found for current position.
+    if(data[i] < next_sample.first) // No sample found for current position.
     {
       if(run_left > 0)
       {
-        data[i].value = data[run_start].value + i - run_start;
+        data[i] = data[run_start] + i - run_start;
         run_left--;
       }
       else
       {
-        pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i);
-        data[i].value = value.first;
+        pair_type value = this->psi(data[i] - this->number_of_sequences, run.second - i, iters);
+        data[i] = value.first;
         run_left = value.second;
         run_start = i;
       }
-      data[i].offset++;
+      offsets[i]++;
       found = false;
     }
     else  // Sampled position found.
     {
-      data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset;
-      data[i].found = true;
+      data[i] = this->sa_samples->getSample(next_sample.second) - offsets[i];
+      finished[i] = true;
       if(run_left > 0) { run_left--; }
     }
   }
@@ -581,7 +575,7 @@ RLCSA::processRun(pair_type run, LocateItem* data)
 }
 
 usint
-RLCSA::locate(usint index)
+RLCSA::locate(usint index) const
 {
   if(!(this->support_locate) || index >= this->data_size) { return this->data_size; }
 
@@ -591,7 +585,8 @@ RLCSA::locate(usint index)
   {
     if(index < this->number_of_sequences) // Implicit sample here
     {
-      return this->end_points->select(index) + 1 - offset;
+      DeltaVector::Iterator iter(*(this->end_points));
+      return iter.select(index) + 1 - offset;
     }
     pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences);
     next_sample.first += this->number_of_sequences;
@@ -607,29 +602,43 @@ RLCSA::locate(usint index)
 //--------------------------------------------------------------------------
 
 uchar*
-RLCSA::display(usint sequence, pair_type range)
+RLCSA::display(usint sequence) const
+{
+  if(!(this->support_display)) { return 0; }
+
+  pair_type seq_range = this->getSequenceRange(sequence);
+  if(isEmpty(seq_range)) { return 0; }
+
+  uchar* data = new uchar[length(seq_range)+1];
+  this->displayUnsafe(seq_range, data);
+  data[length(seq_range)] = 0;
+
+  return data;
+}
+
+uchar*
+RLCSA::display(usint sequence, pair_type range) const
 {
   if(!(this->support_display) || isEmpty(range)) { return 0; }
 
-  pair_type seq_range = this->getSequence(sequence);
+  pair_type seq_range = this->getSequenceRange(sequence);
   if(isEmpty(seq_range)) { return 0; }
 
   range.first += seq_range.first; range.second += seq_range.first;
   if(range.second > seq_range.second) { return 0; }
 
-  uchar* data = new uchar[range.second + 1 - range.first];
-  if(!data) { return 0; }
+  uchar* data = new uchar[length(range)];
   this->displayUnsafe(range, data);
 
   return data;
 }
 
 uchar*
-RLCSA::display(usint sequence, pair_type range, uchar* data)
+RLCSA::display(usint sequence, pair_type range, uchar* data) const
 {
   if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; }
 
-  pair_type seq_range = this->getSequence(sequence);
+  pair_type seq_range = this->getSequenceRange(sequence);
   if(isEmpty(seq_range)) { return 0; }
 
   range.first += seq_range.first; range.second += seq_range.first;
@@ -639,143 +648,662 @@ RLCSA::display(usint sequence, pair_type range, uchar* data)
   return data;
 }
 
+uchar*
+RLCSA::display(usint position, usint len, usint context, usint& result_length) const
+{
+  if(!(this->support_display)) { return 0; }
+
+  pair_type range = this->getSequenceRangeForPosition(position);
+  if(isEmpty(range)) { return 0; }
+
+  range.first = position - std::min(context, position - range.first);
+  range.second = std::min(range.second, position + len + context - 1);
+  result_length = length(range);
+  if(isEmpty(range)) { return 0; }
+
+  uchar* data = new uchar[length(range)];
+  this->displayUnsafe(range, data);
+
+  return data;
+}
+
+usint
+RLCSA::displayPrefix(usint sequence, usint len, uchar* data) const
+{
+  if(!(this->support_display) || len == 0 || data == 0) { return 0; }
+
+  pair_type seq_range = this->getSequenceRange(sequence);
+  if(isEmpty(seq_range)) { return 0; }
+
+  pair_type range(seq_range.first, std::min(seq_range.second, seq_range.first + len - 1));
+
+  this->displayUnsafe(range, data);
+  return length(range);
+}
+
+usint
+RLCSA::displayFromPosition(usint index, usint max_len, uchar* data) const
+{
+  if(max_len == 0 || data == 0 || index >= this->data_size) { return 0; }
+
+  for(usint i = 0; i < max_len; i++)
+  {
+    data[i] = this->getCharacter(index);
+    index = this->psiUnsafe(index, data[i]);
+    if(index < this->number_of_sequences) { return i + 1; }
+    index -= this->number_of_sequences;
+  }
+
+  return max_len;
+}
+
 void
-RLCSA::displayUnsafe(pair_type range, uchar* data)
+RLCSA::displayUnsafe(pair_type range, uchar* data) const
 {
   usint i = range.first - range.first % this->sa_samples->getSampleRate();
 
   usint pos = this->sa_samples->inverseSA(i);
 
-  for(; i < range.first; i++)
+  if(length(range) >= 1024)
   {
-    pos = this->psi(pos) - this->number_of_sequences;
+    PsiVector::Iterator** iters = this->getIterators();
+    for(; i < range.first; i++)
+    {
+      pos = this->psi(pos, iters) - this->number_of_sequences;
+    }
+    for(; i <= range.second; i++)
+    {
+      usint c = this->getCharacter(pos);
+      data[i - range.first] = c;
+      pos = this->psiUnsafe(pos, c, *(iters[c])) - this->number_of_sequences;
+    }
+    this->deleteIterators(iters);
   }
-  for(; i <= range.second; i++)
+  else
   {
-    data[i - range.first] = this->getCharacter(pos);
-    pos = this->psi(pos) - this->number_of_sequences;
+    for(; i < range.first; i++)
+    {
+      pos = this->psi(pos) - this->number_of_sequences;
+    }
+    for(; i <= range.second; i++)
+    {
+      usint c = this->getCharacter(pos);
+      data[i - range.first] = c;
+      pos = this->psiUnsafe(pos, c) - this->number_of_sequences;
+    }
   }
 }
 
 //--------------------------------------------------------------------------
 
-void
-RLCSA::decompressInto(std::ofstream& psi_file)
+pair_type
+RLCSA::getSequenceRange(usint number) const
 {
-  for(usint c = 0; c < CHARS; c++)
+  if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
+
+  pair_type result;
+  DeltaVector::Iterator iter(*(this->end_points));
+  if(number == 0)
   {
-    if(!(this->array[c])) { continue; }
-    usint value = this->array[c]->select(0);
-    psi_file.write((char*)&(value), sizeof(value));
-    while(this->array[c]->hasNext())
-    {
-      value = this->array[c]->selectNext();
-      psi_file.write((char*)&value, sizeof(value));
-    }
+    result.first = 0;
+    result.second = iter.select(number);
+  }
+  else
+  {
+    result.first = nextMultipleOf(this->sample_rate, iter.select(number - 1));
+    result.second = iter.selectNext();
+  }
+
+  return result;
+}
+
+usint
+RLCSA::getSequenceForPosition(usint value) const
+{
+  if(value == 0) { return 0; }
+  DeltaVector::Iterator iter(*(this->end_points));
+  return iter.rank(value - 1);
+}
+
+pair_type
+RLCSA::getSequenceRangeForPosition(usint value) const
+{
+  return this->getSequenceRange(this->getSequenceForPosition(value));
+}
+
+usint*
+RLCSA::getSequenceForPosition(usint* values, usint len) const
+{
+  if(values == 0) { return 0; }
+
+  DeltaVector::Iterator iter(*(this->end_points));
+  for(usint i = 0; i < len; i++)
+  {
+    if(values[i] > 0) { values[i] = iter.rank(values[i] - 1); }
+  }
+
+  return values;
+}
+
+pair_type
+RLCSA::getRelativePosition(usint value) const
+{
+  DeltaVector::Iterator iter(*(this->end_points));
+  pair_type result(0, value);
+
+  if(value > 0) { result.first = iter.rank(value - 1); }
+  if(result.first > 0)
+  {
+    result.second -= nextMultipleOf(this->sample_rate, iter.select(result.first - 1));
   }
+
+  return result;
+}
+
+//--------------------------------------------------------------------------
+
+uchar*
+RLCSA::readBWT() const
+{
+  return this->readBWT(pair_type(0, this->data_size + this->number_of_sequences - 1));
 }
 
 uchar*
-RLCSA::readBWT()
+RLCSA::readBWT(pair_type range) const
 {
-  usint n = this->data_size + this->number_of_sequences;
+  if(isEmpty(range) || range.second >= this->data_size + this->number_of_sequences) { return 0; }
+
+  usint n = length(range);
 
   uchar* bwt = new uchar[n];
   memset(bwt, 0, n);
 
   for(usint c = 0; c < CHARS; c++)
   {
-    RLEVector* vector = this->array[c];
-    if(vector != 0)
+    if(this->array[c] != 0)
     {
-      bwt[vector->select(0)] = c;
-      while(vector->hasNext()) { bwt[vector->selectNext()] = c; }
+      PsiVector::Iterator iter(*(this->array[c]));
+      usint pos = iter.valueAfter(range.first).first;
+      while(pos <= range.second)
+      {
+        bwt[pos - range.first] = c;
+        pos = iter.selectNext();
+      }
     }
   }
 
   return bwt;
 }
 
+usint
+RLCSA::countRuns() const
+{
+  usint runs = 0;
+  for(usint c = 0; c < CHARS; c++)
+  {
+    if(this->array[c] != 0)
+    {
+      PsiVector::Iterator iter(*(this->array[c]));
+      runs += iter.countRuns();
+    }
+  }
+
+  return runs;
+}
+
 //--------------------------------------------------------------------------
 
-usint
-RLCSA::psi(usint index)
+PsiVector::Iterator**
+RLCSA::getIterators() const
 {
-  if(index >= this->data_size)
+  PsiVector::Iterator** iters = new PsiVector::Iterator*[CHARS];
+  for(usint i = 0; i < CHARS; i++)
   {
-    return this->data_size + this->number_of_sequences;
+    if(this->array[i] == 0) { iters[i] = 0; }
+    else                    { iters[i] = new PsiVector::Iterator(*(this->array[i])); }
   }
+  return iters;
+}
 
-  usint c = this->getCharacter(index);
-  return this->array[c]->select(index - this->index_ranges[c].first);
+void
+RLCSA::deleteIterators(PsiVector::Iterator** iters) const
+{
+  if(iters == 0) { return; }
+  for(usint i = 0; i < CHARS; i++) { delete iters[i]; }
+  delete[] iters;
 }
 
-pair_type
-RLCSA::psi(usint index, usint max_length)
+//--------------------------------------------------------------------------
+
+usint
+RLCSA::reportSize(bool print) const
 {
-  if(index >= this->data_size)
+  usint bytes = 0, temp = 0, bwt = 0;
+
+  for(usint c = 0; c < CHARS; c++)
+  {
+    if(this->array[c])
+    {
+      bytes += this->array[c]->reportSize();
+      bwt += this->array[c]->getCompressedSize();
+    }
+  }
+  bytes += sizeof(*this) + this->end_points->reportSize();
+  if(print)
   {
-    return pair_type(this->data_size + this->number_of_sequences, 0);
+    std::cout << "RLCSA:           " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
+    std::cout << "  BWT only:      " << (bwt / (double)MEGABYTE) << " MB" << std::endl;
   }
 
-  usint c = this->getCharacter(index);
-  return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length);
+  if(this->support_locate || this->support_display)
+  {
+    temp = this->sa_samples->reportSize();
+    if(print) { std::cout << "SA samples:      " << (temp / (double)MEGABYTE) << " MB" << std::endl; }
+    bytes += temp;
+  }
+
+  if(print)
+  {
+    std::cout << "Total size:      " << (bytes / (double)MEGABYTE) << " MB" << std::endl;
+    std::cout << std::endl;
+  }
+
+  return bytes;
+}
+
+void
+RLCSA::printInfo() const
+{
+  double megabytes = this->data_size / (double)MEGABYTE;
+
+  std::cout << "Sequences:       " << this->number_of_sequences << std::endl;
+  std::cout << "Original size:   " << megabytes << " MB" << std::endl;
+  std::cout << "Block size:      " << (this->getBlockSize() * sizeof(usint)) << " bytes" << std::endl;
+  if(this->support_locate || this->support_display)
+  {
+    std::cout << "Sample rate:     " << this->sample_rate << std::endl;
+  }
+  std::cout << std::endl;
 }
 
 //--------------------------------------------------------------------------
 
-pair_type
-RLCSA::getSequence(usint number)
+RLEVector*
+RLCSA::buildPLCP(usint block_size) const
 {
-  if(number >= this->number_of_sequences) { return EMPTY_PAIR; }
+  if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0)
+  {
+    std::cerr << "PLCP: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl;
+    return 0;
+  }
 
-  pair_type result;
-  if(number == 0)
+  PsiVector::Iterator** iters = this->getIterators();
+
+  RLEEncoder plcp(block_size);
+  std::list<pair_type> matches;
+  pair_type prev_range = EMPTY_PAIR;
+  for(usint j = 0; j < this->number_of_sequences; j++)
   {
-    result.first = 0;
-    result.second = this->end_points->select(number);
+    // Encode the padding as a single run.
+    pair_type seq_range = this->getSequenceRange(j);
+    if(j > 0 && prev_range.second + 1 < seq_range.first)
+    {
+      this->encodePLCPRun(plcp, prev_range.second + 1, seq_range.first, seq_range.first - prev_range.second - 2);
+    }
+    prev_range = seq_range;
+
+    usint maximal = seq_range.first;
+    usint x = this->sa_samples->inverseSA(seq_range.first), next_x;
+
+    // Invariant: x == inverseSA(i)
+    for(usint i = seq_range.first; i <= seq_range.second; i++, x = next_x)
+    {
+      usint c = this->getCharacter(x);
+
+      // T[i,n] is lexicographically the first suffix beginning with c.
+      if(x == this->index_ranges[c].first)
+      {
+        if(!matches.empty())
+        {
+          maximal = (*(matches.begin())).first;
+          matches.clear();
+        }
+        this->encodePLCPRun(plcp, maximal, i + 1, i - maximal);
+        maximal = i + 1;
+        next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences;
+        continue;
+      }
+
+      // Process the previous left matches we are still following.
+      usint low = this->index_ranges[c].first + this->number_of_sequences;
+      usint start, stop; start = stop = maximal;
+      for(std::list<pair_type>::iterator iter = matches.begin(); iter != matches.end();)
+      {
+        pair_type match = *iter;
+        if(match.second < low)  // These no longer match the current character.
+        {
+          if(match.first < start) { start = match.first; }  // Form a single run from then.
+          iter = matches.erase(iter);
+        }
+        else
+        {
+          if(match.first < stop) { stop = match.first; }  // End of the run to be encoded.
+          (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
+          ++iter;
+        }
+      }
+      if(start < stop) { this->encodePLCPRun(plcp, start, stop, i - start); }
+
+      // If PLCP[i] is minimal, we add the left match to the list.
+      // We add pairs of type (j, inverseSA(j) + n_of_s) as inverseSA is < 0 for end markers.
+      usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
+      next_x = this->psiUnsafeNext(c, *(iters[c])) - this->number_of_sequences;
+      if(next_y != next_x + this->number_of_sequences - 1)
+      {
+        matches.push_back(pair_type(maximal, next_y));
+        maximal = i + 1;
+      }
+    }
+
+    if(!matches.empty())
+    {
+      maximal = (*(matches.begin())).first;
+      matches.clear();
+    }
+    if(maximal <= seq_range.second)
+    {
+      this->encodePLCPRun(plcp, maximal, seq_range.second + 1, seq_range.second + 1 - maximal);
+    }
   }
-  else
+
+  this->deleteIterators(iters);
+
+  return new RLEVector(plcp, 2 * (this->end_points->getSize() + 1));
+}
+
+usint
+RLCSA::sampleLCP(usint sample_rate, pair_type*& sampled_values, bool report) const
+{
+  if(sample_rate == 0)
   {
-    if(this->sa_samples == 0) { return EMPTY_PAIR; }
-    usint d = this->sa_samples->getSampleRate();
-    result.first = d * ((this->end_points->select(number - 1) / d) + 1);
-    result.second = this->end_points->selectNext();
+    sample_rate = this->data_size + 1;
   }
 
-  return result;
+  PsiVector::Iterator** iters = this->getIterators();
+
+  usint runs = this->countRuns();
+  usint samples = 0, minimal_sum = 0, nonstrict_minimal_sum = 0, minimal_samples = 0;
+  usint max_samples = runs + (this->data_size - runs) / sample_rate;
+  sampled_values = new pair_type[max_samples];
+
+  std::list<Triple> matches;
+  for(usint j = 0; j < this->number_of_sequences; j++)
+  {
+    pair_type seq_range = this->getSequenceRange(j);
+    usint first_sample = samples; // First minimal sample of the current sequence.
+    usint minimal = seq_range.second + 1; // Last minimal position.
+    usint i, x = this->sa_samples->inverseSA(seq_range.first), next_x, prev_x = 0;
+
+    // Invariant: x == inverseSA(i)
+    for(i = seq_range.first; i <= seq_range.second; i++, x = next_x)
+    {
+      usint c = this->getCharacter(x);
+
+      // T[i,n] is lexicographically the first suffix beginning with c.
+      if(x == this->index_ranges[c].first)
+      {
+        if(!matches.empty())
+        {
+          for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
+          {
+            nonstrict_minimal_sum += i - (*iter).first;
+          }
+          matches.clear();
+        }
+
+        // Minimal sample: SA[x] = i, LCP[x] = 0
+        sampled_values[samples] = pair_type(x, 0); minimal = i;
+        samples++; minimal_samples++;
+        next_x = this->psiUnsafe(x, c, *(iters[c])) - this->number_of_sequences; prev_x = x;
+        continue;
+      }
+
+      // Process the previous left matches we are still following.
+      usint low = this->index_ranges[c].first + this->number_of_sequences;
+      pair_type potential_sample(this->data_size, this->data_size);
+      for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end();)
+      {
+        Triple match = *iter;
+        if(match.second < low)  // These no longer match the current character.
+        {
+          if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
+
+          // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
+          potential_sample = pair_type(match.third, i - match.first);
+          iter = matches.erase(iter);
+        }
+        else
+        {
+          (*iter).second = this->psiUnsafe(match.second - this->number_of_sequences, c, *(iters[c]));
+          ++iter;
+        }
+      }
+      if(potential_sample.first != this->data_size)
+      {
+        // Last potential sample is minimal.
+        sampled_values[samples] = potential_sample;
+        samples++; minimal_sum += potential_sample.second; minimal_samples++;
+      }
+
+      // If PLCP[i] is minimal, we add the left match to the list.
+      // We add triples of type (j, inverseSA(j) + n_of_s, x) as inverseSA is < 0 for end markers.
+      // x is the original minimal position.
+      usint next_y = this->psiUnsafe(x - 1, c, *(iters[c]));
+      next_x = this->psiUnsafeNext(c, *(iters[c]));
+      if(next_y != next_x - 1 || next_x < this->number_of_sequences)
+      {
+        matches.push_back(Triple(i, next_y, x)); minimal = i;
+      }
+      next_x -= this->number_of_sequences; prev_x = x;
+    }
+
+    // Are we still following something?
+    if(!matches.empty())
+    {
+      pair_type potential_sample(this->data_size, this->data_size);
+      for(std::list<Triple>::iterator iter = matches.begin(); iter != matches.end(); ++iter)
+      {
+        Triple match = *iter;
+        if(potential_sample.first != this->data_size) { nonstrict_minimal_sum += potential_sample.second; }
+
+        // Potential minimal sample: SA[match.third] = match.first, LCP[match.third] = i - match.first
+        potential_sample = pair_type(match.third, i - match.first);
+      }
+      matches.clear();
+      if(potential_sample.first != this->data_size)
+      {
+        // Last potential sample is minimal.
+        sampled_values[samples] = potential_sample;
+        samples++; minimal_sum += potential_sample.second; minimal_samples++;
+      }
+    }
+
+    // Add the non-minimal samples.
+    if(sample_rate <= this->data_size)
+    {
+      usint last_sample = samples - 1;
+      i = seq_range.first; x = this->sa_samples->inverseSA(seq_range.first);
+      for(usint current_sample = first_sample; current_sample <= last_sample; current_sample++)
+      {
+        // Find the next minimal sample and add nonminimal samples if needed.
+        pair_type first_nonminimal(i + sample_rate - 1, samples);
+        pair_type next_nonminimal = first_nonminimal;
+        while(x != sampled_values[current_sample].first)
+        {
+          if(i == next_nonminimal.first)
+          {
+            sampled_values[samples] = pair_type(x, 0); samples++;
+            next_nonminimal.first += sample_rate; next_nonminimal.second++;
+          }
+          i++; x = this->psi(x, iters) - this->number_of_sequences;
+        }
+
+        // Reduce the nonminimal samples to the current minimal sample.
+        for(next_nonminimal = first_nonminimal; next_nonminimal.second < samples; next_nonminimal.first += sample_rate, next_nonminimal.second++)
+        {
+          sampled_values[next_nonminimal.second].second = sampled_values[current_sample].second + i - next_nonminimal.first;
+        }
+        i++; x = this->psi(x, iters) - this->number_of_sequences;
+      }
+    }
+  }
+
+  std::sort(sampled_values, sampled_values + samples);
+
+  if(report)
+  {
+    std::cout << "Samples: " << samples << " (total) / " << minimal_samples << " (minimal)" << std::endl;
+    std::cout << "Upper bounds: " << max_samples << " (total) / " << runs << " (minimal)" << std::endl;
+    std::cout << "Sum of minimal samples: " << (minimal_sum + nonstrict_minimal_sum) << " (total) / " << minimal_sum << " (strict)" << std::endl;
+    std::cout << std::endl;
+  }
+
+  this->deleteIterators(iters);
+
+  return samples;
 }
 
 usint
-RLCSA::reportSize(bool print)
+RLCSA::lcp(usint index, const LCPSamples& lcp_samples) const
 {
-  usint bytes = 0, temp = 0;
+  if(index >= this->data_size) { return this->data_size; }
 
-  for(usint c = 0; c < CHARS; c++)
+  usint offset = 0;
+  index += this->number_of_sequences;
+  while(true)
   {
-    if(this->array[c]) { temp += this->array[c]->reportSize(); }
+    pair_type next_sample = lcp_samples.getFirstSampleAfter(index - this->number_of_sequences);
+    next_sample.first += this->number_of_sequences;
+    if(next_sample.first == index)
+    {
+      return lcp_samples.getSample(next_sample.second) + offset;
+    }
+    index = this->psi(index - this->number_of_sequences);
+    offset++;
   }
-  if(print) { std::cout << "Psi array:       " << temp << std::endl; }
-  bytes += temp;
+}
 
-  if(this->support_locate || this->support_display)
+usint
+RLCSA::lcp(usint index, const LCPSamples& lcp_samples, const RLEVector& plcp) const
+{
+  if(index >= this->data_size) { return this->data_size; }
+
+  usint offset = 0;
+  while(true)
   {
-    temp = this->sa_samples->reportSize();
-    if(print) { std::cout << "SA samples:      " << temp << std::endl; }
-    bytes += temp;
+    usint c = this->getCharacter(index);
+    PsiVector::Iterator iter(*(this->array[c]));
+
+    if(index == this->index_ranges[c].first)
+    {
+      return lcp_samples.getSampleAtPosition(index) + offset;
+    }
+    usint next_y = this->psiUnsafe(index - 1, c, iter);
+    usint next_x = this->psiUnsafeNext(c, iter);
+    if(next_y != next_x - 1 || next_x < this->number_of_sequences)
+    {
+      pair_type sample = lcp_samples.getFirstSampleAfter(index);
+      if(sample.first == index)
+      {
+        return lcp_samples.getSample(sample.second) + offset;
+      }
+    }
+
+    pair_type next_sample = this->sa_samples->getFirstSampleAfter(index);
+    if(next_sample.first == index)
+    {
+      index = this->sa_samples->getSample(next_sample.second) - offset;
+      RLEVector::Iterator plcp_iter(plcp);
+      return plcp_iter.select(index) - 2 * index;
+    }
+    index = next_x - this->number_of_sequences;
+    offset++;
+  }
+}
+
+usint
+RLCSA::lcpDirect(usint index) const
+{
+  if(index >= this->data_size || index == 0)
+  {
+    return 0;
   }
 
-  temp = sizeof(*this) + this->end_points->reportSize();
-  if(print) { std::cout << "RLCSA overhead:  " << temp << std::endl; }
-  bytes += temp;
+  usint match = index - 1;
+  usint value = 0;
 
-  if(print)
+  usint c = this->getCharacter(index);
+  usint low = this->index_ranges[c].first;
+  while(match >= low)
   {
-    std::cout << "Total size:      " << bytes << std::endl;
-    std::cout << std::endl;
+    PsiVector::Iterator iter(*(this->array[c]));
+    match = this->psiUnsafe(match, c, iter);
+    index = this->psiUnsafe(index, c, iter);
+    value++;
+
+    if(match < this->number_of_sequences || index < this->number_of_sequences)
+    {
+      break;
+    }
+    match -= this->number_of_sequences;
+    index -= this->number_of_sequences;
+
+    c = this->getCharacter(index);
+    low = this->index_ranges[c].first;
   }
 
-  return bytes;
+  return value;
+}
+
+//--------------------------------------------------------------------------
+
+void
+RLCSA::mergeEndPoints(RLCSA& index, RLCSA& increment)
+{
+  DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE);
+
+  DeltaVector::Iterator index_iter(*(index.end_points));
+  DeltaVector::Iterator increment_iter(*(increment.end_points));
+
+  endings->setBit(index_iter.select(0));
+  for(usint i = 1; i < index.number_of_sequences; i++)
+  {
+    endings->setBit(index_iter.selectNext());
+  }
+  usint sum = index.end_points->getSize();
+  delete index.end_points; index.end_points = 0;
+
+  endings->setBit(sum + increment_iter.select(0));
+  for(usint i = 1; i < increment.number_of_sequences; i++)
+  {
+    endings->setBit(sum + increment_iter.selectNext());
+  }
+  sum += increment.end_points->getSize();
+  delete increment.end_points; increment.end_points = 0;
+
+  this->end_points = new DeltaVector(*endings, sum);
+  delete endings;
+}
+
+
+void
+RLCSA::mergeSamples(RLCSA& index, RLCSA& increment, usint* positions)
+{
+  if(this->support_locate || this->support_display)
+  {
+    positions += increment.number_of_sequences;
+    this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size, this->number_of_sequences);
+  }
 }
 
 //--------------------------------------------------------------------------