Added RLCSA index option
[SXSI/TextCollection.git] / incbwt / sasamples.cpp
index 8b1aef3..6747370 100644 (file)
@@ -1,10 +1,17 @@
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <vector>
 
 #include "sasamples.h"
 #include "misc/utils.h"
 
+#ifdef MULTITHREAD_SUPPORT
+#ifndef _GLIBCXX_PARALLEL
+#include <mcstl.h>
+#endif
+#endif
+
 
 namespace CSA
 {
@@ -19,38 +26,91 @@ SASamples::SASamples(std::ifstream& sample_file, usint sample_rate) :
   this->items = indexes->getNumberOfItems();
   this->integer_bits = length(this->items - 1);
 
-  this->samples = new FastBitBuffer(sample_file, this->items, this->integer_bits);
+  this->samples = new ReadBuffer(sample_file, this->items, this->integer_bits);
   this->buildInverseSamples();
 }
 
-SASamples::SASamples(usint* array, usint data_size, usint sample_rate) :
+SASamples::SASamples(uint* array, usint data_size, usint sample_rate) :
   rate(sample_rate),
   size(data_size)
 {
   this->items = (this->size - 1) / this->rate + 1;
   this->integer_bits = length(this->items - 1);
 
-  this->samples = new FastBitBuffer(this->items, this->integer_bits);
-  DeltaEncoder encoder(SASamples::BLOCK_SIZE);
+  WriteBuffer sample_buffer(this->items, this->integer_bits);
+  DeltaEncoder encoder(INDEX_BLOCK_SIZE);
   for(usint i = 0; i < this->size; i++)
   {
-    if(array[i] % rate == 0)
+    if(array[i] % sample_rate == 0)
     {
       encoder.setBit(i);
-      this->samples->writeItem(array[i] / sample_rate);
+      sample_buffer.writeItem(array[i] / sample_rate);
     }
   }
+
   this->indexes = new DeltaVector(encoder, this->size);
+  this->samples = sample_buffer.getReadBuffer();
+  this->buildInverseSamples();
+}
+
+SASamples::SASamples(uint* inverse, DeltaVector* end_points, usint data_size, usint sample_rate) :
+  rate(sample_rate),
+  items(0)
+{
+  usint sequences = end_points->getNumberOfItems();
+
+  DeltaVector::Iterator iter(*(end_points));
+
+  // Determine the samples, insert them into a vector, and sort them.
+  usint start = 0, end = iter.select(0);  // Closed range in padded collection.
+  usint seq_start = 0, seq_end = end;            // Closed range in inverse SA.
+  std::vector<pair_type>* vec = new std::vector<pair_type>();
+  for(usint i = 0; i < sequences; i++)
+  {
+    for(usint j = seq_start; j <= seq_end; j += this->rate)
+    {
+      vec->push_back(pair_type(inverse[j] - sequences - 1, this->items));
+      this->items++;
+    }
+    start = nextMultipleOf(this->rate, end);
+    end = iter.selectNext();
+    seq_start = seq_end + 2;
+    seq_end = seq_start + end - start;
+  }
+  #ifdef MULTITHREAD_SUPPORT
+    #ifdef _GLIBCXX_PARALLEL
+      std::sort(vec->begin(), vec->end(), __gnu_parallel::sequential_tag());
+    #else
+      std::sort(vec->begin(), vec->end(), mcstl::sequential_tag());
+    #endif
+  #else
+    std::sort(vec->begin(), vec->end());
+  #endif
+
+  // Compress the samples.
+  this->integer_bits = length(this->items - 1);
+  this->size = end + 1;
+  WriteBuffer sample_buffer(this->items, this->integer_bits);
+  DeltaEncoder encoder(INDEX_BLOCK_SIZE);
+  for(usint i = 0; i < this->items; i++)
+  {
+    pair_type sample = (*vec)[i];
+    encoder.setBit(sample.first);
+    sample_buffer.writeItem(sample.second);
+  }
+  delete vec;
 
+  this->indexes = new DeltaVector(encoder, this->size);
+  this->samples = sample_buffer.getReadBuffer();
   this->buildInverseSamples();
 }
 
-SASamples::SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions) :
+SASamples::SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions, usint number_of_sequences) :
   rate(index.rate),
   size(index.size + increment.size),
   items(index.items + increment.items)
 {
-  this->mergeSamples(index, increment, positions, number_of_positions);
+  this->mergeSamples(index, increment, positions, number_of_positions, number_of_sequences);
 
   index.indexes = 0;
   index.samples = 0;
@@ -69,37 +129,19 @@ SASamples::~SASamples()
   delete this->inverse_samples;
 }
 
-void
-SASamples::writeTo(std::ofstream& sample_file)
-{
-  this->indexes->writeTo(sample_file);
-  this->samples->writeBuffer(sample_file, false);
-}
-
 //--------------------------------------------------------------------------
 
-usint
-SASamples::inverseSA(usint value)
-{
-  if(value >= this->size) { return this->size; }
-  usint i = this->inverse_samples->readItem(value / this->rate);
-  return this->indexes->select(i);
-}
-
-pair_type
-SASamples::getFirstSampleAfter(usint index)
+void
+SASamples::writeTo(std::ofstream& sample_file) const
 {
-  if(index >= this->size) { return pair_type(this->size, this->size); }
-
-  return this->indexes->valueAfter(index);
+  this->indexes->writeTo(sample_file);
+  this->samples->writeBuffer(sample_file);
 }
 
-//--------------------------------------------------------------------------
-
 usint
-SASamples::reportSize()
+SASamples::reportSize() const
 {
-  usint bytes = sizeof(*this) - sizeof(this->indexes);
+  usint bytes = sizeof(*this);
   bytes += this->indexes->reportSize();
   bytes += this->samples->reportSize();
   bytes += this->inverse_samples->reportSize();
@@ -111,46 +153,47 @@ SASamples::reportSize()
 void
 SASamples::buildInverseSamples()
 {
-  this->inverse_samples = new FastBitBuffer(this->items, this->integer_bits);
+  WriteBuffer inverse_buffer(this->items, this->integer_bits);
   this->samples->goToItem(0);
   for(usint i = 0; i < this->items; i++)
   {
-    this->inverse_samples->goToItem(this->samples->readItem());
-    this->inverse_samples->writeItem(i);
+    inverse_buffer.goToItem(this->samples->readItem());
+    inverse_buffer.writeItem(i);
   }
+
+  this->inverse_samples = inverse_buffer.getReadBuffer();
 }
 
 
 //--------------------------------------------------------------------------
 
-
 void
-SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n)
+SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n, usint skip)
 {
-  DeltaVector* first = index.indexes;
-  DeltaVector* second = increment.indexes;
-  FastBitBuffer* first_samples = index.samples;
-  FastBitBuffer* second_samples = increment.samples;
+  DeltaVector::Iterator first(*(index.indexes));
+  DeltaVector::Iterator second(*(increment.indexes));
+  ReadBuffer* first_samples = index.samples;
+  ReadBuffer* second_samples = increment.samples;
 
-  usint first_bit = first->select(0);
+  usint first_bit = first.select(0);
   bool first_finished = false;
-  usint second_bit = second->select(0);
+  usint second_bit = second.select(0);
   usint sum = index.items;
   first_samples->goToItem(0);
   second_samples->goToItem(0);
 
-  DeltaEncoder encoder(SASamples::BLOCK_SIZE);
+  DeltaEncoder encoder(INDEX_BLOCK_SIZE);
   this->integer_bits = length(this->items - 1);
-  this->samples = new FastBitBuffer(this->items, this->integer_bits);
+  WriteBuffer sample_buffer(this->items, this->integer_bits);
   for(usint i = 0; i < n; i++, first_bit++)
   {
-    while(!first_finished && first_bit < positions[i])
+    while(!first_finished && first_bit < positions[i] - skip)
     {
       encoder.setBit(first_bit);
-      this->samples->writeItem(first_samples->readItem());
-      if(first->hasNext())
+      sample_buffer.writeItem(first_samples->readItem());
+      if(first.hasNext())
       {
-        first_bit = first->selectNext() + i;
+        first_bit = first.selectNext() + i;
       }
       else
       {
@@ -160,18 +203,18 @@ SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions
 
     if(i == second_bit) // positions[i] is one
     {
-      encoder.setBit(positions[i]);
-      this->samples->writeItem(second_samples->readItem() + sum);
-      second_bit = second->selectNext();
+      encoder.setBit(positions[i] - skip);
+      sample_buffer.writeItem(second_samples->readItem() + sum);
+      second_bit = second.selectNext();
     }
   }
 
   while(!first_finished)
   {
     encoder.setBit(first_bit);
-    this->samples->writeItem(first_samples->readItem());
-    if(!first->hasNext()) { break; }
-    first_bit = first->selectNext() + n;
+    sample_buffer.writeItem(first_samples->readItem());
+    if(!first.hasNext()) { break; }
+    first_bit = first.selectNext() + n;
   }
 
   delete index.indexes;
@@ -182,6 +225,7 @@ SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions
   delete increment.inverse_samples;
 
   this->indexes = new DeltaVector(encoder, size);
+  this->samples = sample_buffer.getReadBuffer();
 }