#ifndef _TextStorage_H_
#define _TextStorage_H_
+#include "TextCollection.h"
+#include "Tools.h"
+
#include "incbwt/bits/deltavector.h"
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+
+#include <cassert>
+#include <stdexcept>
+
namespace SXSI
{
+class TextStorageBuilder;
+class TextStoragePlainText;
+class TextStorageLzIndex;
+
/**
* Text collection that supports fast extraction.
+ * Defines an abstact interface class.
+ * See subclasses TextStorageLzIndex and TextStoragePlainText
+ * below.
*/
class TextStorage
{
public:
+ // Storage type
+ const static char TYPE_PLAIN_TEXT = 0;
+ const static char TYPE_LZ_INDEX = 1;
+
+ // Call DeleteText() for each pointer returned by GetText()
+ // to avoid possible memory leaks.
+ virtual uchar * GetText(TextCollection::DocId docId) const = 0;
+ virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
+ virtual void DeleteText(uchar *) const = 0;
+
+ static TextStorage * Load(FILE *file);
+ virtual void Save(FILE *file) const = 0;
+
+ virtual ~TextStorage()
+ {
+ delete offit_;
+ offit_ = 0;
+ delete offsets_;
+ offsets_ = 0;
+ }
+
+ TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
+ {
+ assert(i < n_);
+ return offit_->rank(i)-1;
+ }
+
+ TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
+ {
+ assert(i < (TextCollection::DocId)numberOfTexts_);
+ return offit_->select(i);
+ }
+
+ bool IsEndmarker(TextCollection::TextPosition i) const
+ {
+ assert(i < n_);
+ if (i >= n_ - 1)
+ return true;
+ return offit_->isSet(i+1);
+ }
+
+
+protected:
// Define a shortcut
typedef TextCollection::TextPosition TextPosition;
// Block size in DeltaVector
const static CSA::usint DV_BLOCK_SIZE = 32;
- TextStorage(uchar *text, TextPosition n)
- : n_(n), text_(text), offsets_(0), numberOfTexts_(0)
+ TextStorage(uchar const * text, TextPosition n)
+ : n_(n), offsets_(0), offit_(0), numberOfTexts_(0)
{
- initOffsets();
+ // Delta encoded bitvector of text offsets.
+ CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
+ encoder.setBit(0); // Start of the first text.
+
+ // Read offsets by finding text end positions:
+ for (TextPosition i = 0; i < n_ - 1; ++i)
+ if (text[i] == '\0')
+ encoder.setBit(i+1);
+
+
+ offsets_ = new CSA::DeltaVector(encoder, n_);
+ offit_ = new CSA::DeltaVector::Iterator(*(offsets_));
+ numberOfTexts_ = offit_->rank(n_ - 1);
}
+
+ TextStorage(std::FILE *);
+ void Save(FILE *file, char type) const;
- TextStorage(FILE *file)
- : n_(0), text_(0), offsets_(0), numberOfTexts_(0)
+ TextPosition n_;
+ CSA::DeltaVector *offsets_;
+ CSA::DeltaVector::Iterator *offit_;
+ TextPosition numberOfTexts_;
+};
+
+/******************************************************************
+ * Plain text collection.
+ */
+class TextStoragePlainText : public TextStorage
+{
+public:
+ TextStoragePlainText(uchar *text, TextPosition n)
+ : TextStorage(text, n), text_(text)
+ { }
+
+ TextStoragePlainText(FILE *file)
+ : TextStorage(file), text_(0)
{
- if (std::fread(&(this->n_), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("TextStorage::Load(): file read error (n_).");
-
text_ = new uchar[n_];
if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Load(): file read error (text_).");
-
- initOffsets();
}
- void Save(FILE *file)
+ void Save(FILE *file) const
{
- if (std::fwrite(&(this->n_), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("TextStorage::Save(): file write error (n_).");
+ TextStorage::Save(file, TYPE_PLAIN_TEXT);
- if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
+ if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Save(): file write error (text_).");
}
- ~TextStorage()
+ ~TextStoragePlainText()
{
- delete offsets_;
- offsets_ = 0;
delete [] text_;
text_ = 0;
n_ = 0;
}
- uchar * GetText(TextCollection::DocId docId)
+ uchar * GetText(TextCollection::DocId docId) const
{
assert(docId < (TextCollection::DocId)numberOfTexts_);
- TextPosition offset = offsets_->select(docId);
+ TextPosition offset = offit_->select(docId);
return &text_[offset];
}
- TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i)
- {
- assert(i < n_);
- return offsets_->rank(i)-1;
- }
-
- TextCollection::TextPosition TextStartPos(TextCollection::DocId i)
+ uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
{
assert(i < (TextCollection::DocId)numberOfTexts_);
- return offsets_->select(i);
+ assert(j < (TextCollection::DocId)numberOfTexts_);
+
+ TextPosition offset = offit_->select(i);
+ return &text_[offset];
}
+ // No operation, since text is a pointer to this->text_
+ void DeleteText(uchar *text) const
+ { }
+
private:
- void initOffsets()
- {
- // Delta encoded bitvector of text offsets.
- CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
- encoder.setBit(0); // Start of the first text.
+ uchar *text_;
+}; // class TextStorage
- // Read offsets by finding text end positions:
- for (TextPosition i = 0; i < n_ - 1; ++i)
- if (text_[i] == '\0')
- encoder.setBit(i+1);
- offsets_ = new CSA::DeltaVector(encoder, n_);
- numberOfTexts_ = offsets_->rank(n_ - 1);
+/******************************************************************
+ * LZ-index text collection.
+ */
+struct LzTriePimpl; // Using Pimpl idiom to hide LzTrie implementation.
+
+class TextStorageLzIndex : public TextStorage
+{
+public:
+ TextStorageLzIndex(uchar *text, TextPosition n);
+ TextStorageLzIndex(FILE *file);
+ void Save(FILE *file) const;
+ ~TextStorageLzIndex();
+ uchar * GetText(TextCollection::DocId docId) const;
+ uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const;
+
+ // Free the space malloc'ed in lztrie::extract()
+ void DeleteText(uchar *text) const
+ {
+ free(text);
}
- TextPosition n_;
- uchar *text_; // FIXME Replace with a succinct representation.
- CSA::DeltaVector *offsets_;
- TextPosition numberOfTexts_;
-}; // class TextStorage
+private:
+ struct LzTriePimpl * p_;
+}; // class TextStorageLzIndex
/**
}
// Init TextStorage
- TextStorage * InitTextStorage()
+ // Type defaults to plain text.
+ TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT)
{
freeText = false; // Passing text to TextStorage.
- return new TextStorage(text_, n_);
+ switch(type)
+ {
+ case (TextStorage::TYPE_PLAIN_TEXT):
+ return new TextStoragePlainText(text_, n_);
+ case (TextStorage::TYPE_LZ_INDEX):
+ return new TextStorageLzIndex(text_, n_);
+ default:
+ std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl;
+ exit(1);
+ }
}
private:
TextPosition n_;
- uchar *text_; // FIXME Replace with a succinct representation.
+ uchar *text_;
bool freeText;
}; // class TextStorageBuilder