--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2009 Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#include "TextStorage.h"
+
+#undef W
+#undef bitsW
+#undef bitset
+#undef bits
+
+#include "lzindex/lztrie.h"
+
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+#undef bitset
+#undef bitget
+#undef bits
+
+
+namespace SXSI
+{
+
+/******************************************************************
+ * Class TextStorage
+ */
+
+TextStorage * TextStorage::Load(std::FILE *file)
+{
+ char type = 0;
+ if (std::fread(&type, sizeof(char), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Load(): file read error (type).");
+
+ switch(type)
+ {
+ case (TYPE_PLAIN_TEXT):
+ return new TextStoragePlainText(file);
+ case (TYPE_LZ_INDEX):
+ return new TextStorageLzIndex(file);
+ default:
+ std::cerr << "TextStorage::Load(): Unknown type in save file!" << std::endl;
+ exit(1);
+ }
+}
+
+TextStorage::TextStorage(std::FILE * file)
+ : n_(0), offsets_(0), numberOfTexts_(0)
+{
+ if (std::fread(&(this->n_), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Load(): file read error (n_).");
+
+ if (std::fread(&(this->numberOfTexts_), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Load(): file read error (numberOfTexts_).");
+
+ offsets_ = new CSA::DeltaVector(file);
+}
+
+void TextStorage::Save(FILE *file, char type) const
+{
+ if (std::fwrite(&type, sizeof(char), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Save(): file write error (type).");
+
+ if (std::fwrite(&(this->n_), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Save(): file write error (n_).");
+
+ if (std::fwrite(&(this->numberOfTexts_), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("TextStorage::Save(): file write error (n_).");
+
+ offsets_->writeTo(file);
+}
+
+
+/******************************************************************
+ * Class TextStorageLzIndex
+ */
+
+// Hide the lztrie declaration
+struct LzTriePimpl
+{
+ lztrie lz;
+
+ LzTriePimpl()
+ : lz(0)
+ { }
+};
+
+TextStorageLzIndex::TextStorageLzIndex(uchar *text, TextPosition n)
+ : TextStorage(text, n), p_(new struct LzTriePimpl)
+{
+ for (ulong i = 0; i < n_ - 1; ++i)
+ if (text[i] == 0)
+ text[i] = 1; // '\0' can appear only once.
+ text[n_ - 1] = 0;
+
+ p_->lz = buildLZTrie(text, (uchar)0, n_);
+ delete [] text;
+}
+
+TextStorageLzIndex::TextStorageLzIndex(FILE *file)
+ : TextStorage(file), p_(new struct LzTriePimpl)
+{
+ p_->lz = loadLZTrie(file);
+}
+
+void TextStorageLzIndex::Save(FILE *file) const
+{
+ TextStorage::Save(file, TYPE_LZ_INDEX);
+
+ saveLZTrie(p_->lz, file);
+}
+
+TextStorageLzIndex::~TextStorageLzIndex()
+{
+ destroyLZTrie(p_->lz);
+ delete p_;
+ p_ = 0;
+ n_ = 0;
+}
+
+uchar * TextStorageLzIndex::GetText(TextCollection::DocId docId) const
+{
+ assert(docId < (TextCollection::DocId)numberOfTexts_);
+
+ TextPosition from = offsets_->select(docId);
+ TextPosition to = 0;
+ if (docId < (TextCollection::DocId)numberOfTexts_ - 1)
+ to = offsets_->select(docId + 1) - 1;
+ else
+ to = n_-1;
+
+ uchar *text = 0;
+ ulong l = 0;
+ extract(p_->lz, from, to, &text, &l);
+
+ text[l-1] = 0;
+ return text;
+}
+
+uchar * TextStorageLzIndex::GetText(TextCollection::DocId i, TextCollection::DocId j) const
+{
+ assert(i < (TextCollection::DocId)numberOfTexts_);
+ assert(j < (TextCollection::DocId)numberOfTexts_);
+
+ TextPosition from = offsets_->select(i);
+ TextPosition to = 0;
+ if (j < (TextCollection::DocId)numberOfTexts_ - 1)
+ to = offsets_->select(j + 1) - 1;
+ else
+ to = n_-1;
+
+ uchar *text = 0;
+ ulong l = 0;
+ extract(p_->lz, from, to, &text, &l);
+
+ // Put '\0' bytes back in place
+ while (i < j && i < (TextCollection::DocId)numberOfTexts_)
+ {
+ ++i;
+ text[offsets_->select(i) - 1 - from] = 0;
+ }
+ text[l-1] = 0;
+ return text;
+}
+
+} // namespace SXSI
+
#ifndef _TextStorage_H_
#define _TextStorage_H_
+#include "TextCollection.h"
+#include "Tools.h"
#include "incbwt/bits/deltavector.h"
+#include <cassert>
+#include <stdexcept>
+
namespace SXSI
{
+class TextStorageBuilder;
+class TextStoragePlainText;
+class TextStorageLzIndex;
+
/**
* Text collection that supports fast extraction.
+ * Defines an abstact interface class.
+ * See subclasses TextStorageLzIndex and TextStoragePlainText
+ * below.
*/
class TextStorage
{
public:
+ // Storage type
+ const static char TYPE_PLAIN_TEXT = 0;
+ const static char TYPE_LZ_INDEX = 1;
+
+ // Call DeleteText() for each pointer returned by GetText()
+ // to avoid possible memory leaks.
+ virtual uchar * GetText(TextCollection::DocId docId) const = 0;
+ virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
+ virtual void DeleteText(uchar *) const = 0;
+
+ static TextStorage * Load(FILE *file);
+ virtual void Save(FILE *file) const = 0;
+
+ virtual ~TextStorage()
+ {
+ delete offsets_;
+ offsets_ = 0;
+ }
+
+ TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
+ {
+ assert(i < n_);
+ return offsets_->rank(i)-1;
+ }
+
+ TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
+ {
+ assert(i < (TextCollection::DocId)numberOfTexts_);
+ return offsets_->select(i);
+ }
+
+ bool IsEndmarker(TextCollection::TextPosition i) const
+ {
+ assert(i < n_);
+ if (i >= n_ - 1)
+ return true;
+ return offsets_->isSet(i+1);
+ }
+
+
+protected:
// Define a shortcut
typedef TextCollection::TextPosition TextPosition;
// Block size in DeltaVector
const static CSA::usint DV_BLOCK_SIZE = 32;
- TextStorage(uchar *text, TextPosition n)
- : n_(n), text_(text), offsets_(0), numberOfTexts_(0)
+ TextStorage(uchar const * text, TextPosition n)
+ : n_(n), offsets_(0), numberOfTexts_(0)
{
- initOffsets();
+ // Delta encoded bitvector of text offsets.
+ CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
+ encoder.setBit(0); // Start of the first text.
+
+ // Read offsets by finding text end positions:
+ for (TextPosition i = 0; i < n_ - 1; ++i)
+ if (text[i] == '\0')
+ encoder.setBit(i+1);
+
+
+ offsets_ = new CSA::DeltaVector(encoder, n_);
+
+/* for (ulong i = 0; i < n_-1; ++i)
+ if ((text[i] == '\0') != IsEndmarker(i))
+ std::cout << "misplaced endmarker at i = " << i << std::endl;
+*/
+ numberOfTexts_ = offsets_->rank(n_ - 1);
}
+
+ TextStorage(std::FILE *);
+ void Save(FILE *file, char type) const;
+
+ TextPosition n_;
+ CSA::DeltaVector *offsets_;
+ TextPosition numberOfTexts_;
+};
- TextStorage(FILE *file)
- : n_(0), text_(0), offsets_(0), numberOfTexts_(0)
+/******************************************************************
+ * Plain text collection.
+ */
+class TextStoragePlainText : public TextStorage
+{
+public:
+ TextStoragePlainText(uchar *text, TextPosition n)
+ : TextStorage(text, n), text_(text)
+ { }
+
+ TextStoragePlainText(FILE *file)
+ : TextStorage(file), text_(0)
{
- if (std::fread(&(this->n_), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("TextStorage::Load(): file read error (n_).");
-
text_ = new uchar[n_];
if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Load(): file read error (text_).");
-
- initOffsets();
}
- void Save(FILE *file)
+ void Save(FILE *file) const
{
- if (std::fwrite(&(this->n_), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("TextStorage::Save(): file write error (n_).");
+ TextStorage::Save(file, TYPE_PLAIN_TEXT);
- if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
+ if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
throw std::runtime_error("TextStorage::Save(): file write error (text_).");
}
- ~TextStorage()
+ ~TextStoragePlainText()
{
- delete offsets_;
- offsets_ = 0;
delete [] text_;
text_ = 0;
n_ = 0;
}
- uchar * GetText(TextCollection::DocId docId)
+ uchar * GetText(TextCollection::DocId docId) const
{
assert(docId < (TextCollection::DocId)numberOfTexts_);
return &text_[offset];
}
- TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i)
- {
- assert(i < n_);
- return offsets_->rank(i)-1;
- }
-
- TextCollection::TextPosition TextStartPos(TextCollection::DocId i)
+ uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
{
assert(i < (TextCollection::DocId)numberOfTexts_);
- return offsets_->select(i);
+ assert(j < (TextCollection::DocId)numberOfTexts_);
+
+ TextPosition offset = offsets_->select(i);
+ return &text_[offset];
}
+ // No operation, since text is a pointer to this->text_
+ void DeleteText(uchar *text) const
+ { }
+
private:
- void initOffsets()
- {
- // Delta encoded bitvector of text offsets.
- CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
- encoder.setBit(0); // Start of the first text.
+ uchar *text_;
+}; // class TextStorage
- // Read offsets by finding text end positions:
- for (TextPosition i = 0; i < n_ - 1; ++i)
- if (text_[i] == '\0')
- encoder.setBit(i+1);
- offsets_ = new CSA::DeltaVector(encoder, n_);
- numberOfTexts_ = offsets_->rank(n_ - 1);
+/******************************************************************
+ * LZ-index text collection.
+ */
+struct LzTriePimpl; // Pimpl, declared in .cpp
+
+class TextStorageLzIndex : public TextStorage
+{
+public:
+ TextStorageLzIndex(uchar *text, TextPosition n);
+ TextStorageLzIndex(FILE *file);
+ void Save(FILE *file) const;
+ ~TextStorageLzIndex();
+ uchar * GetText(TextCollection::DocId docId) const;
+ uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const;
+
+ // Free the space malloc'ed in lztrie::extract()
+ void DeleteText(uchar *text) const
+ {
+ free(text);
}
- TextPosition n_;
- uchar *text_; // FIXME Replace with a succinct representation.
- CSA::DeltaVector *offsets_;
- TextPosition numberOfTexts_;
-}; // class TextStorage
+private:
+ struct LzTriePimpl * p_;
+}; // class TextStorageLzIndex
/**
}
// Init TextStorage
- TextStorage * InitTextStorage()
+ // Type defaults to plain text.
+ TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT)
{
freeText = false; // Passing text to TextStorage.
- return new TextStorage(text_, n_);
+ switch(type)
+ {
+ case (TextStorage::TYPE_PLAIN_TEXT):
+ return new TextStoragePlainText(text_, n_);
+ case (TextStorage::TYPE_LZ_INDEX):
+ return new TextStorageLzIndex(text_, n_);
+ default:
+ std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl;
+ exit(1);
+ }
}
private: