X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TextStorage.h;h=1f809b8e4878d186aee720a0cdf802dc05685617;hb=refs%2Fheads%2Fmaster;hp=237bc44698ce36fd31a09fe5492ebdea13af1b1c;hpb=b8470e984146c62910517a2ec762d2ede97c2d96;p=SXSI%2FTextCollection.git diff --git a/TextStorage.h b/TextStorage.h index 237bc44..1f809b8 100644 --- a/TextStorage.h +++ b/TextStorage.h @@ -21,91 +21,196 @@ #ifndef _TextStorage_H_ #define _TextStorage_H_ +#include "TextCollection.h" +#include "Tools.h" + #include "incbwt/bits/deltavector.h" +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + +#include +#include + namespace SXSI { +class TextStorageBuilder; +class TextStoragePlainText; +class TextStorageLzIndex; + /** * Text collection that supports fast extraction. + * Defines an abstact interface class. + * See subclasses TextStorageLzIndex and TextStoragePlainText + * below. */ class TextStorage { public: + // Storage type + const static char TYPE_PLAIN_TEXT = 0; + const static char TYPE_LZ_INDEX = 1; + + // Call DeleteText() for each pointer returned by GetText() + // to avoid possible memory leaks. + virtual uchar * GetText(TextCollection::DocId docId) const = 0; + virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0; + virtual void DeleteText(uchar *) const = 0; + + static TextStorage * Load(FILE *file); + virtual void Save(FILE *file) const = 0; + + virtual ~TextStorage() + { + delete offit_; + offit_ = 0; + delete offsets_; + offsets_ = 0; + } + + TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const + { + assert(i < n_); + return offit_->rank(i)-1; + } + + TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const + { + assert(i < (TextCollection::DocId)numberOfTexts_); + return offit_->select(i); + } + + bool IsEndmarker(TextCollection::TextPosition i) const + { + assert(i < n_); + if (i >= n_ - 1) + return true; + return offit_->isSet(i+1); + } + + +protected: // Define a shortcut typedef TextCollection::TextPosition TextPosition; // Block size in DeltaVector - const static CSA::usint DV_BLOCK_SIZE = 16; + const static CSA::usint DV_BLOCK_SIZE = 32; - - TextStorage(uchar *text, TextPosition n) - : n_(n), text_(text), offsets_(0), numberOfTexts_(0) + TextStorage(uchar const * text, TextPosition n) + : n_(n), offsets_(0), offit_(0), numberOfTexts_(0) { - initOffsets(); + // Delta encoded bitvector of text offsets. + CSA::DeltaEncoder encoder(DV_BLOCK_SIZE); + encoder.setBit(0); // Start of the first text. + + // Read offsets by finding text end positions: + for (TextPosition i = 0; i < n_ - 1; ++i) + if (text[i] == '\0') + encoder.setBit(i+1); + + + offsets_ = new CSA::DeltaVector(encoder, n_); + offit_ = new CSA::DeltaVector::Iterator(*(offsets_)); + numberOfTexts_ = offit_->rank(n_ - 1); } + + TextStorage(std::FILE *); + void Save(FILE *file, char type) const; + + TextPosition n_; + CSA::DeltaVector *offsets_; + CSA::DeltaVector::Iterator *offit_; + TextPosition numberOfTexts_; +}; - TextStorage(FILE *file) - : n_(0), text_(0), offsets_(0), numberOfTexts_(0) +/****************************************************************** + * Plain text collection. + */ +class TextStoragePlainText : public TextStorage +{ +public: + TextStoragePlainText(uchar *text, TextPosition n) + : TextStorage(text, n), text_(text) + { } + + TextStoragePlainText(FILE *file) + : TextStorage(file), text_(0) { - if (std::fread(&(this->n_), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TextStorage::Load(): file read error (n_)."); - text_ = new uchar[n_]; if (std::fread(this->text_, sizeof(uchar), n_, file) != n_) throw std::runtime_error("TextStorage::Load(): file read error (text_)."); - - initOffsets(); } - void Save(FILE *file) + void Save(FILE *file) const { - if (std::fwrite(&(this->n_), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TextStorage::Save(): file write error (n_)."); + TextStorage::Save(file, TYPE_PLAIN_TEXT); - if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_) + if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_) throw std::runtime_error("TextStorage::Save(): file write error (text_)."); } - ~TextStorage() + ~TextStoragePlainText() { - delete offsets_; - offsets_ = 0; delete [] text_; text_ = 0; n_ = 0; } - uchar * GetText(TextCollection::DocId docId) + uchar * GetText(TextCollection::DocId docId) const { - assert(docId < numberOfTexts_); + assert(docId < (TextCollection::DocId)numberOfTexts_); - TextPosition offset = offsets_->select(docId); + TextPosition offset = offit_->select(docId); return &text_[offset]; } + uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const + { + assert(i < (TextCollection::DocId)numberOfTexts_); + assert(j < (TextCollection::DocId)numberOfTexts_); + TextPosition offset = offit_->select(i); + return &text_[offset]; + } + + // No operation, since text is a pointer to this->text_ + void DeleteText(uchar *text) const + { } private: - void initOffsets() - { - // Delta encoded bitvector of text offsets. - CSA::DeltaEncoder encoder(DV_BLOCK_SIZE); - encoder.setBit(0); // Start of the first text. + uchar *text_; +}; // class TextStorage - // Read offsets by finding text end positions: - for (TextPosition i = 0; i < n_ - 1; ++i) - if (text_[i] == '\0') - encoder.setBit(i+1); - offsets_ = new CSA::DeltaVector(encoder, n_); - numberOfTexts_ = offsets_->rank(n_ - 1); +/****************************************************************** + * LZ-index text collection. + */ +struct LzTriePimpl; // Using Pimpl idiom to hide LzTrie implementation. + +class TextStorageLzIndex : public TextStorage +{ +public: + TextStorageLzIndex(uchar *text, TextPosition n); + TextStorageLzIndex(FILE *file); + void Save(FILE *file) const; + ~TextStorageLzIndex(); + uchar * GetText(TextCollection::DocId docId) const; + uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const; + + // Free the space malloc'ed in lztrie::extract() + void DeleteText(uchar *text) const + { + free(text); } - TextPosition n_; - uchar *text_; // FIXME Replace with a succinct representation. - CSA::DeltaVector *offsets_; - TextPosition numberOfTexts_; -}; // class TextStorage +private: + struct LzTriePimpl * p_; +}; // class TextStorageLzIndex /** @@ -137,15 +242,25 @@ public: } // Init TextStorage - TextStorage * InitTextStorage() + // Type defaults to plain text. + TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT) { freeText = false; // Passing text to TextStorage. - return new TextStorage(text_, n_); + switch(type) + { + case (TextStorage::TYPE_PLAIN_TEXT): + return new TextStoragePlainText(text_, n_); + case (TextStorage::TYPE_LZ_INDEX): + return new TextStorageLzIndex(text_, n_); + default: + std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl; + exit(1); + } } private: TextPosition n_; - uchar *text_; // FIXME Replace with a succinct representation. + uchar *text_; bool freeText; }; // class TextStorageBuilder