1 /******************************************************************************
2 * Copyright (C) 2009 Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TextStorage_H_
22 #define _TextStorage_H_
24 #include "TextCollection.h"
26 #include "incbwt/bits/deltavector.h"
34 class TextStorageBuilder;
35 class TextStoragePlainText;
36 class TextStorageLzIndex;
39 * Text collection that supports fast extraction.
40 * Defines an abstact interface class.
41 * See subclasses TextStorageLzIndex and TextStoragePlainText
48 const static char TYPE_PLAIN_TEXT = 0;
49 const static char TYPE_LZ_INDEX = 1;
51 // Call DeleteText() for each pointer returned by GetText()
52 // to avoid possible memory leaks.
53 virtual uchar * GetText(TextCollection::DocId docId) const = 0;
54 virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
55 virtual void DeleteText(uchar *) const = 0;
57 static TextStorage * Load(FILE *file);
58 virtual void Save(FILE *file) const = 0;
60 virtual ~TextStorage()
66 TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
69 return offsets_->rank(i)-1;
72 TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
74 assert(i < (TextCollection::DocId)numberOfTexts_);
75 return offsets_->select(i);
78 bool IsEndmarker(TextCollection::TextPosition i) const
83 return offsets_->isSet(i+1);
89 typedef TextCollection::TextPosition TextPosition;
90 // Block size in DeltaVector
91 const static CSA::usint DV_BLOCK_SIZE = 32;
93 TextStorage(uchar const * text, TextPosition n)
94 : n_(n), offsets_(0), numberOfTexts_(0)
96 // Delta encoded bitvector of text offsets.
97 CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
98 encoder.setBit(0); // Start of the first text.
100 // Read offsets by finding text end positions:
101 for (TextPosition i = 0; i < n_ - 1; ++i)
106 offsets_ = new CSA::DeltaVector(encoder, n_);
107 numberOfTexts_ = offsets_->rank(n_ - 1);
110 TextStorage(std::FILE *);
111 void Save(FILE *file, char type) const;
114 CSA::DeltaVector *offsets_;
115 TextPosition numberOfTexts_;
118 /******************************************************************
119 * Plain text collection.
121 class TextStoragePlainText : public TextStorage
124 TextStoragePlainText(uchar *text, TextPosition n)
125 : TextStorage(text, n), text_(text)
128 TextStoragePlainText(FILE *file)
129 : TextStorage(file), text_(0)
131 text_ = new uchar[n_];
132 if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
133 throw std::runtime_error("TextStorage::Load(): file read error (text_).");
136 void Save(FILE *file) const
138 TextStorage::Save(file, TYPE_PLAIN_TEXT);
140 if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
141 throw std::runtime_error("TextStorage::Save(): file write error (text_).");
144 ~TextStoragePlainText()
151 uchar * GetText(TextCollection::DocId docId) const
153 assert(docId < (TextCollection::DocId)numberOfTexts_);
155 TextPosition offset = offsets_->select(docId);
156 return &text_[offset];
159 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
161 assert(i < (TextCollection::DocId)numberOfTexts_);
162 assert(j < (TextCollection::DocId)numberOfTexts_);
164 TextPosition offset = offsets_->select(i);
165 return &text_[offset];
168 // No operation, since text is a pointer to this->text_
169 void DeleteText(uchar *text) const
174 }; // class TextStorage
177 /******************************************************************
178 * LZ-index text collection.
180 struct LzTriePimpl; // Using Pimpl idiom to hide LzTrie implementation.
182 class TextStorageLzIndex : public TextStorage
185 TextStorageLzIndex(uchar *text, TextPosition n);
186 TextStorageLzIndex(FILE *file);
187 void Save(FILE *file) const;
188 ~TextStorageLzIndex();
189 uchar * GetText(TextCollection::DocId docId) const;
190 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const;
192 // Free the space malloc'ed in lztrie::extract()
193 void DeleteText(uchar *text) const
199 struct LzTriePimpl * p_;
200 }; // class TextStorageLzIndex
204 * Builder for TextStorage class
206 class TextStorageBuilder
210 typedef TextCollection::TextPosition TextPosition;
212 // Build up simple uchar array
213 explicit TextStorageBuilder(TextPosition n)
214 : n_(n), text_(new uchar [n]), freeText(true)
217 ~TextStorageBuilder()
225 // Write access to text[]
226 uchar& operator[] (TextPosition i)
232 // Type defaults to plain text.
233 TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT)
235 freeText = false; // Passing text to TextStorage.
238 case (TextStorage::TYPE_PLAIN_TEXT):
239 return new TextStoragePlainText(text_, n_);
240 case (TextStorage::TYPE_LZ_INDEX):
241 return new TextStorageLzIndex(text_, n_);
243 std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl;
252 }; // class TextStorageBuilder
256 #endif // #ifndef _TextStorage_H_