1 /******************************************************************************
2 * Copyright (C) 2009 Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TextStorage_H_
22 #define _TextStorage_H_
24 #include "TextCollection.h"
27 #include "incbwt/bits/deltavector.h"
28 // Re-define word size to ulong:
43 class TextStorageBuilder;
44 class TextStoragePlainText;
45 class TextStorageLzIndex;
48 * Text collection that supports fast extraction.
49 * Defines an abstract interface class.
50 * See subclasses TextStorageLzIndex and TextStoragePlainText
// Type tags for the concrete storage implementations. Within this header they
// are passed to TextStorage::Save(file, type) and used to select the subclass
// in TextStorageBuilder::InitTextStorage().
57 const static char TYPE_PLAIN_TEXT = 0;
58 const static char TYPE_LZ_INDEX = 1;
// Text extraction and persistence interface implemented by the subclasses.
60 // Call DeleteText() for each pointer returned by GetText()
61 // to avoid possible memory leaks.
// Returns a pointer to the text of document docId.
62 virtual uchar * GetText(TextCollection::DocId docId) const = 0;
// Two-document variant.
// NOTE(review): presumably returns the texts of documents i through j --
// confirm against the implementations.
63 virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
// Releases a pointer previously returned by GetText().
64 virtual void DeleteText(uchar *) const = 0;
// Creates a TextStorage from file; declaration only, defined elsewhere.
// NOTE(review): presumably picks the subclass from the type tag written by
// Save(file, type) -- confirm in the implementation.
66 static TextStorage * Load(FILE *file);
// Writes this storage to file; every subclass supplies its own version.
67 virtual void Save(FILE *file) const = 0;
// Virtual destructor: instances are handed out as TextStorage pointers
// (see Load() and TextStorageBuilder::InitTextStorage()).
69 virtual ~TextStorage()
// Maps text position i to the id of the document that contains it.
// offsets_ marks text start positions (bit 0 is the start of the first text),
// so the number of set bits up to i, minus one, is the 0-based document id.
// NOTE(review): assumes rank(i) counts set bits in [0, i] inclusive --
// confirm against CSA::DeltaVector.
75 TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
78 return offsets_->rank(i)-1;
// Returns the start position of text i, obtained as a select query on
// offsets_. Precondition (checked by assert): i < numberOfTexts_.
// NOTE(review): select() is assumed to be 0-based here, matching the
// precondition above -- confirm against CSA::DeltaVector.
81 TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
83 assert(i < (TextCollection::DocId)numberOfTexts_);
84 return offsets_->select(i);
// Tells whether position i holds the end marker of a text: with offsets_
// marking text start positions, that is the case when a text starts at i+1.
// NOTE(review): i+1 must be a valid index for offsets_ -- confirm how the
// final position of the collection is handled.
87 bool IsEndmarker(TextCollection::TextPosition i) const
92 return offsets_->isSet(i+1);
// Local shorthand for the position type of TextCollection.
98 typedef TextCollection::TextPosition TextPosition;
99 // Block size handed to CSA::DeltaEncoder when building offsets_
100 const static CSA::usint DV_BLOCK_SIZE = 32;
// Builds the base-class state for a text buffer of n bytes: stores the length
// in n_ and constructs offsets_, the delta-encoded bit vector of text start
// positions. The DeltaVector is allocated with new and held through the raw
// pointer offsets_.
// NOTE(review): if TextPosition is unsigned, n == 0 makes n_ - 1 wrap around
// in the loop bound and in the rank() call -- confirm callers never pass an
// empty text.
102 TextStorage(uchar const * text, TextPosition n)
103 : n_(n), offsets_(0), numberOfTexts_(0)
105 // Delta encoded bitvector of text offsets.
106 CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
107 encoder.setBit(0); // Start of the first text.
109 // Read offsets by finding text end positions:
// The scan visits positions 0 .. n_ - 2.
110 for (TextPosition i = 0; i < n_ - 1; ++i)
// Turn the encoder contents into a bit vector covering n_ positions.
115 offsets_ = new CSA::DeltaVector(encoder, n_);
// Number of texts = number of start bits in the vector.
// NOTE(review): assumes rank(n_ - 1) counts set bits in [0, n_ - 1] -- confirm.
116 numberOfTexts_ = offsets_->rank(n_ - 1);
// Constructs the base-class state from a file; declaration only, defined
// elsewhere. Subclass FILE * constructors call it before reading their own data.
119 TextStorage(std::FILE *);
// Base-class part of saving; subclasses call it with their own type tag
// before writing their data (see TextStoragePlainText::Save()).
// Declaration only, defined elsewhere.
120 void Save(FILE *file, char type) const;
// Bit vector of text start positions; allocated with new in the
// (text, n) constructor.
123 CSA::DeltaVector *offsets_;
// Number of texts in the collection, computed from offsets_ in the
// (text, n) constructor.
124 TextPosition numberOfTexts_;
127 /******************************************************************
128 * Plain text collection.
// TextStorage implementation that keeps the text as one plain uchar array
// and answers GetText() requests with pointers into that array.
130 class TextStoragePlainText : public TextStorage
// Wraps an existing buffer of n bytes: the pointer is stored as-is, no copy
// is made.
// NOTE(review): TextStorageBuilder stops tracking the buffer after passing it
// here, so this object presumably takes ownership -- confirm in the destructor.
133 TextStoragePlainText(uchar *text, TextPosition n)
134 : TextStorage(text, n), text_(text)
// Loads from file: the base-class constructor runs first (it has to provide
// n_), then n_ bytes of raw text are read into a freshly allocated array.
// Throws std::runtime_error when fewer than n_ bytes could be read.
137 TextStoragePlainText(FILE *file)
138 : TextStorage(file), text_(0)
140 text_ = new uchar[n_];
141 if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
142 throw std::runtime_error("TextStorage::Load(): file read error (text_).");
// Writes the base-class data tagged TYPE_PLAIN_TEXT, followed by the n_ raw
// text bytes. Throws std::runtime_error when fewer than n_ bytes were written.
145 void Save(FILE *file) const
147 TextStorage::Save(file, TYPE_PLAIN_TEXT);
149 if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
150 throw std::runtime_error("TextStorage::Save(): file write error (text_).");
// NOTE(review): text_ is allocated with new[] in the FILE * constructor;
// confirm it is released here.
153 ~TextStoragePlainText()
// Returns a pointer to the first byte of document docId inside text_.
// The pointer aliases the internal array; it is not a copy.
160 uchar * GetText(TextCollection::DocId docId) const
162 assert(docId < (TextCollection::DocId)numberOfTexts_);
164 TextPosition offset = offsets_->select(docId);
165 return &text_[offset];
// Returns a pointer to the first byte of document i inside text_.
// In the code shown here j is only range-checked; the returned pointer
// depends on i alone.
168 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
170 assert(i < (TextCollection::DocId)numberOfTexts_);
171 assert(j < (TextCollection::DocId)numberOfTexts_);
173 TextPosition offset = offsets_->select(i);
174 return &text_[offset];
177 // No operation, since text is a pointer to this->text_
178 void DeleteText(uchar *text) const
183 }; // class TextStoragePlainText
186 /******************************************************************
187 * LZ-index text collection.
189 struct LzTriePimpl; // Using Pimpl idiom to hide LzTrie implementation.
// TextStorage implementation backed by an LZ-index. Apart from DeleteText(),
// only declarations appear here; the LzTrie details stay behind the opaque
// pointer p_.
191 class TextStorageLzIndex : public TextStorage
// Constructs from a text buffer of n bytes (this is the constructor used by
// TextStorageBuilder::InitTextStorage()).
194 TextStorageLzIndex(uchar *text, TextPosition n);
// Constructs from a file.
195 TextStorageLzIndex(FILE *file);
// Writes this storage to file.
196 void Save(FILE *file) const;
197 ~TextStorageLzIndex();
// Text extraction; pass every returned pointer to DeleteText() afterwards.
198 uchar * GetText(TextCollection::DocId docId) const;
199 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const;
201 // Free the space malloc'ed in lztrie::extract()
202 void DeleteText(uchar *text) const
// Opaque handle to the hidden LzTrie implementation.
208 struct LzTriePimpl * p_;
209 }; // class TextStorageLzIndex
213 * Builder for TextStorage class
// Collects text into a plain uchar buffer and then hands that buffer over to
// a TextStorage implementation.
215 class TextStorageBuilder
// Local shorthand for the position type of TextCollection.
219 typedef TextCollection::TextPosition TextPosition;
221 // Build up simple uchar array
// Allocates an n-byte buffer with new[]; freeText starts out true.
222 explicit TextStorageBuilder(TextPosition n)
223 : n_(n), text_(new uchar [n]), freeText(true)
// NOTE(review): presumably releases text_ only while freeText is still true
// -- confirm.
226 ~TextStorageBuilder()
234 // Write access to text[]
235 uchar& operator[] (TextPosition i)
241 // Type defaults to plain text.
// Creates a heap-allocated TextStorage of the requested type from the buffer.
// The buffer pointer is passed to the new object, so freeText is cleared
// first. An unknown type is reported on std::cerr.
// NOTE(review): freeText is cleared before the type is validated -- confirm
// what happens to the buffer on the unknown-type path.
242 TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT)
244 freeText = false; // Passing text to TextStorage.
247 case (TextStorage::TYPE_PLAIN_TEXT):
248 return new TextStoragePlainText(text_, n_);
249 case (TextStorage::TYPE_LZ_INDEX):
250 return new TextStorageLzIndex(text_, n_);
252 std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl;
261 }; // class TextStorageBuilder
265 #endif // #ifndef _TextStorage_H_