1 /******************************************************************************
2 * Copyright (C) 2009 Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #ifndef _TextStorage_H_
22 #define _TextStorage_H_
24 #include "TextCollection.h"
26 #include "incbwt/bits/deltavector.h"
34 class TextStorageBuilder;
35 class TextStoragePlainText;
36 class TextStorageLzIndex;
39 * Text collection that supports fast extraction.
40 * Defines an abstact interface class.
41 * See subclasses TextStorageLzIndex and TextStoragePlainText
48 const static char TYPE_PLAIN_TEXT = 0;
49 const static char TYPE_LZ_INDEX = 1;
51 // Call DeleteText() for each pointer returned by GetText()
52 // to avoid possible memory leaks.
53 virtual uchar * GetText(TextCollection::DocId docId) const = 0;
54 virtual uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const = 0;
55 virtual void DeleteText(uchar *) const = 0;
57 static TextStorage * Load(FILE *file);
58 virtual void Save(FILE *file) const = 0;
60 virtual ~TextStorage()
66 TextCollection::DocId DocIdAtTextPos(TextCollection::TextPosition i) const
69 return offsets_->rank(i)-1;
72 TextCollection::TextPosition TextStartPos(TextCollection::DocId i) const
74 assert(i < (TextCollection::DocId)numberOfTexts_);
75 return offsets_->select(i);
78 bool IsEndmarker(TextCollection::TextPosition i) const
83 return offsets_->isSet(i+1);
89 typedef TextCollection::TextPosition TextPosition;
90 // Block size in DeltaVector
91 const static CSA::usint DV_BLOCK_SIZE = 32;
93 TextStorage(uchar const * text, TextPosition n)
94 : n_(n), offsets_(0), numberOfTexts_(0)
96 // Delta encoded bitvector of text offsets.
97 CSA::DeltaEncoder encoder(DV_BLOCK_SIZE);
98 encoder.setBit(0); // Start of the first text.
100 // Read offsets by finding text end positions:
101 for (TextPosition i = 0; i < n_ - 1; ++i)
106 offsets_ = new CSA::DeltaVector(encoder, n_);
108 /* for (ulong i = 0; i < n_-1; ++i)
109 if ((text[i] == '\0') != IsEndmarker(i))
110 std::cout << "misplaced endmarker at i = " << i << std::endl;
112 numberOfTexts_ = offsets_->rank(n_ - 1);
115 TextStorage(std::FILE *);
116 void Save(FILE *file, char type) const;
119 CSA::DeltaVector *offsets_;
120 TextPosition numberOfTexts_;
123 /******************************************************************
124 * Plain text collection.
126 class TextStoragePlainText : public TextStorage
129 TextStoragePlainText(uchar *text, TextPosition n)
130 : TextStorage(text, n), text_(text)
133 TextStoragePlainText(FILE *file)
134 : TextStorage(file), text_(0)
136 text_ = new uchar[n_];
137 if (std::fread(this->text_, sizeof(uchar), n_, file) != n_)
138 throw std::runtime_error("TextStorage::Load(): file read error (text_).");
141 void Save(FILE *file) const
143 TextStorage::Save(file, TYPE_PLAIN_TEXT);
145 if (std::fwrite(this->text_, sizeof(uchar), n_, file) != n_)
146 throw std::runtime_error("TextStorage::Save(): file write error (text_).");
149 ~TextStoragePlainText()
156 uchar * GetText(TextCollection::DocId docId) const
158 assert(docId < (TextCollection::DocId)numberOfTexts_);
160 TextPosition offset = offsets_->select(docId);
161 return &text_[offset];
164 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const
166 assert(i < (TextCollection::DocId)numberOfTexts_);
167 assert(j < (TextCollection::DocId)numberOfTexts_);
169 TextPosition offset = offsets_->select(i);
170 return &text_[offset];
173 // No operation, since text is a pointer to this->text_
174 void DeleteText(uchar *text) const
179 }; // class TextStorage
182 /******************************************************************
183 * LZ-index text collection.
185 struct LzTriePimpl; // Pimpl, declared in .cpp
187 class TextStorageLzIndex : public TextStorage
190 TextStorageLzIndex(uchar *text, TextPosition n);
191 TextStorageLzIndex(FILE *file);
192 void Save(FILE *file) const;
193 ~TextStorageLzIndex();
194 uchar * GetText(TextCollection::DocId docId) const;
195 uchar * GetText(TextCollection::DocId i, TextCollection::DocId j) const;
197 // Free the space malloc'ed in lztrie::extract()
198 void DeleteText(uchar *text) const
204 struct LzTriePimpl * p_;
205 }; // class TextStorageLzIndex
209 * Builder for TextStorage class
211 class TextStorageBuilder
215 typedef TextCollection::TextPosition TextPosition;
217 // Build up simple uchar array
218 explicit TextStorageBuilder(TextPosition n)
219 : n_(n), text_(new uchar [n]), freeText(true)
222 ~TextStorageBuilder()
230 // Write access to text[]
231 uchar& operator[] (TextPosition i)
237 // Type defaults to plain text.
238 TextStorage * InitTextStorage(char type = TextStorage::TYPE_PLAIN_TEXT)
240 freeText = false; // Passing text to TextStorage.
243 case (TextStorage::TYPE_PLAIN_TEXT):
244 return new TextStoragePlainText(text_, n_);
245 case (TextStorage::TYPE_LZ_INDEX):
246 return new TextStorageLzIndex(text_, n_);
248 std::cerr << "TextStorageBuilder: Unknown type given!" << std::endl;
255 uchar *text_; // FIXME Replace with a succinct representation.
257 }; // class TextStorageBuilder
261 #endif // #ifndef _TextStorage_H_