1 /******************************************************************************
2 * Copyright (C) 2010 by Niko Välimäki *
4 * FMIndex implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
22 #ifndef _SWCSAWrapper_H_
23 #define _SWCSAWrapper_H_
25 #include "TextCollection.h"
27 #include "TextStorage.h"
29 #include "incbwt/bits/deltavector.h"
30 // Re-define word size to ulong:
38 #define SWCSAWRAPPER_VERSION_FLAG 101
40 #include "swcsa/utils/defValues.h"
41 #include "swcsa/utils/valstring.h"
42 #include "swcsa/interface.h"
51 * Partial implementation of the TextCollection interface
53 * Supports index construction, save, load and simple search.
54 * Use FMIndex implementation for full support.
56 class SWCSAWrapper : public SXSI::TextCollection {
58 SWCSAWrapper(uchar * text, ulong length, unsigned samplerate,
59 unsigned numberOfTexts_)
60 : index(0), offsets(0), n(length), seSize(0), numberOfTexts(numberOfTexts_)
62 // Inicializes the arrays used to detect if a char is valid or not.
65 // Delta encoded bitvector of text offsets.
66 CSA::DeltaEncoder encoder(32);
67 encoder.setBit(0); // Start of the first text.
69 // Construct offset map from words to text elements
72 uchar const *pbeg,*pend;
78 //parsing either a word or separator.
83 std::cerr << "SWCSAWrapper constructor: assert failed, *pbeg == 0" << std::endl;
87 if (_Valid[*pbeg]) { //alphanumerical data
88 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
92 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
93 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
96 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
98 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
100 if (_Valid [*pbeg] ) {
101 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
103 else { // a "separator word" ...
104 size++; //the prev BLANK...
105 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
106 }//else { // a "separator word"
107 }//else ... not a unique BLANK AT THE END.
108 }//else ... starting by a BLANK...
113 encoder.setBit(seSize+1);
114 pbeg ++; // Does not affect size of word (skips the 0-byte)
118 std::cerr << "SWCSAWrapper constructor: assert failed, size == 0" << std::endl;
125 fprintf(stderr,"\n-----------------------\nnumber of words = %lu\n------------------------\n", seSize);
126 encoder.setBit(seSize);
127 offsets = new CSA::DeltaVector(encoder, seSize+1);
128 std::cerr << "Number of texts: " << numberOfTexts << " vs " << offsets->rank(seSize - 1) << std::endl;
131 snprintf(opt, 99, "sA=%u;sAinv=%u;sPsi=%u", samplerate, samplerate, samplerate);
133 int r = build_index(text, n, opt, &index);
136 std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
143 int r = free_index (index);
146 std::cout << "SWCSAWrapper destructor error: " << error_index(r) << std::endl;
150 delete offsets; offsets = 0;
153 bool EmptyText(DocId k) const
155 assert(k < (DocId)numberOfTexts);
156 return false; // Empty texts are not indexed
160 * Extracting one text.
162 * Call DeleteText() for each pointer returned by GetText()
163 * to avoid possible memory leaks.
165 uchar * GetText(DocId i) const
167 return GetText(i, i);
169 void DeleteText(uchar *text) const
175 * Returns a pointer to the beginning of texts i, i+1, ..., j.
176 * Texts are separated by a '\0' byte.
178 * Call DeleteText() for each pointer returned by GetText()
179 * to avoid possible memory leaks.
181 uchar * GetText(DocId i, DocId j) const
185 from = offsets->select(i);
186 to = offsets->select(i+1); // ADD one 1-bit in to end!!!
188 int r = extractWords(index, from, to, &text, &l);
191 std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
198 bool IsPrefix(uchar const *) const { unsupported(); return false; };
199 bool IsSuffix(uchar const *) const { unsupported(); return false; };
200 bool IsEqual(uchar const *) const { unsupported(); return false; };
201 bool IsContains(uchar const *) const { unsupported(); return false; };
202 bool IsLessThan(uchar const *) const { unsupported(); return false; };
204 bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
205 bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
206 bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
207 bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
208 bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
210 ulong Count(uchar const *pattern) const
213 // FIXME Const correctness is broken!
214 int r = count (index, (uchar *)pattern, std::strlen((char const *)pattern), &occs);
217 std::cout << "SWCSAWrapper::Count error " << error_index(r) << std::endl;
222 unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
223 unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
224 unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
225 unsigned CountContains(uchar const *) const { unsupported(); return 0; };
226 unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
228 unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
229 unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
230 unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
231 unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
232 unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
234 // Definition of document_result is inherited from SXSI::TextCollection.
235 document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
236 document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
237 document_result Equal(uchar const *) const { unsupported(); return document_result(); };
238 document_result Contains(uchar const *pattern) const
240 ulong *occ = 0, numocc = 0;
241 // FIXME Const correctness is broken!
242 int r = locateWord(index, (uchar *)pattern, std::strlen((char const *)pattern), &occ, &numocc, 0);
245 std::cout << "SWCSAWrapper::Contains error: " << error_index(r) << std::endl;
250 dr.reserve(numocc+1);
251 for (ulong i = 0; i < numocc; ++i)
252 dr.push_back(offsets->rank(occ[i])-1);
257 document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
258 document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
259 document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
261 document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
262 document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
263 document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
264 document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
265 document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
267 // Definition of full_result is inherited from SXSI::TextCollection.
268 full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
269 full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
270 full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
271 full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
273 // Index from/to disk
274 SWCSAWrapper(FILE *file, char const *filename)
275 : index(0), offsets(0), n(0), seSize(0), numberOfTexts(0)
278 if (std::fread(&verFlag, 1, 1, file) != 1)
279 throw std::runtime_error("SWCSAWrapper::Load(): file read error (version flag).");
280 if (verFlag != SWCSAWRAPPER_VERSION_FLAG)
281 throw std::runtime_error("SWCSAWrapper::Load(): invalid save file version.");
283 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
284 throw std::runtime_error("SWCSAWrapper::Load(): file read error (n).");
285 if (std::fread(&seSize, sizeof(ulong), 1, file) != 1)
286 throw std::runtime_error("SWCSAWrapper::Load(): file read error (seSize).");
287 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
288 throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts).");
290 offsets = new CSA::DeltaVector(file);
292 // FIXME Const correctness is broken!
293 int r = load_index((char *)filename, &index);
296 std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
301 void Save(FILE *file, char const *filename) const
303 const char type = 'W';
305 if (std::fwrite(&type, 1, 1, file) != 1)
306 throw std::runtime_error("SWCSAWrapper::Save(): file write error (type flag).");
308 const uchar ver = SWCSAWRAPPER_VERSION_FLAG;
309 // Saving version info:
310 if (std::fwrite(&ver, 1, 1, file) != 1)
311 throw std::runtime_error("SWCSAWrapper::Save(): file write error (version flag).");
313 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
314 throw std::runtime_error("SWCSAWrapper::Save(): file write error (n).");
315 if (std::fwrite(&(this->seSize), sizeof(ulong), 1, file) != 1)
316 throw std::runtime_error("SWCSAWrapper::Save(): file write error (seSize).");
317 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
318 throw std::runtime_error("SWCSAWrapper::Save(): file write error (numberOfTexts).");
320 offsets->writeTo(file);
322 // FIXME Const correctness is broken!
323 int r = save_index(index, (char *)filename);
326 std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
333 CSA::DeltaVector *offsets;
338 // Total number of texts in the collection
339 unsigned numberOfTexts;
341 void unsupported() const
343 std::cerr << std::endl << "-------------------------------------------------------------\n"
344 << "SWCSAWrapper: unsupported method!\nSee SWCSAWrapper.h for more details.\n"
345 << "The default index (FMIndex) implements this method!" << std::endl;
348 }; // class SWCSAWrapper