1 /******************************************************************************
2 * Copyright (C) 2010 by Niko Välimäki *
4 * SWCSA implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
22 #ifndef _SWCSAWrapper_H_
23 #define _SWCSAWrapper_H_
25 #include "TextCollection.h"
27 #include "TextStorage.h"
29 #include "incbwt/bits/deltavector.h"
30 // Re-define word size to ulong:
38 #define SWCSAWRAPPER_VERSION_FLAG 101
40 #include "swcsa/utils/defValues.h"
41 #include "swcsa/utils/valstring.h"
42 #include "swcsa/interface.h"
51 * Partial implementation of the TextCollection interface
53 * Supports index construction, save, load and simple search.
54 * Use FMIndex implementation for full support.
56 class SWCSAWrapper : public SXSI::TextCollection {
58 SWCSAWrapper(uchar * text, ulong length, unsigned samplerate,
59 unsigned numberOfTexts_)
60 : index(0), offsets(0), offit(0), n(length), seSize(0), numberOfTexts(numberOfTexts_)
62 // Inicializes the arrays used to detect if a char is valid or not.
65 // Delta encoded bitvector of text offsets.
66 CSA::DeltaEncoder encoder(32);
67 encoder.setBit(0); // Start of the first text.
69 // Construct offset map from words to text elements
72 uchar const *pbeg,*pend;
78 //parsing either a word or separator.
83 std::cerr << "SWCSAWrapper constructor: assert failed, *pbeg == 0" << std::endl;
87 if (_Valid[*pbeg]) { //alphanumerical data
88 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
92 if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
93 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
96 else { //a SPACE comes, so we have to test if next character is alphanumerical or not
98 if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
100 if (_Valid [*pbeg] ) {
101 while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
103 else { // a "separator word" ...
104 size++; //the prev BLANK...
105 while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
106 }//else { // a "separator word"
107 }//else ... not a unique BLANK AT THE END.
108 }//else ... starting by a BLANK...
113 encoder.setBit(seSize+1);
114 pbeg ++; // Does not affect size of word (skips the 0-byte)
118 std::cerr << "SWCSAWrapper constructor: assert failed, size == 0" << std::endl;
125 fprintf(stderr,"\n-----------------------\nnumber of words = %lu\n------------------------\n", seSize);
126 encoder.setBit(seSize);
127 offsets = new CSA::DeltaVector(encoder, seSize+1);
128 offit = new CSA::DeltaVector::Iterator(*offsets);
130 std::cerr << "Number of texts: " << numberOfTexts << " vs " << offit->rank(seSize - 1) << std::endl;
133 snprintf(opt, 99, "sA=%u;sAinv=%u;sPsi=%u", samplerate, samplerate, samplerate);
135 int r = build_index(text, n, opt, &index);
138 std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
145 int r = free_index (index);
148 std::cout << "SWCSAWrapper destructor error: " << error_index(r) << std::endl;
152 delete offit; offit = 0;
153 delete offsets; offsets = 0;
156 bool EmptyText(DocId k) const
158 assert(k < (DocId)numberOfTexts);
159 return false; // Empty texts are not indexed
163 * Extracting one text.
165 * Call DeleteText() for each pointer returned by GetText()
166 * to avoid possible memory leaks.
168 uchar * GetText(DocId i) const
170 return GetText(i, i);
172 void DeleteText(uchar *text) const
178 * Returns a pointer to the beginning of texts i, i+1, ..., j.
179 * Texts are separated by a '\0' byte.
181 * Call DeleteText() for each pointer returned by GetText()
182 * to avoid possible memory leaks.
184 uchar * GetText(DocId i, DocId j) const
188 from = offit->select(i);
189 to = offit->select(i+1); // ADD one 1-bit in to end!!!
191 int r = extractWords(index, from, to, &text, &l);
194 std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
201 bool IsPrefix(uchar const *) const { unsupported(); return false; };
202 bool IsSuffix(uchar const *) const { unsupported(); return false; };
203 bool IsEqual(uchar const *) const { unsupported(); return false; };
204 bool IsContains(uchar const *) const { unsupported(); return false; };
205 bool IsLessThan(uchar const *) const { unsupported(); return false; };
207 bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
208 bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
209 bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
210 bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
211 bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
213 ulong Count(uchar const *pattern) const
216 // FIXME Const correctness is broken!
217 int r = count (index, (uchar *)pattern, std::strlen((char const *)pattern), &occs);
220 std::cout << "SWCSAWrapper::Count error " << error_index(r) << std::endl;
225 unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
226 unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
227 unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
228 unsigned CountContains(uchar const *) const { unsupported(); return 0; };
229 unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
231 unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
232 unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
233 unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
234 unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
235 unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
237 // Definition of document_result is inherited from SXSI::TextCollection.
238 document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
239 document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
240 document_result Equal(uchar const *) const { unsupported(); return document_result(); };
241 document_result Contains(uchar const *pattern) const
243 ulong *occ = 0, numocc = 0;
244 // FIXME Const correctness is broken!
245 int r = locateWord(index, (uchar *)pattern, std::strlen((char const *)pattern), &occ, &numocc, 0);
248 std::cout << "SWCSAWrapper::Contains error: " << error_index(r) << std::endl;
253 dr.reserve(numocc+1);
254 for (ulong i = 0; i < numocc; ++i)
255 dr.push_back(offit->rank(occ[i])-1);
260 document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
261 document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
262 document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
264 document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
265 document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
266 document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
267 document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
268 document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
270 // Definition of full_result is inherited from SXSI::TextCollection.
271 full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
272 full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
273 full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
274 full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
276 // Index from/to disk
277 SWCSAWrapper(FILE *file, char const *filename)
278 : index(0), offsets(0), offit(0), n(0), seSize(0), numberOfTexts(0)
281 if (std::fread(&verFlag, 1, 1, file) != 1)
282 throw std::runtime_error("SWCSAWrapper::Load(): file read error (version flag).");
283 if (verFlag != SWCSAWRAPPER_VERSION_FLAG)
284 throw std::runtime_error("SWCSAWrapper::Load(): invalid save file version.");
286 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
287 throw std::runtime_error("SWCSAWrapper::Load(): file read error (n).");
288 if (std::fread(&seSize, sizeof(ulong), 1, file) != 1)
289 throw std::runtime_error("SWCSAWrapper::Load(): file read error (seSize).");
290 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
291 throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts).");
293 offsets = new CSA::DeltaVector(file);
294 offit = new CSA::DeltaVector::Iterator(*offsets);
296 // FIXME Const correctness is broken!
297 int r = load_index((char *)filename, &index);
300 std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
305 void Save(FILE *file, char const *filename) const
307 const char type = 'W';
309 if (std::fwrite(&type, 1, 1, file) != 1)
310 throw std::runtime_error("SWCSAWrapper::Save(): file write error (type flag).");
312 const uchar ver = SWCSAWRAPPER_VERSION_FLAG;
313 // Saving version info:
314 if (std::fwrite(&ver, 1, 1, file) != 1)
315 throw std::runtime_error("SWCSAWrapper::Save(): file write error (version flag).");
317 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
318 throw std::runtime_error("SWCSAWrapper::Save(): file write error (n).");
319 if (std::fwrite(&(this->seSize), sizeof(ulong), 1, file) != 1)
320 throw std::runtime_error("SWCSAWrapper::Save(): file write error (seSize).");
321 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
322 throw std::runtime_error("SWCSAWrapper::Save(): file write error (numberOfTexts).");
324 offsets->writeTo(file);
326 // FIXME Const correctness is broken!
327 int r = save_index(index, (char *)filename);
330 std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
337 CSA::DeltaVector *offsets;
338 CSA::DeltaVector::Iterator *offit;
343 // Total number of texts in the collection
344 unsigned numberOfTexts;
346 void unsupported() const
348 std::cerr << std::endl << "-------------------------------------------------------------\n"
349 << "SWCSAWrapper: unsupported method!\nSee SWCSAWrapper.h for more details.\n"
350 << "The default index (FMIndex) implements this method!" << std::endl;
353 }; // class SWCSAWrapper