--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2010 by Niko Välimäki *
+ * *
+ * FMIndex implementation for the TextCollection interface *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#ifndef _SWCSAWrapper_H_
+#define _SWCSAWrapper_H_
+
+#include "TextCollection.h"
+
+#include "TextStorage.h"
+
+#include "incbwt/bits/deltavector.h"
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+
+#define SWCSAWRAPPER_VERSION_FLAG 101
+
+#include "swcsa/utils/defValues.h"
+#include "swcsa/utils/valstring.h"
+#include "swcsa/interface.h"
+
+#include <set>
+#include <string>
+
+namespace SXSI
+{
+
+/**
+ * Partial implementation of the TextCollection interface
+ *
+ * Supports index construction, save, load and simple search.
+ * Use FMIndex implementation for full support.
+ */
+class SWCSAWrapper : public SXSI::TextCollection {
+public:
+ SWCSAWrapper(uchar * text, ulong length, unsigned samplerate,
+ unsigned numberOfTexts_)
+ : index(0), offsets(0), n(length), seSize(0), numberOfTexts(numberOfTexts_)
+ {
+ // Inicializes the arrays used to detect if a char is valid or not.
+ StartValid();
+
+ // Delta encoded bitvector of text offsets.
+ CSA::DeltaEncoder encoder(32);
+ encoder.setBit(0); // Start of the first text.
+
+ // Construct offset map from words to text elements
+ seSize = 0;
+ {
+ uchar const *pbeg,*pend;
+
+ pbeg = text;
+ pend = text+length;
+
+ while (pbeg <pend) {
+ //parsing either a word or separator.
+ unsigned size=0;
+
+ if (*pbeg == 0)
+ {
+ std::cerr << "SWCSAWrapper constructor: assert failed, *pbeg == 0" << std::endl;
+ std::exit(1);
+ }
+
+ if (_Valid[*pbeg]) { //alphanumerical data
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else
+ {
+ if (*pbeg != ' ') { //a separator comes starting in ' ' comes, so it is a new word
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+
+ }
+ else { //a SPACE comes, so we have to test if next character is alphanumerical or not
+ pbeg++;
+ if (pbeg >= pend) {size++;} // a unique BLANK at the end of the file.
+ else {
+ if (_Valid [*pbeg] ) {
+ while ( (size<MAX_SIZE_OF_WORD) && (pbeg<pend)&& ( _Valid[*pbeg] )) {size++;pbeg++;}
+ }
+ else { // a "separator word" ...
+ size++; //the prev BLANK...
+ while ( (size<MAX_SIZE_OF_GAP) && (pbeg<pend) && (!_Valid[*pbeg] && *pbeg != 0)) {size++;pbeg++;}
+ }//else { // a "separator word"
+ }//else ... not a unique BLANK AT THE END.
+ }//else ... starting by a BLANK...
+ }
+
+ if (*pbeg == 0)
+ {
+ encoder.setBit(seSize+1);
+ pbeg ++; // Does not affect size of word (skips the 0-byte)
+ }
+ if (size == 0)
+ {
+ std::cerr << "SWCSAWrapper constructor: assert failed, size == 0" << std::endl;
+ std::exit(1);
+ }
+
+ seSize ++;
+ }//while pbeg<pend
+ }//1st pass ends
+ fprintf(stderr,"\n-----------------------\nnumber of words = %lu\n------------------------\n", seSize);
+ encoder.setBit(seSize);
+ offsets = new CSA::DeltaVector(encoder, seSize+1);
+ std::cerr << "Number of texts: " << numberOfTexts << " vs " << offsets->rank(seSize - 1) << std::endl;
+
+ char opt[100];
+ snprintf(opt, 99, "sA=%u;sAinv=%u;sPsi=%u", samplerate, samplerate, samplerate);
+
+ int r = build_index(text, n, opt, &index);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ }
+
+ ~SWCSAWrapper()
+ {
+ int r = free_index (index);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper destructor error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ index = 0;
+ delete offsets; offsets = 0;
+ }
+
+ bool EmptyText(DocId k) const
+ {
+ assert(k < (DocId)numberOfTexts);
+ return false; // Empty texts are not indexed
+ }
+
+ /**
+ * Extracting one text.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId i) const
+ {
+ return GetText(i, i);
+ }
+ void DeleteText(uchar *text) const
+ {
+ free(text);
+ }
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId i, DocId j) const
+ {
+ ulong from, to, l;
+ uchar *text;
+ from = offsets->select(i);
+ to = offsets->select(i+1); // ADD one 1-bit in to end!!!
+
+ int r = extractWords(index, from, to, &text, &l);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ text[l] = 0;
+ return text;
+ }
+
+ bool IsPrefix(uchar const *) const { unsupported(); return false; };
+ bool IsSuffix(uchar const *) const { unsupported(); return false; };
+ bool IsEqual(uchar const *) const { unsupported(); return false; };
+ bool IsContains(uchar const *) const { unsupported(); return false; };
+ bool IsLessThan(uchar const *) const { unsupported(); return false; };
+
+ bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
+
+ ulong Count(uchar const *pattern) const
+ {
+ ulong occs = 0;
+ // FIXME Const correctness is broken!
+ int r = count (index, (uchar *)pattern, std::strlen((char const *)pattern), &occs);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper::Count error " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ return occs;
+ }
+ unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
+ unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
+ unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
+ unsigned CountContains(uchar const *) const { unsupported(); return 0; };
+ unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
+
+ unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+
+ // Definition of document_result is inherited from SXSI::TextCollection.
+ document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
+ document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
+ document_result Equal(uchar const *) const { unsupported(); return document_result(); };
+ document_result Contains(uchar const *pattern) const
+ {
+ ulong *occ = 0, numocc = 0;
+ // FIXME Const correctness is broken!
+ int r = locateWord(index, (uchar *)pattern, std::strlen((char const *)pattern), &occ, &numocc, 0);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper::Contains error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+
+ document_result dr;
+ dr.reserve(numocc+1);
+ for (ulong i = 0; i < numocc; ++i)
+ dr.push_back(offsets->rank(occ[i])-1);
+
+ free(occ);
+ return dr;
+ }
+ document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
+ document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
+ document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
+
+ document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+
+ // Definition of full_result is inherited from SXSI::TextCollection.
+ full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
+ full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
+ full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
+ full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
+
+ // Index from/to disk
+ SWCSAWrapper(FILE *file, char const *filename)
+ : index(0), offsets(0), n(0), seSize(0), numberOfTexts(0)
+ {
+ uchar verFlag = 0;
+ if (std::fread(&verFlag, 1, 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Load(): file read error (version flag).");
+ if (verFlag != SWCSAWRAPPER_VERSION_FLAG)
+ throw std::runtime_error("SWCSAWrapper::Load(): invalid save file version.");
+
+ if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Load(): file read error (n).");
+ if (std::fread(&seSize, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Load(): file read error (seSize).");
+ if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts).");
+
+ offsets = new CSA::DeltaVector(file);
+
+ // FIXME Const correctness is broken!
+ int r = load_index((char *)filename, &index);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ }
+
+ void Save(FILE *file, char const *filename) const
+ {
+ const char type = 'W';
+ // Saving type info:
+ if (std::fwrite(&type, 1, 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Save(): file write error (type flag).");
+
+ const uchar ver = SWCSAWRAPPER_VERSION_FLAG;
+ // Saving version info:
+ if (std::fwrite(&ver, 1, 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Save(): file write error (version flag).");
+
+ if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Save(): file write error (n).");
+ if (std::fwrite(&(this->seSize), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Save(): file write error (seSize).");
+ if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("SWCSAWrapper::Save(): file write error (numberOfTexts).");
+
+ offsets->writeTo(file);
+
+ // FIXME Const correctness is broken!
+ int r = save_index(index, (char *)filename);
+ if (r)
+ {
+ std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl;
+ std::exit(r);
+ }
+ }
+
+private:
+ void *index;
+ CSA::DeltaVector *offsets;
+
+ TextPosition n;
+ ulong seSize;
+
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+
+ void unsupported() const
+ {
+ std::cerr << std::endl << "-------------------------------------------------------------\n"
+ << "SWCSAWrapper: unsupported method!\nSee SWCSAWrapper.h for more details.\n"
+ << "The default index (FMIndex) implements this method!" << std::endl;
+ std::exit(5);
+ }
+}; // class SWCSAWrapper
+
+} // namespace SXSI
+
+#endif