X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=RLCSAWrapper.h;fp=RLCSAWrapper.h;h=4e22fe8c83311396a56132a142dae056eca73b6b;hb=13e254b7c0ee22dffbc7c3125cee0408f9b375da;hp=0000000000000000000000000000000000000000;hpb=e4b6bdc7cc2a1372e4d4dae50acac55cebcc7e9b;p=SXSI%2FTextCollection.git diff --git a/RLCSAWrapper.h b/RLCSAWrapper.h new file mode 100644 index 0000000..4e22fe8 --- /dev/null +++ b/RLCSAWrapper.h @@ -0,0 +1,225 @@ +/****************************************************************************** + * Copyright (C) 2010 by Niko Välimäki * + * * + * RLCSA implementation for the TextCollection interface * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + *****************************************************************************/ + +#ifndef _RLCSAWrapper_H_ +#define _RLCSAWrapper_H_ + +#include "TextCollection.h" + +#include "incbwt/rlcsa.h" + +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + +#include +#include + +namespace SXSI +{ + +/** + * Partial implementation of the TextCollection interface + * + * Supports index construction, save, load and simple search. + * Use FMIndex implementation for full support. + */ +class RLCSAWrapper : public SXSI::TextCollection +{ +public: + RLCSAWrapper(const CSA::RLCSA* index) + : rlcsa(index) + { /* NOP */ } + + ~RLCSAWrapper() + { + delete rlcsa; rlcsa = 0; + } + + bool EmptyText(DocId k) const + { + return false; // Empty texts are not indexed + } + + /** + * Extracting one text. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId i) const + { + return rlcsa->display(i); + } + void DeleteText(uchar *text) const + { + delete [] text; + } + + /** + * Returns a pointer to the beginning of texts i, i+1, ..., j. + * Texts are separated by a '\0' byte. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId i, DocId j) const + { + std::cerr << "RLCSAWrapper::GetText(i,j): unsupported method!" << std::endl + << "Use the default (FMIndex) text collection instead!" << std::endl; + std::exit(1); + } + + bool IsPrefix(uchar const *) const { unsupported(); return false; }; + bool IsSuffix(uchar const *) const { unsupported(); return false; }; + bool IsEqual(uchar const *) const { unsupported(); return false; }; + bool IsContains(uchar const *) const { unsupported(); return false; }; + bool IsLessThan(uchar const *) const { unsupported(); return false; }; + + bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; }; + bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; }; + bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; }; + bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; }; + bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; }; + + ulong Count(uchar const *pattern) const + { +// std::pair p = backwards(pattern); + CSA::pair_type p = rlcsa->count(std::string((char const *)pattern)); + + if (p.first <= p.second) + return p.second - p.first + 1; + else + return 0; + } + + unsigned CountPrefix(uchar const *) const { unsupported(); return 0; }; + unsigned CountSuffix(uchar const *) const { unsupported(); return 0; }; + unsigned CountEqual(uchar const *) const { unsupported(); return 0; }; + unsigned CountContains(uchar const *) const { unsupported(); return 0; }; + unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; }; + + unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + + // Definition of document_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *) const { unsupported(); return document_result(); }; + document_result Suffix(uchar const *) const { unsupported(); return document_result(); }; + document_result Equal(uchar const *) const { unsupported(); return document_result(); }; + document_result Contains(uchar const *pattern) const + { + //std::pair p = backwards(pattern); + CSA::pair_type p = rlcsa->count(std::string((char const *)pattern)); + if (p.first > p.second) + return document_result(); + + document_result dr; + dr.reserve(p.second - p.first + 2); + for (ulong i = p.first; i <= p.second; ++i) + dr.push_back(rlcsa->getSequenceForPosition(rlcsa->locate(i))); + return dr; + } + document_result LessThan(uchar const *) const { unsupported(); return document_result(); }; + document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); }; + document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); }; + + document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + + // Definition of full_result is inherited from SXSI::TextCollection. + full_result FullContains(uchar const *) const { unsupported(); return full_result(); }; + full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); }; + full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); }; + full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); }; + + // Index from/to disk + RLCSAWrapper(FILE *file, char const *filename) + : rlcsa(new CSA::RLCSA(std::string(filename))) + { /* NOP */ } + + void Save(FILE *file, char const *filename) const + { + const char type = 'R'; + // Saving type info: + if (std::fwrite(&type, 1, 1, file) != 1) + throw std::runtime_error("RLCSAWrapper::Save(): file write error (type flag)."); + + this->rlcsa->writeTo(std::string(filename)); + } + +private: + const CSA::RLCSA* rlcsa; + +/* std::pair backwards(uchar const *pattern) const + { + TextPosition i = std::strlen((char const *)pattern) - 1; + int c = (int)pattern[i]; + + std::cerr << "\nFirst symb = " << (char)c << std::endl; + + TextPosition sp = rlcsa->C(c); + TextPosition ep = rlcsa->C(c+1)-1; + printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + while (sp<=ep && i>=1) + { + c = (int)pattern[--i]; + sp = LF(c, sp); + ep = LF(c, ep+1)-1; + printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep); + } + + uchar* found = rlcsa->display(sp, std::strlen((char const *)pattern), 5, i); + std::cerr << "found: " << found << " (" << i << std::endl; + + return std::make_pair(sp, ep); + }*/ + + inline TextCollection::TextPosition + LF(uchar c, TextPosition i) const + { + if(i == (TextPosition)-1 || i < this->rlcsa->getNumberOfSequences()) + { return this->rlcsa->C(c); } + return this->rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c); + } + + + void unsupported() const + { + std::cerr << std::endl << "-------------------------------------------------------------\n" + << "RLCSAWrapper: unsupported method!\nSee RLCSAWrapper.h for more details.\n" + << "The default index (FMIndex) implements this method!" << std::endl; + std::exit(5); + } +}; // class RLCSAWrapper + +} // namespace SXSI + +#endif