--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2010 by Niko Välimäki *
+ * *
+ * RLCSA implementation for the TextCollection interface *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#ifndef _RLCSAWrapper_H_
+#define _RLCSAWrapper_H_
+
+#include "TextCollection.h"
+
+#include "incbwt/rlcsa.h"
+
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+
+#include <set>
+#include <string>
+
+namespace SXSI
+{
+
+/**
+ * Partial implementation of the TextCollection interface
+ *
+ * Supports index construction, save, load and simple search.
+ * Use FMIndex implementation for full support.
+ */
+class RLCSAWrapper : public SXSI::TextCollection
+{
+public:
+ RLCSAWrapper(const CSA::RLCSA* index)
+ : rlcsa(index)
+ { /* NOP */ }
+
+ ~RLCSAWrapper()
+ {
+ delete rlcsa; rlcsa = 0;
+ }
+
+ bool EmptyText(DocId k) const
+ {
+ return false; // Empty texts are not indexed
+ }
+
+ /**
+ * Extracting one text.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId i) const
+ {
+ return rlcsa->display(i);
+ }
+ void DeleteText(uchar *text) const
+ {
+ delete [] text;
+ }
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId i, DocId j) const
+ {
+ std::cerr << "RLCSAWrapper::GetText(i,j): unsupported method!" << std::endl
+ << "Use the default (FMIndex) text collection instead!" << std::endl;
+ std::exit(1);
+ }
+
+ bool IsPrefix(uchar const *) const { unsupported(); return false; };
+ bool IsSuffix(uchar const *) const { unsupported(); return false; };
+ bool IsEqual(uchar const *) const { unsupported(); return false; };
+ bool IsContains(uchar const *) const { unsupported(); return false; };
+ bool IsLessThan(uchar const *) const { unsupported(); return false; };
+
+ bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
+ bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
+
+ ulong Count(uchar const *pattern) const
+ {
+// std::pair<TextPosition, TextPosition> p = backwards(pattern);
+ CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
+
+ if (p.first <= p.second)
+ return p.second - p.first + 1;
+ else
+ return 0;
+ }
+
+ unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
+ unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
+ unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
+ unsigned CountContains(uchar const *) const { unsupported(); return 0; };
+ unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
+
+ unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+ unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
+
+ // Definition of document_result is inherited from SXSI::TextCollection.
+ document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
+ document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
+ document_result Equal(uchar const *) const { unsupported(); return document_result(); };
+ document_result Contains(uchar const *pattern) const
+ {
+ //std::pair<TextPosition,TextPosition> p = backwards(pattern);
+ CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
+ if (p.first > p.second)
+ return document_result();
+
+ document_result dr;
+ dr.reserve(p.second - p.first + 2);
+ for (ulong i = p.first; i <= p.second; ++i)
+ dr.push_back(rlcsa->getSequenceForPosition(rlcsa->locate(i)));
+ return dr;
+ }
+ document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
+ document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
+ document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
+
+ document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+ document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
+
+ // Definition of full_result is inherited from SXSI::TextCollection.
+ full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
+ full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
+ full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
+ full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
+
+ // Index from/to disk
+ RLCSAWrapper(FILE *file, char const *filename)
+ : rlcsa(new CSA::RLCSA(std::string(filename)))
+ { /* NOP */ }
+
+ void Save(FILE *file, char const *filename) const
+ {
+ const char type = 'R';
+ // Saving type info:
+ if (std::fwrite(&type, 1, 1, file) != 1)
+ throw std::runtime_error("RLCSAWrapper::Save(): file write error (type flag).");
+
+ this->rlcsa->writeTo(std::string(filename));
+ }
+
+private:
+ const CSA::RLCSA* rlcsa;
+
+/* std::pair<TextPosition, TextPosition> backwards(uchar const *pattern) const
+ {
+ TextPosition i = std::strlen((char const *)pattern) - 1;
+ int c = (int)pattern[i];
+
+ std::cerr << "\nFirst symb = " << (char)c << std::endl;
+
+ TextPosition sp = rlcsa->C(c);
+ TextPosition ep = rlcsa->C(c+1)-1;
+ printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ while (sp<=ep && i>=1)
+ {
+ c = (int)pattern[--i];
+ sp = LF(c, sp);
+ ep = LF(c, ep+1)-1;
+ printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep);
+ }
+
+ uchar* found = rlcsa->display(sp, std::strlen((char const *)pattern), 5, i);
+ std::cerr << "found: " << found << " (" << i << std::endl;
+
+ return std::make_pair(sp, ep);
+ }*/
+
+ inline TextCollection::TextPosition
+ LF(uchar c, TextPosition i) const
+ {
+ if(i == (TextPosition)-1 || i < this->rlcsa->getNumberOfSequences())
+ { return this->rlcsa->C(c); }
+ return this->rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
+ }
+
+
+ void unsupported() const
+ {
+ std::cerr << std::endl << "-------------------------------------------------------------\n"
+ << "RLCSAWrapper: unsupported method!\nSee RLCSAWrapper.h for more details.\n"
+ << "The default index (FMIndex) implements this method!" << std::endl;
+ std::exit(5);
+ }
+}; // class RLCSAWrapper
+
+} // namespace SXSI
+
+#endif