1 /******************************************************************************
2 * Copyright (C) 2010 by Niko Välimäki *
4 * RLCSA implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
22 #ifndef _RLCSAWrapper_H_
23 #define _RLCSAWrapper_H_
25 #include "TextCollection.h"
27 #include "incbwt/rlcsa.h"
29 // Re-define word size to ulong:
44 * Partial implementation of the TextCollection interface
46 * Supports index construction, save, load and simple search.
47 * Use FMIndex implementation for full support.
49 class RLCSAWrapper : public SXSI::TextCollection
52 RLCSAWrapper(const CSA::RLCSA* index)
58 delete rlcsa; rlcsa = 0;
61 bool EmptyText(DocId k) const
63 return false; // Empty texts are not indexed
67 * Extracting one text.
69 * Call DeleteText() for each pointer returned by GetText()
70 * to avoid possible memory leaks.
72 uchar * GetText(DocId i) const
74 return rlcsa->display(i);
76 void DeleteText(uchar *text) const
82 * Returns a pointer to the beginning of texts i, i+1, ..., j.
83 * Texts are separated by a '\0' byte.
85 * Call DeleteText() for each pointer returned by GetText()
86 * to avoid possible memory leaks.
88 uchar * GetText(DocId i, DocId j) const
90 std::cerr << "RLCSAWrapper::GetText(i,j): unsupported method!" << std::endl
91 << "Use the default (FMIndex) text collection instead!" << std::endl;
95 bool IsPrefix(uchar const *) const { unsupported(); return false; };
96 bool IsSuffix(uchar const *) const { unsupported(); return false; };
97 bool IsEqual(uchar const *) const { unsupported(); return false; };
98 bool IsContains(uchar const *) const { unsupported(); return false; };
99 bool IsLessThan(uchar const *) const { unsupported(); return false; };
101 bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
102 bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
103 bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
104 bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
105 bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
107 ulong Count(uchar const *pattern) const
109 // std::pair<TextPosition, TextPosition> p = backwards(pattern);
110 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
112 if (p.first <= p.second)
113 return p.second - p.first + 1;
118 unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
119 unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
120 unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
121 unsigned CountContains(uchar const *) const { unsupported(); return 0; };
122 unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
124 unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
125 unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
126 unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
127 unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
128 unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
130 // Definition of document_result is inherited from SXSI::TextCollection.
131 document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
132 document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
133 document_result Equal(uchar const *) const { unsupported(); return document_result(); };
134 document_result Contains(uchar const *pattern) const
136 //std::pair<TextPosition,TextPosition> p = backwards(pattern);
137 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
138 if (p.first > p.second)
139 return document_result();
142 dr.reserve(p.second - p.first + 2);
143 for (ulong i = p.first; i <= p.second; ++i)
144 dr.push_back(rlcsa->getSequenceForPosition(rlcsa->locate(i)));
147 document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
148 document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
149 document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
151 document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
152 document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
153 document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
154 document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
155 document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
157 // Definition of full_result is inherited from SXSI::TextCollection.
158 full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
159 full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
160 full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
161 full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
163 // Index from/to disk
164 RLCSAWrapper(FILE *file, char const *filename)
165 : rlcsa(new CSA::RLCSA(std::string(filename)))
168 void Save(FILE *file, char const *filename) const
170 const char type = 'R';
172 if (std::fwrite(&type, 1, 1, file) != 1)
173 throw std::runtime_error("RLCSAWrapper::Save(): file write error (type flag).");
175 this->rlcsa->writeTo(std::string(filename));
179 const CSA::RLCSA* rlcsa;
181 /* std::pair<TextPosition, TextPosition> backwards(uchar const *pattern) const
183 TextPosition i = std::strlen((char const *)pattern) - 1;
184 int c = (int)pattern[i];
186 std::cerr << "\nFirst symb = " << (char)c << std::endl;
188 TextPosition sp = rlcsa->C(c);
189 TextPosition ep = rlcsa->C(c+1)-1;
190 printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
191 while (sp<=ep && i>=1)
193 c = (int)pattern[--i];
196 printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep);
199 uchar* found = rlcsa->display(sp, std::strlen((char const *)pattern), 5, i);
200 std::cerr << "found: " << found << " (" << i << std::endl;
202 return std::make_pair(sp, ep);
205 inline TextCollection::TextPosition
206 LF(uchar c, TextPosition i) const
208 if(i == (TextPosition)-1 || i < this->rlcsa->getNumberOfSequences())
209 { return this->rlcsa->C(c); }
210 return this->rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
214 void unsupported() const
216 std::cerr << std::endl << "-------------------------------------------------------------\n"
217 << "RLCSAWrapper: unsupported method!\nSee RLCSAWrapper.h for more details.\n"
218 << "The default index (FMIndex) implements this method!" << std::endl;
221 }; // class RLCSAWrapper