1 /******************************************************************************
2 * Copyright (C) 2010 by Niko Välimäki *
4 * RLCSA implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
22 #ifndef _RLCSAWrapper_H_
23 #define _RLCSAWrapper_H_
25 #include "TextCollection.h"
26 #include "IndelQuery.h"
28 #include "incbwt/rlcsa.h"
30 // Re-define word size to ulong:
46 * Partial implementation of the TextCollection interface
48 * Supports index construction, save, load and simple search.
49 * Use FMIndex implementation for full support.
51 class RLCSAWrapper : public SXSI::TextCollection
54 RLCSAWrapper(const CSA::RLCSA* index)
57 // Init the edit distance look-up tables
58 MyersEditDistanceIncremental::initMyersFourRussians();
63 delete rlcsa; rlcsa = 0;
66 bool EmptyText(DocId k) const
68 return false; // Empty texts are not indexed
72 * Extracting one text.
74 * Call DeleteText() for each pointer returned by GetText()
75 * to avoid possible memory leaks.
77 uchar * GetText(DocId i) const
79 return rlcsa->display(i);
81 void DeleteText(uchar *text) const
87 * Returns a pointer to the beginning of texts i, i+1, ..., j.
88 * Texts are separated by a '\0' byte.
90 * Call DeleteText() for each pointer returned by GetText()
91 * to avoid possible memory leaks.
93 uchar * GetText(DocId i, DocId j) const
95 std::cerr << "RLCSAWrapper::GetText(i,j): unsupported method!" << std::endl
96 << "Use the default (FMIndex) text collection instead!" << std::endl;
100 bool IsPrefix(uchar const *) const { unsupported(); return false; };
101 bool IsSuffix(uchar const *) const { unsupported(); return false; };
102 bool IsEqual(uchar const *) const { unsupported(); return false; };
103 bool IsContains(uchar const *) const { unsupported(); return false; };
104 bool IsLessThan(uchar const *) const { unsupported(); return false; };
106 bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
107 bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
108 bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
109 bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
110 bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
112 ulong Count(uchar const *pattern) const
114 // std::pair<TextPosition, TextPosition> p = backwards(pattern);
115 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
117 if (p.first <= p.second)
118 return p.second - p.first + 1;
123 unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
124 unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
125 unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
126 unsigned CountContains(uchar const *) const { unsupported(); return 0; };
127 unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
129 unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
130 unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
131 unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
132 unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
133 unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
135 // Definition of document_result is inherited from SXSI::TextCollection.
136 document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
137 document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
138 document_result Equal(uchar const *) const { unsupported(); return document_result(); };
140 document_result Contains(uchar const *pattern) const
142 /*************************************************************************
143 * Approximate pattern matching
145 * Using suffix filters. Has to enumerate *all* approx. occurrences (sloooow...)
146 * instead of just returning the best occurrence (which is usually much faster).
148 * Query format: contains("APM 3 GATTACA")
150 * "APM" is the keyword for approximate queries.
151 * "3" is the maximum edit distance allowed.
152 * "GATTACA" is the query word to be aligned.
154 if (strncmp((char const *)pattern, "APM ", 4) == 0)
156 // Edit distance allowed.
157 int k = std::atoi((char const *)pattern + 4);
158 if (k < 0 || k == INT_MAX || k == INT_MIN)
159 goto exact_pattern_matching; // Invalid format
161 // Find the start of the pattern (i.e. the second ' ')
162 uchar const * tmp = pattern + 4;
163 while (*tmp != ' ' && *tmp != 0) ++tmp;
164 if (*tmp != ' ' || tmp == pattern + 4)
165 goto exact_pattern_matching; // Invalid format
168 //std::cerr << "Pattern: " << tmp+1 << ", k = " << k << std::endl;
169 return iq.align(tmp+1, k);
172 /*************************************************************************
173 * Exact pattern matching
175 exact_pattern_matching:
176 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
177 if (p.first > p.second)
178 return document_result();
181 dr.reserve(p.second - p.first + 2);
182 for (ulong i = p.first; i <= p.second; ++i)
183 dr.push_back(rlcsa->getSequenceForPosition(rlcsa->locate(i)));
186 document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
187 document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
188 document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
190 document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
191 document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
192 document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
193 document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
194 document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
196 // Definition of full_result is inherited from SXSI::TextCollection.
197 full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
198 full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
199 full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
200 full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
202 // Index from/to disk
203 RLCSAWrapper(FILE *file, char const *filename)
204 : rlcsa(new CSA::RLCSA(std::string(filename)))
207 void Save(FILE *file, char const *filename) const
209 const char type = 'R';
211 if (std::fwrite(&type, 1, 1, file) != 1)
212 throw std::runtime_error("RLCSAWrapper::Save(): file write error (type flag).");
214 this->rlcsa->writeTo(std::string(filename));
217 TextCollection::TextPosition getLength() const
219 return this->rlcsa->getSize() + this->rlcsa->getNumberOfSequences();
222 inline TextCollection::TextPosition LF(uchar c, TextPosition i) const
225 if(i < this->rlcsa->getNumberOfSequences())
227 return rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
230 uchar* getSuffix(TextPosition pos, unsigned l) const
233 uchar* text = new uchar[l + 1];
235 if(l == 0 || pos < this->rlcsa->getNumberOfSequences())
240 pos -= this->rlcsa->getNumberOfSequences();
242 unsigned n = rlcsa->displayFromPosition(pos, l, text);
247 DocId getDoc(TextPosition i) const
249 if(i < this->rlcsa->getNumberOfSequences())
252 CSA::pair_type pt = rlcsa->getRelativePosition(this->rlcsa->locate(i - this->rlcsa->getNumberOfSequences()));
257 const CSA::RLCSA* rlcsa;
259 /* std::pair<TextPosition, TextPosition> backwards(uchar const *pattern) const
261 TextPosition i = std::strlen((char const *)pattern) - 1;
262 int c = (int)pattern[i];
264 std::cerr << "\nFirst symb = " << (char)c << std::endl;
266 TextPosition sp = LF(c, -1);
267 TextPosition ep = LF(c, getLength()-1) - 1;
268 printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
269 while (sp<=ep && i>=1)
271 c = (int)pattern[--i];
274 printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep);
277 return std::make_pair(sp, ep);
280 void unsupported() const
282 std::cerr << std::endl << "-------------------------------------------------------------\n"
283 << "RLCSAWrapper: unsupported method!\nSee RLCSAWrapper.h for more details.\n"
284 << "The default index (FMIndex) implements this method!" << std::endl;
287 }; // class RLCSAWrapper