1 /******************************************************************************
2 * Copyright (C) 2010 by Niko Välimäki *
4 * RLCSA implementation for the TextCollection interface *
6 * This program is free software; you can redistribute it and/or modify *
7 * it under the terms of the GNU Lesser General Public License as published *
8 * by the Free Software Foundation; either version 2 of the License, or *
9 * (at your option) any later version. *
11 * This program is distributed in the hope that it will be useful, *
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 * GNU Lesser General Public License for more details. *
16 * You should have received a copy of the GNU Lesser General Public License *
17 * along with this program; if not, write to the *
18 * Free Software Foundation, Inc., *
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
20 *****************************************************************************/
22 #ifndef _RLCSAWrapper_H_
23 #define _RLCSAWrapper_H_
25 #include "TextCollection.h"
26 #include "IndelQuery.h"
27 #include "PssmQuery.h"
29 #include "incbwt/rlcsa.h"
31 // Re-define word size to ulong:
48 * Partial implementation of the TextCollection interface
50 * Supports index construction, save, load and simple search.
51 * Use FMIndex implementation for full support.
53 class RLCSAWrapper : public SXSI::TextCollection
56 RLCSAWrapper(const CSA::RLCSA* index)
59 // Init the edit distance look-up tables
60 MyersEditDistanceIncremental::initMyersFourRussians();
65 delete rlcsa; rlcsa = 0;
68 bool EmptyText(DocId k) const
70 return false; // Empty texts are not indexed
74 * Extracting one text.
76 * Call DeleteText() for each pointer returned by GetText()
77 * to avoid possible memory leaks.
79 uchar * GetText(DocId i) const
81 return rlcsa->display(i);
83 void DeleteText(uchar *text) const
89 * Returns a pointer to the beginning of texts i, i+1, ..., j.
90 * Texts are separated by a '\0' byte.
92 * Call DeleteText() for each pointer returned by GetText()
93 * to avoid possible memory leaks.
95 uchar * GetText(DocId i, DocId j) const
97 std::cerr << "RLCSAWrapper::GetText(i,j): unsupported method!" << std::endl
98 << "Use the default (FMIndex) text collection instead!" << std::endl;
102 bool IsPrefix(uchar const *) const { unsupported(); return false; };
103 bool IsSuffix(uchar const *) const { unsupported(); return false; };
104 bool IsEqual(uchar const *) const { unsupported(); return false; };
105 bool IsContains(uchar const *) const { unsupported(); return false; };
106 bool IsLessThan(uchar const *) const { unsupported(); return false; };
108 bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
109 bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
110 bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
111 bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
112 bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
114 ulong Count(uchar const *pattern) const
116 // std::pair<TextPosition, TextPosition> p = backwards(pattern);
117 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
119 if (p.first <= p.second)
120 return p.second - p.first + 1;
125 unsigned CountPrefix(uchar const *) const { unsupported(); return 0; };
126 unsigned CountSuffix(uchar const *) const { unsupported(); return 0; };
127 unsigned CountEqual(uchar const *) const { unsupported(); return 0; };
128 unsigned CountContains(uchar const *) const { unsupported(); return 0; };
129 unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; };
131 unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
132 unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; };
133 unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; };
134 unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; };
135 unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; };
137 // Definition of document_result is inherited from SXSI::TextCollection.
138 document_result Prefix(uchar const *) const { unsupported(); return document_result(); };
139 document_result Suffix(uchar const *) const { unsupported(); return document_result(); };
140 document_result Equal(uchar const *) const { unsupported(); return document_result(); };
142 document_result Contains(uchar const *pattern) const
144 /*************************************************************************
145 * Approximate pattern matching
147 * Using suffix filters. Has to enumerate *all* approx. occurrences (sloooow...)
148 * instead of just returning the best occurrence (which is usually much faster).
150 * Query format: contains("APM 3 GATTACA")
152 * "APM" is the keyword for approximate queries.
153 * "3" is the maximum edit distance allowed.
154 * "GATTACA" is the query word to be aligned.
156 if (strncmp((char const *)pattern, "APM ", 4) == 0)
158 // Edit distance allowed.
159 int k = std::atoi((char const *)pattern + 4);
160 if (k < 0 || k == INT_MAX || k == INT_MIN)
161 goto exact_pattern_matching; // Invalid format
163 // Find the start of the pattern (i.e. the second ' ')
164 uchar const * tmp = pattern + 4;
165 while (*tmp != ' ' && *tmp != 0) ++tmp;
166 if (*tmp != ' ' || tmp == pattern + 4)
167 goto exact_pattern_matching; // Invalid format
170 // std::cerr << "RLCSAWrapper::Contains(): Pattern: " << tmp+1 << ", k = " << k << std::endl;
171 return iq.align(tmp+1, k);
174 /*************************************************************************
175 * Position Specific Scoring Matrix (PSSM) matching
176 * See PssmQuery.h for usage information.
178 if (strncmp((char const *)pattern, "PSSM ", 4) == 0)
181 double thr = std::atof((char const *)pattern + 5);
183 goto exact_pattern_matching; // Invalid format
185 // Find the start of the pattern (i.e. the second ' ')
186 uchar const * tmp = pattern + 5;
187 while (*tmp != ' ' && *tmp != 0) ++tmp;
188 if (*tmp != ' ' || tmp == pattern + 5)
189 goto exact_pattern_matching; // Invalid format
191 PssmQuery pq(this, std::log(thr));
192 //std::cerr << "Pattern: " << tmp+1 << ", log(threshold) = " << std::log(thr) << std::endl;
193 return pq.align(tmp+1, 0);
196 /*************************************************************************
197 * Exact pattern matching
199 exact_pattern_matching:
200 CSA::pair_type p = rlcsa->count(std::string((char const *)pattern));
201 if (p.first > p.second)
202 return document_result();
205 dr.reserve(p.second - p.first + 2);
206 for (ulong i = p.first; i <= p.second; ++i)
207 dr.push_back(rlcsa->getSequenceForPosition(rlcsa->locate(i)));
210 document_result LessThan(uchar const *) const { unsupported(); return document_result(); };
211 document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); };
212 document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); };
214 document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
215 document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
216 document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
217 document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
218 document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); };
220 // Definition of full_result is inherited from SXSI::TextCollection.
221 full_result FullContains(uchar const *) const { unsupported(); return full_result(); };
222 full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); };
223 full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); };
224 full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); };
226 // Index from/to disk
227 RLCSAWrapper(FILE *file, char const *filename)
228 : rlcsa(new CSA::RLCSA(std::string(filename)))
230 // Init the edit distance look-up tables
231 MyersEditDistanceIncremental::initMyersFourRussians();
234 void Save(FILE *file, char const *filename) const
236 const char type = 'R';
238 if (std::fwrite(&type, 1, 1, file) != 1)
239 throw std::runtime_error("RLCSAWrapper::Save(): file write error (type flag).");
242 this->rlcsa->writeTo(std::string(filename));
245 TextCollection::TextPosition getLength() const
247 return this->rlcsa->getSize() + this->rlcsa->getNumberOfSequences();
250 inline TextCollection::TextPosition LF(uchar c, TextPosition i) const
253 if(i < this->rlcsa->getNumberOfSequences())
255 return rlcsa->LF(i - this->rlcsa->getNumberOfSequences(), c);
258 uchar* getSuffix(TextPosition pos, unsigned l) const
261 uchar* text = new uchar[l + 1];
263 if(l == 0 || pos < this->rlcsa->getNumberOfSequences())
268 pos -= this->rlcsa->getNumberOfSequences();
270 unsigned n = rlcsa->displayFromPosition(pos, l, text);
275 DocId getDoc(TextPosition i) const
277 if(i < this->rlcsa->getNumberOfSequences())
280 CSA::pair_type pt = rlcsa->getRelativePosition(this->rlcsa->locate(i - this->rlcsa->getNumberOfSequences()));
285 const CSA::RLCSA* rlcsa;
287 /* std::pair<TextPosition, TextPosition> backwards(uchar const *pattern) const
289 TextPosition i = std::strlen((char const *)pattern) - 1;
290 int c = (int)pattern[i];
292 std::cerr << "\nFirst symb = " << (char)c << std::endl;
294 TextPosition sp = LF(c, -1);
295 TextPosition ep = LF(c, getLength()-1) - 1;
296 printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
297 while (sp<=ep && i>=1)
299 c = (int)pattern[--i];
302 printf("new: c = %c, sp = %lu, ep = %lu\n", pattern[i], sp, ep);
305 return std::make_pair(sp, ep);
308 void unsupported() const
310 std::cerr << std::endl << "-------------------------------------------------------------\n"
311 << "RLCSAWrapper: unsupported method!\nSee RLCSAWrapper.h for more details.\n"
312 << "The default index (FMIndex) implements this method!" << std::endl;
315 }; // class RLCSAWrapper