+/**
+ * Exact-match query over the collection.
+ *
+ * The pattern is searched together with its terminating byte (m+1 symbols),
+ * and every end-marker counted inside the resulting interval contributes
+ * one document to the answer.
+ *
+ * @param pattern zero-terminated pattern.
+ * @return one document identifier per end-marker in the interval, in
+ *         end-marker rank order; empty if the pattern is empty or the
+ *         interval holds no end-marker.
+ */
+TextCollection::document_result CSA::Equal(uchar const *pattern) const
+{
+    TextPosition const patternLength = strlen((char const *)pattern);
+    if (patternLength == 0)
+        return TextCollection::document_result(); // FIXME Should return all empty texts
+
+    // Interval of the pattern extended by its end-marker.
+    TextPosition first = 0, last = 0;
+    Search(pattern, patternLength + 1, &first, &last);
+
+    TextCollection::document_result matches;
+
+    // Number of end-markers inside [first, last]; nothing to report if zero.
+    unsigned const total = CountEndmarkers(first, last);
+    if (total == 0)
+        return matches;
+
+    matches.reserve(total); // One allocation is enough for the whole answer.
+
+    // Rank of the first end-marker of the interval: number of end-markers
+    // strictly before 'first'.
+    unsigned base = 0;
+    if (first > 0)
+        base = alphabetrank->rank(0, first - 1);
+
+    for (unsigned k = 0; k < total; ++k)
+    {
+        // The end-marker found belongs to the "previous" document of the
+        // collection, hence the step forward (wrapping at numberOfTexts).
+        DocId const internalId = ((*endmarkerDocId)[base + k] + 1) % numberOfTexts;
+        // Map to the reported doc ID.
+        // NOTE(review): presumably select0 re-inserts the empty texts into
+        // the numbering -- confirm against emptyTextRank's construction.
+        DocId const reportedId = emptyTextRank->select0(internalId + 1);
+        matches.push_back(reportedId);
+    }
+
+    return matches;
+}
+
+/**
+ * Exact-match query restricted to a document range.
+ *
+ * Works like the unrestricted variant, except that [begin, end] is handed
+ * to Search(); no further range filtering is done here.
+ *
+ * @param pattern zero-terminated pattern.
+ * @param begin   first document of the range (forwarded to Search()).
+ * @param end     last document of the range (forwarded to Search()).
+ * @return one document identifier per end-marker in the interval found;
+ *         empty if the pattern is empty or the interval holds no end-marker.
+ */
+TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
+{
+    TextPosition const patternLength = strlen((char const *)pattern);
+    if (patternLength == 0)
+        return TextCollection::document_result(); // FIXME Should return all empty texts
+
+    // Interval of the pattern extended by its end-marker, limited by Search()
+    // to the requested documents.
+    TextPosition first = 0, last = 0;
+    Search(pattern, patternLength + 1, &first, &last, begin, end);
+
+    TextCollection::document_result matches;
+
+    // Number of end-markers inside [first, last]; nothing to report if zero.
+    unsigned const total = CountEndmarkers(first, last);
+    if (total == 0)
+        return matches;
+
+    matches.reserve(total); // One allocation is enough for the whole answer.
+
+    // Rank of the first end-marker of the interval.
+    unsigned base = 0;
+    if (first > 0)
+        base = alphabetrank->rank(0, first - 1);
+
+    for (unsigned k = 0; k < total; ++k)
+    {
+        // The end-marker found belongs to the "previous" document of the
+        // collection, hence the step forward (wrapping at numberOfTexts).
+        DocId const internalId = ((*endmarkerDocId)[base + k] + 1) % numberOfTexts;
+        // Map to the reported doc ID.
+        DocId const reportedId = emptyTextRank->select0(internalId + 1);
+        // No range test here: the interval is taken to be within
+        // [begin, end] already (restriction applied by Search()).
+        matches.push_back(reportedId);
+    }
+
+    return matches;
+}
+
+
+/**
+ * Substring query: documents with at least one occurrence of the pattern.
+ *
+ * Each row of the interval returned by Search() is walked backwards with
+ * LF-mapping until an end-marker or a sampled position is reached; either
+ * one yields the document of that occurrence.
+ *
+ * @param pattern zero-terminated pattern.
+ * @return the distinct document identifiers found, each passed through
+ *         emptyTextRank->select0; empty if the pattern is empty.
+ */
+TextCollection::document_result CSA::Contains(uchar const * pattern) const
+{
+    TextPosition const patternLength = strlen((char *)pattern);
+    if (patternLength == 0)
+        return TextCollection::document_result();
+
+    // Interval of all occurrences.
+    TextPosition first = 0, last = 0;
+    Search(pattern, patternLength, &first, &last);
+
+    // A std::set keeps the document identifiers unique (and ordered).
+    std::set<DocId> seenDocs;
+
+    // Filled in by IsBitSet(); used below, minus one, as index into suffixDocId.
+    ulong sampleRank = 0;
+    for (TextPosition row = first; row <= last; ++row)
+    {
+        TextPosition pos = row;
+        uchar symbol = alphabetrank->access(pos);
+        while (symbol != '\0' && !sampled->IsBitSet(pos, &sampleRank))
+        {
+            pos = C[symbol] + alphabetrank->rank(symbol, pos) - 1; // LF-mapping
+            symbol = alphabetrank->access(pos);
+        }
+
+        if (symbol != '\0')
+        {
+            // Stopped on a sampled position: its document is stored directly.
+            DocId sampledDoc = (*suffixDocId)[sampleRank - 1];
+            assert((unsigned)sampledDoc < numberOfTexts);
+            seenDocs.insert(sampledDoc);
+            continue;
+        }
+
+        // Stopped on an end-marker: take its rank among the end-markers in BWT.
+        unsigned const markerRank = alphabetrank->rank(0, pos) - 1;
+        // That end-marker belongs to the "preceding" document in the
+        // collection, hence the step forward (wrapping at numberOfTexts).
+        DocId const owner = ((*endmarkerDocId)[markerRank] + 1) % numberOfTexts;
+        seenDocs.insert(owner);
+    }
+
+    // Copy the set into the result vector, then map every entry to its doc ID.
+    TextCollection::document_result result(seenDocs.begin(), seenDocs.end());
+    for (TextCollection::document_result::size_type k = 0; k < result.size(); ++k)
+        result[k] = emptyTextRank->select0(result[k] + 1);
+    return result;
+}
+
+/**
+ * Substring query restricted to a document range.
+ *
+ * Each row of the interval returned by Search() is walked backwards with
+ * LF-mapping until an end-marker or a sampled position is reached; either
+ * one yields the document of that occurrence. The [begin, end] test is
+ * applied to the identifier that is actually returned, i.e. after the
+ * emptyTextRank->select0 mapping, the same way the range variant of
+ * LessThan() does it. Testing the pre-mapping value instead can let
+ * out-of-range identifiers through (and drop in-range ones) whenever the
+ * mapping is not the identity.
+ * NOTE(review): assumes callers express begin/end in the same identifier
+ * space as the returned values -- confirm against Search(..., begin, end).
+ *
+ * @param pattern zero-terminated pattern.
+ * @param begin   smallest document identifier to report (inclusive).
+ * @param end     largest document identifier to report (inclusive).
+ * @return the distinct, mapped document identifiers inside [begin, end];
+ *         empty if the pattern is empty.
+ */
+TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Search all occurrences
+    Search(pattern, m, &sp, &ep);
+
+    // We want unique document identifiers, using std::set to collect them.
+    // The set holds pre-mapping identifiers; the range test happens after
+    // mapping, once per distinct document.
+    std::set<DocId> resultSet;
+
+    // Filled in by IsBitSet(); used below, minus one, as index into suffixDocId.
+    ulong sampled_rank_i = 0;
+    // Check each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+            c = alphabetrank->access(i);
+        }
+        if (c == '\0')
+        {
+            // Rank among the end-markers in BWT
+            unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+            // End-marker that we found belongs to the "preceding" doc in collection:
+            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+            resultSet.insert(docId);
+        }
+        else
+        {
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+            assert((unsigned)docId < numberOfTexts);
+            resultSet.insert(docId);
+        }
+    }
+
+    // Map each distinct document to its doc ID and keep those in [begin, end].
+    TextCollection::document_result result;
+    result.reserve(resultSet.size()); // Upper bound; avoids reallocation.
+    for (std::set<DocId>::const_iterator it = resultSet.begin(); it != resultSet.end(); ++it)
+    {
+        DocId const docId = emptyTextRank->select0(*it + 1);
+        if (docId >= begin && docId <= end)
+            result.push_back(docId);
+    }
+    return result;
+}
+
+/**
+ * Order query: reports the documents whose end-markers lie in the interval
+ * computed by SearchLessThan() for the pattern.
+ *
+ * @param pattern zero-terminated pattern.
+ * @return one document identifier per end-marker in the interval, in
+ *         end-marker rank order; empty if the pattern is empty or the
+ *         interval holds no end-marker.
+ */
+TextCollection::document_result CSA::LessThan(uchar const * pattern) const
+{
+    TextPosition const patternLength = strlen((char *)pattern);
+    if (patternLength == 0)
+        return TextCollection::document_result(); // empty result set
+
+    TextPosition first = 0, last = 0;
+    SearchLessThan(pattern, patternLength, &first, &last);
+
+    TextCollection::document_result matches;
+
+    // Number of end-markers inside [first, last]; nothing to report if zero.
+    unsigned const total = CountEndmarkers(first, last);
+    if (total == 0)
+        return matches;
+
+    matches.reserve(total); // One allocation is enough for the whole answer.
+
+    // Rank of the first end-marker of the interval: number of end-markers
+    // strictly before 'first'.
+    unsigned base = 0;
+    if (first > 0)
+        base = alphabetrank->rank(0, first - 1);
+
+    // Visit the end-markers of [first, last] one by one.
+    for (unsigned k = 0; k < total; ++k)
+    {
+        // The end-marker found belongs to the "preceding" document of the
+        // collection, hence the step forward (wrapping at numberOfTexts).
+        DocId const internalId = ((*endmarkerDocId)[base + k] + 1) % numberOfTexts;
+        // Map to the reported doc ID.
+        DocId const reportedId = emptyTextRank->select0(internalId + 1);
+        matches.push_back(reportedId);
+    }
+
+    return matches;
+}
+
+/**
+ * Order query restricted to a document range: reports the documents whose
+ * end-markers lie in the interval computed by SearchLessThan() and whose
+ * mapped identifier falls inside [begin, end].
+ *
+ * @param pattern zero-terminated pattern.
+ * @param begin   smallest document identifier to report (inclusive).
+ * @param end     largest document identifier to report (inclusive).
+ * @return the mapped identifiers that pass the range test, in end-marker
+ *         rank order; empty if the pattern is empty or the interval holds
+ *         no end-marker.
+ */
+TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+    TextPosition const patternLength = strlen((char *)pattern);
+    if (patternLength == 0)
+        return TextCollection::document_result(); // empty result set
+
+    TextPosition first = 0, last = 0;
+    SearchLessThan(pattern, patternLength, &first, &last);
+
+    TextCollection::document_result matches;
+
+    // Number of end-markers inside [first, last]; nothing to report if zero.
+    unsigned const total = CountEndmarkers(first, last);
+    if (total == 0)
+        return matches;
+
+    // Capacity for the unfiltered count; the range test may keep fewer.
+    matches.reserve(total);
+
+    // Rank of the first end-marker of the interval.
+    unsigned base = 0;
+    if (first > 0)
+        base = alphabetrank->rank(0, first - 1);
+
+    // Visit the end-markers of [first, last], keeping those in [begin, end].
+    for (unsigned k = 0; k < total; ++k)
+    {
+        // The end-marker found belongs to the "preceding" document of the
+        // collection, hence the step forward (wrapping at numberOfTexts).
+        DocId const internalId = ((*endmarkerDocId)[base + k] + 1) % numberOfTexts;
+        // Map to the reported doc ID; the range test uses the mapped value.
+        DocId const reportedId = emptyTextRank->select0(internalId + 1);
+        if (reportedId < begin || reportedId > end)
+            continue;
+        matches.push_back(reportedId);
+    }
+
+    return matches;
+}
+
+/**