return false;
}
+// Existential query over a document range: does any text with doc ID in
+// [begin, end] begin with the given pattern?
+bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true; // Empty pattern is a prefix of every text.
+
+ TextPosition sp = 0, ep = 0;
+ // The backward search itself is unrestricted; the [begin, end] filtering
+ // is performed by the range-aware CountEndmarkers overload below.
+ Search(pattern, m, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep, begin, end))
+ return true;
+ return false;
+}
+
+
bool CSA::IsSuffix(uchar const *pattern) const
{
// Here counting is as fast as IsSuffix():
return false;
}
+// Existential query over a document range: does any text with doc ID in
+// [begin, end] end with the given pattern?
+bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
+{
+ // Counting is as cheap as the existential check here, so delegate to
+ // the range-restricted CountSuffix():
+ if (CountSuffix(pattern, begin, end) > 0)
+ return true;
+ return false;
+}
+
bool CSA::IsEqual(uchar const *pattern) const
{
TextPosition m = std::strlen((char *)pattern);
return false;
}
+// Existential query over a document range: is any text with doc ID in
+// [begin, end] exactly equal to the given pattern?
+bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = std::strlen((char *)pattern);
+ if (m == 0)
+ {
+ // NOTE(review): this branch ignores [begin, end] — it answers "does ANY
+ // empty text exist in the whole collection?". Confirm that is intended.
+ if (numberOfAllTexts - numberOfTexts > 0u)
+ return true; // Empty texts exist
+ return false; // No empty texts exist
+ }
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker; search is already restricted to [begin, end]
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ // Check for end-marker(s) in result interval
+ if (CountEndmarkers(sp, ep))
+ return true;
+ return false;
+}
+
bool CSA::IsContains(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
return false;
}
+// Existential query over a document range: does any text with doc ID in
+// [begin, end] contain the given pattern as a substring?
+bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ // Here counting is as fast as existential querying
+ if (CountContains(pattern, begin, end) > 0)
+ return true; // FIXME No need to filter result set
+ return false;
+}
+
bool CSA::IsLessThan(uchar const * pattern) const
{
if (CountLessThan(pattern) > 0)
return false;
}
+// Existential query over a document range: is any text with doc ID in
+// [begin, end] lexicographically smaller than the pattern? (Exact
+// semantics — "less than" vs "less than or equal" — follow CountLessThan.)
+bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ if (CountLessThan(pattern, begin, end) > 0)
+ return true;
+ return false;
+}
+
/******************************************************************
* Counting queries
*/
return CountEndmarkers(sp, ep);
}
+// Count the texts with doc ID in [begin, end] that begin with pattern.
+unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ // NOTE(review): ignores [begin, end]; for consistency with the
+ // restricted query this should arguably count only texts inside the
+ // range. Confirm intended behavior.
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep, begin, end);
+}
+
unsigned CSA::CountSuffix(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
return count;
}
+// Count the texts with doc ID in [begin, end] that end with pattern.
+unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ // NOTE(review): ignores [begin, end] — confirm intended.
+ return numberOfAllTexts;
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker; the range-restricted Search already limits the
+ // result to documents in [begin, end], so the count is the interval size.
+ unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
+
+ return count;
+}
+
unsigned CSA::CountEqual(uchar const *pattern) const
{
TextPosition m = strlen((char const *)pattern);
return CountEndmarkers(sp, ep);
}
+// Count the texts with doc ID in [begin, end] that are exactly equal to
+// the pattern.
+unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ // NOTE(review): counts empty texts of the WHOLE collection, not only
+ // those inside [begin, end] — confirm intended.
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep); // Already within [begin, end]
+}
+
unsigned CSA::CountContains(uchar const * pattern) const
{
TextPosition m = strlen((char const *)pattern);
return numberOfAllTexts; // Total number of texts.
// Here counting is as slow as fetching the result set
- // because we would have to filter out occ's that fall within same document.
+ // because we have to filter out occ's that fall within same document.
TextCollection::document_result result = Contains(pattern);
return result.size();
}
+// Count the texts with doc ID in [begin, end] that contain pattern as a
+// substring. Cost is the same as materializing the result set, because
+// multiple occurrences inside one document must be de-duplicated.
+unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ // NOTE(review): ignores [begin, end] — confirm intended.
+ return numberOfAllTexts; // Total number of texts.
+
+ // Here counting is as slow as fetching the result set
+ // because we have to filter out occ's that fall within same document.
+ TextCollection::document_result result = Contains(pattern, begin, end);
+ return result.size();
+}
+
// Less than or equal
unsigned CSA::CountLessThan(uchar const * pattern) const
{
return CountEndmarkers(sp, ep);
}
+// Count the texts with doc ID in [begin, end] that compare lexicographically
+// below the pattern (the unrestricted overload is documented as "less than
+// or equal").
+unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ // Unrestricted search; [begin, end] filtering happens in CountEndmarkers.
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep, begin, end);
+}
+
/**
* Document reporting queries
*/
return result;
}
+// Report the doc IDs, restricted to [begin, end], of texts that begin with
+// the given pattern.
+TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep] and [begin, end]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
TextCollection::document_result CSA::Suffix(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
return result;
}
+// Report the doc IDs, restricted to [begin, end], of texts that end with
+// the given pattern. The range-restricted Search (with end-marker) already
+// limits occurrences to [begin, end], so no per-hit filtering is needed.
+TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all 1...k
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ TextCollection::document_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence, already within [begin, end]
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+
+ // Walk backwards via LF-mapping until we hit either an end-marker or a
+ // sampled position, whichever identifies the document first.
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ }
+ // Assert: c == '\0' OR sampled->IsBitSet(i)
+
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ else // Sampled position
+ {
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+ }
+ }
+
+ return result;
+}
+
+
TextCollection::document_result CSA::Equal(uchar const *pattern) const
{
TextPosition m = strlen((char const *)pattern);
return result;
}
+// Report the doc IDs, restricted to [begin, end], of texts exactly equal to
+// the pattern. The range-restricted Search already limits the interval, so
+// end-markers found are within [begin, end].
+TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // FIXME Should return all empty texts
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep, begin, end);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId); // already within [begin, end]
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
TextCollection::document_result CSA::Contains(uchar const * pattern) const
{
return result;
}
+// Report the doc IDs, restricted to [begin, end], of texts that contain the
+// pattern as a substring. Occurrences are de-duplicated per document.
+TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ // We want unique document identifiers, using std::set to collect them
+ std::set<DocId> resultSet;
+
+ ulong sampled_rank_i = 0;
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->access(i);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // NOTE(review): the [begin, end] check here uses the doc index BEFORE
+ // the emptyTextRank->select0 mapping (applied only at the end), while
+ // Prefix()/FullContains() compare AFTER mapping — confirm which side
+ // of the mapping [begin, end] refers to.
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ assert((unsigned)docId < numberOfTexts);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ }
+
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ // Map to doc ID's
+ for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+ *it = emptyTextRank->select0(*it+1);
+ return result;
+}
+
TextCollection::document_result CSA::LessThan(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
return result;
}
+// Report the doc IDs, restricted to [begin, end], of texts comparing
+// lexicographically below the pattern (see CountLessThan for exact bound).
+TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep] and [begin, end]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
+}
+
/**
* Full result set queries
*/
return result;
}
+// Report (doc ID, position) pairs, restricted to doc IDs in [begin, end],
+// for every occurrence of the pattern. Unlike Contains(), occurrences are
+// NOT de-duplicated per document.
+TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return full_result(); // FIXME Throw exception?
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ full_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+
+ ulong sampled_rank_i = 0;
+ // Report each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0; // Steps of LF-mapping walked from this occurrence
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
+
+ // End-marker that we found belongs to the "preceding" doc in collection:
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+
+ return result;
+}
+
/**
* Save index to a file handle
* Rest of the functions follow...
*/
+
+// Count the end-markers inside BWT interval [sp, ep] whose documents map to
+// doc IDs within [begin, end]. Runs a linear scan over every end-marker in
+// the interval, hence the FIXME below.
+// FIXME Use 2D-search structure
+unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+{
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
+ if (resultSize == 0)
+ return 0;
+
+ // Count end-markers in result interval and within [begin, end]
+ unsigned count = 0;
+ unsigned i = ranksp;
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ if (docId >= begin && docId <= end)
+ ++ count;
+
+ -- resultSize;
+ ++ i;
+ }
+ return count;
+}
ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
{
return 0;
}
+// Backward (FM-index) search whose INITIAL interval is [begin, end] instead
+// of the whole suffix array. The pattern MUST end with the '\0' end-marker
+// (asserted below), so the initial interval lies in the end-marker region.
+// NOTE(review): begin/end are DocIds used directly as suffix-array
+// positions — this relies on end-markers occupying SA positions in document
+// order; confirm against the index layout.
+// Returns the size of the final interval [*spResult, *epResult], 0 if empty.
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
+{
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ int c = (int)pattern[m-1];
+ assert(c == 0); // Start from endmarkers
+ TextPosition i=m-1;
+ TextPosition sp = begin;
+ TextPosition ep = end;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ // NOTE(review): when sp == 0, sp-1 wraps around (TextPosition is
+ // unsigned); presumably rank() tolerates this the same way the
+ // unrestricted Search() does — verify.
+ sp = C[c]+alphabetrank->rank(c,sp-1);
+ ep = C[c]+alphabetrank->rank(c,ep)-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+}
+
ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
{
// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
// delete [] bwt;
//alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
- std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
delete bmb;
bwt = 0; // already deleted
- std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
- std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
}
void CSA::maketables()
bool IsContains(uchar const *) const;
bool IsLessThan(uchar const *) const;
+ // Existential queries restricted to the document ID range [begin, end].
+ bool IsPrefix(uchar const *, DocId, DocId) const;
+ bool IsSuffix(uchar const *, DocId, DocId) const;
+ bool IsEqual(uchar const *, DocId, DocId) const;
+ bool IsContains(uchar const *, DocId, DocId) const;
+ bool IsLessThan(uchar const *, DocId, DocId) const;
+
ulong Count(uchar const *) const;
unsigned CountPrefix(uchar const *) const;
unsigned CountSuffix(uchar const *) const;
unsigned CountEqual(uchar const *) const;
unsigned CountContains(uchar const *) const;
unsigned CountLessThan(const unsigned char*) const;
+
+ // Counting queries restricted to the document ID range [begin, end].
+ unsigned CountPrefix(uchar const *, DocId, DocId) const;
+ unsigned CountSuffix(uchar const *, DocId, DocId) const;
+ unsigned CountEqual(uchar const *, DocId, DocId) const;
+ unsigned CountContains(uchar const *, DocId, DocId) const;
+ unsigned CountLessThan(uchar const *, DocId, DocId) const;
- // document_result is inherited from SXSI::TextCollection.
+ // Definition of document_result is inherited from SXSI::TextCollection.
document_result Prefix(uchar const *) const;
document_result Suffix(uchar const *) const;
document_result Equal(uchar const *) const;
document_result Contains(uchar const *) const;
document_result LessThan(uchar const *) const;
- // full_result is inherited from SXSI::TextCollection.
+ // Document reporting queries restricted to doc ID range [begin, end].
+ document_result Prefix(uchar const *, DocId, DocId) const;
+ document_result Suffix(uchar const *, DocId, DocId) const;
+ document_result Equal(uchar const *, DocId, DocId) const;
+ document_result Contains(uchar const *, DocId, DocId) const;
+ document_result LessThan(uchar const *, DocId, DocId) const;
+
+ // Definition of full_result is inherited from SXSI::TextCollection.
full_result FullContains(uchar const *) const;
+ full_result FullContains(uchar const *, DocId, DocId) const;
// Index from/to disk
void Load(FILE *, unsigned);
void maketables();
DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+ // Range-restricted backward search; initial BWT interval is [begin, end].
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
// TextPosition Inverse(TextPosition) const;
// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
return alphabetrank->rank(0, ep) - ranksp;
}
+ // Range-restricted end-marker count (linear scan; see FIXME in .cpp).
+ unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;
}; // class CSA
} // namespace SXSI