From 3d2ba0261930887bf1e223ebe00b09937c76cbf2 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Wed, 11 Mar 2009 11:35:02 +0000 Subject: [PATCH] Added queries for DocId intervals git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@248 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- CSA.cpp | 458 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- CSA.h | 25 +++- 2 files changed, 476 insertions(+), 7 deletions(-) diff --git a/CSA.cpp b/CSA.cpp index eaa5cc0..5dd14b7 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -406,6 +406,22 @@ bool CSA::IsPrefix(uchar const * pattern) const return false; } +bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep, begin, end)) + return true; + return false; +} + + bool CSA::IsSuffix(uchar const *pattern) const { // Here counting is as fast as IsSuffix(): @@ -414,6 +430,14 @@ bool CSA::IsSuffix(uchar const *pattern) const return false; } +bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const +{ + // Here counting is as fast as IsSuffix(): + if (CountSuffix(pattern, begin, end) > 0) + return true; + return false; +} + bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); @@ -434,6 +458,26 @@ bool CSA::IsEqual(uchar const *pattern) const return false; } +bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep)) + return true; + return false; +} + bool CSA::IsContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -449,6 +493,14 @@ bool CSA::IsContains(uchar const * pattern) const return false; } +bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const +{ + // Here counting is as fast as existential querying + if (CountContains(pattern, begin, end) > 0) + return true; // FIXME No need to filter result set + return false; +} + bool CSA::IsLessThan(uchar const * pattern) const { if (CountLessThan(pattern) > 0) @@ -456,6 +508,13 @@ bool CSA::IsLessThan(uchar const * pattern) const return false; } +bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + if (CountLessThan(pattern, begin, end) > 0) + return true; + return false; +} + /****************************************************************** * Counting queries */ @@ -483,6 +542,19 @@ unsigned CSA::CountPrefix(uchar const * pattern) const return CountEndmarkers(sp, ep); } +unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfAllTexts; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); +} + unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -496,6 +568,19 @@ unsigned CSA::CountSuffix(uchar const * pattern) const return count; } +unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfAllTexts; + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end); + + return count; +} + unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); @@ -510,6 +595,20 @@ unsigned CSA::CountEqual(uchar const *pattern) const return CountEndmarkers(sp, ep); } +unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); // Already within [begin, end] +} + unsigned CSA::CountContains(uchar const * pattern) const { TextPosition m = strlen((char const *)pattern); @@ -517,11 +616,23 @@ unsigned CSA::CountContains(uchar const * pattern) const return numberOfAllTexts; // Total number of texts. // Here counting is as slow as fetching the result set - // because we would have to filter out occ's that fall within same document. + // because we have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); return result.size(); } +unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + + // Here counting is as slow as fetching the result set + // because we have to filter out occ's that fall within same document. + TextCollection::document_result result = Contains(pattern, begin, end); + return result.size(); +} + // Less than or equal unsigned CSA::CountLessThan(uchar const * pattern) const { @@ -536,6 +647,19 @@ unsigned CSA::CountLessThan(uchar const * pattern) const return CountEndmarkers(sp, ep); } +unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); +} + /** * Document reporting queries */ @@ -576,6 +700,44 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep] and [begin, end]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; +} + TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -626,6 +788,57 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + + ulong sampled_rank_i = 0; + // Check each occurrence, already within [begin, end] + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + } + // Assert: c == '\0' OR sampled->IsBitSet(i) + + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + } + else // Sampled position + { + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + } + } + + return result; +} + + TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); @@ -663,6 +876,43 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const return result; } +TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all empty texts + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); // already within [begin, end] + + -- resultSize; + ++ i; + } + + return result; +} + TextCollection::document_result CSA::Contains(uchar const * pattern) const { @@ -713,6 +963,57 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + + ulong sampled_rank_i = 0; + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->access(i); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + else + { + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)docId < numberOfTexts); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + } + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); + return result; +} + TextCollection::document_result CSA::LessThan(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -750,6 +1051,44 @@ TextCollection::document_result CSA::LessThan(uchar const * pattern) const return result; } +TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep] and [begin, end]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; +} + /** * Full result set queries */ @@ -805,6 +1144,60 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const return result; } +TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); // FIXME Throw exception? + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + + ulong sampled_rank_i = 0; + // Report each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, textPos)); + } + } + + return result; +} + /** * Save index to a file handle @@ -936,6 +1329,38 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ + +// FIXME Use 2D-search structure +unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const +{ + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + unsigned resultSize = alphabetrank->rank(0, ep) - ranksp; + if (resultSize == 0) + return 0; + + // Count end-markers in result interval and within [begin, end] + unsigned count = 0; + unsigned i = ranksp; + while (resultSize) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + ++ count; + + -- resultSize; + ++ i; + } + return count; +} ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { @@ -959,6 +1384,29 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; } +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + int c = (int)pattern[m-1]; + assert(c == 0); // Start from endmarkers + TextPosition i=m-1; + TextPosition sp = begin; + TextPosition ep = end; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { @@ -1024,8 +1472,8 @@ void CSA::makewavelet(uchar *bwt) // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); // delete [] bwt; //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! - std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove @@ -1037,8 +1485,8 @@ void CSA::makewavelet(uchar *bwt) delete bmb; bwt = 0; // already deleted - std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; } void CSA::maketables() diff --git a/CSA.h b/CSA.h index be9a2f9..eab3637 100644 --- a/CSA.h +++ b/CSA.h @@ -82,22 +82,41 @@ public: bool IsContains(uchar const *) const; bool IsLessThan(uchar const *) const; + bool IsPrefix(uchar const *, DocId, DocId) const; + bool IsSuffix(uchar const *, DocId, DocId) const; + bool IsEqual(uchar const *, DocId, DocId) const; + bool IsContains(uchar const *, DocId, DocId) const; + bool IsLessThan(uchar const *, DocId, DocId) const; + ulong Count(uchar const *) const; unsigned CountPrefix(uchar const *) const; unsigned CountSuffix(uchar const *) const; unsigned CountEqual(uchar const *) const; unsigned CountContains(uchar const *) const; unsigned CountLessThan(const unsigned char*) const; + + unsigned CountPrefix(uchar const *, DocId, DocId) const; + unsigned CountSuffix(uchar const *, DocId, DocId) const; + unsigned CountEqual(uchar const *, DocId, DocId) const; + unsigned CountContains(uchar const *, DocId, DocId) const; + unsigned CountLessThan(uchar const *, DocId, DocId) const; - // document_result is inherited from SXSI::TextCollection. + // Definition of document_result is inherited from SXSI::TextCollection. document_result Prefix(uchar const *) const; document_result Suffix(uchar const *) const; document_result Equal(uchar const *) const; document_result Contains(uchar const *) const; document_result LessThan(uchar const *) const; - // full_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *, DocId, DocId) const; + document_result Suffix(uchar const *, DocId, DocId) const; + document_result Equal(uchar const *, DocId, DocId) const; + document_result Contains(uchar const *, DocId, DocId) const; + document_result LessThan(uchar const *, DocId, DocId) const; + + // Definition of full_result is inherited from SXSI::TextCollection. full_result FullContains(uchar const *) const; + full_result FullContains(uchar const *, DocId, DocId) const; // Index from/to disk void Load(FILE *, unsigned); @@ -303,6 +322,7 @@ private: void maketables(); DocId DocIdAtTextPos(BlockArray*, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; + ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; // TextPosition Inverse(TextPosition) const; // TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; @@ -324,6 +344,7 @@ private: return alphabetrank->rank(0, ep) - ranksp; } + unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const; }; // class CSA } // namespace SXSI -- 2.17.1