From: nvalimak Date: Tue, 10 Mar 2009 19:07:14 +0000 (+0000) Subject: Added LessThan() implementation X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=a85e71cda23aadfe73810956b3b51672e8199cba Added LessThan() implementation git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@234 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/CSA.cpp b/CSA.cpp index a190b48..eaa5cc0 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -205,6 +205,12 @@ void CSA::MakeStatic() throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); #endif + if (texts.size() == 0) // Empty collection + { + ++n; + texts.append(1, '\0'); + } + // Bitvector of empty texts, FIXME Remove? { //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() < 0) + return true; return false; } @@ -504,11 +522,18 @@ unsigned CSA::CountContains(uchar const * pattern) const return result.size(); } +// Less than or equal unsigned CSA::CountLessThan(uchar const * pattern) const { - // TODO - assert(0); - return 0; + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); } /** @@ -690,9 +715,39 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextCollection::document_result CSA::LessThan(uchar const * pattern) const { - // TODO - assert(0); - return document_result(); + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; } /** @@ -904,6 +959,33 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; } + +ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + uint c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = 1; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + uint result = alphabetrank->rank(c,ep); + if (result == ~0u) + ep = 0; + else + ep = C[c]+result-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + + CSA::~CSA() { #ifdef CSA_TEST_BWT delete dynFMI; @@ -942,13 +1024,21 @@ void CSA::makewavelet(uchar *bwt) // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); // delete [] bwt; //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + + HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - wt_coder * wtc = new wt_coder_huff(bwt,n,am); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate? +// static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? + wt_coder * wtc = new wt_coder_binary(bwt,n,am); alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); delete bmb; - delete [] bwt; + bwt = 0; // already deleted + + std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; } void CSA::maketables() @@ -957,17 +1047,24 @@ void CSA::maketables() // Note: bwtEndPos already set! // and the length of the first text (counter l): #ifdef CSA_TEST_BWT - ulong i = 0; - ulong l = 1; - - uchar c = alphabetrank->access(i); - while (c != '\0') { - i = C[c]+alphabetrank->rank(c, i)-1; - ++l; - c = alphabetrank->access(i); + ulong i = 0; + ulong l = 1; + uint alphabetrank_i_tmp = 0; + uchar c = alphabetrank->access(i, alphabetrank_i_tmp); + while (c != '\0') + { + i = C[c]+alphabetrank_i_tmp-1; + ++l; + c = alphabetrank->access(i, alphabetrank_i_tmp); + } + + if (i != bwtEndPos) // compare result + { + std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl; + exit(0); + } } - assert(i == bwtEndPos); // compare result #endif // Build up arrays for text length and starting positions @@ -997,6 +1094,7 @@ void CSA::maketables() DocId textId = numberOfTexts; ulong ulongmax = 0; ulongmax--; + uint alphabetrank_i_tmp =0; //positions: for (ulong i=n-1;iaccess(p); + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); if (c == '\0') { --textId; // Record the order of end-markers in BWT: - ulong endmarkerRank = alphabetrank->rank(0, p) - 1; + ulong endmarkerRank = alphabetrank_i_tmp - 1; (*endmarkerDocId)[endmarkerRank] = textId; // Store text length and text start position: @@ -1031,7 +1129,7 @@ void CSA::maketables() p = textId; // Correct LF-mapping to the last char of the previous text. } else - p = C[c]+alphabetrank->rank(c, p)-1; + p = C[c]+alphabetrank_i_tmp-1; } assert(textId == 0);