X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.cpp;h=3f75433f32586eaf4dfef2e87f24522ca7a9a634;hb=af8938dbee21244687184dd0502a84ce1af70c50;hp=5b2b190d92c7a2e8defe2028bf7690951b7ff06b;hpb=35c8f7ba17cd67f73c3ecb933fa9ea0becaf16f1;p=SXSI%2FTextCollection.git diff --git a/CSA.cpp b/CSA.cpp index 5b2b190..3f75433 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -25,7 +25,7 @@ #undef max #include "dcover/bwt.hpp" -#include "HeapProfiler.h" // FIXME remove +//#include "HeapProfiler.h" // FIXME remove #include #include @@ -40,7 +40,8 @@ using std::pair; using std::make_pair; using std::map; -using SXSI::TextCollection; +namespace SXSI +{ // Save file version info const uchar CSA::versionFlag = 2; @@ -204,7 +205,14 @@ void CSA::MakeStatic() throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); #endif - // Bitvector of empty texts + if (texts.size() == 0) // Empty collection + { + ++n; + ++numberOfTexts; + texts.append(1, '\0'); + } + + // Bitvector of empty texts, FIXME Remove? { //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() < 50) - std::cout << texts[i]; - else - std::cout << (int)texts[i]; - std::cout << std::endl;*/ + texts.reserve(0); // Release extra capacity + +/* FILE *fp = fopen("texts.txt", "wb"); + std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl; + fclose(fp);*/ + uchar *bwt = new uchar[n]; { - // FIXME More succinct solution needed - unsigned* maptexts = new unsigned[n+1]; + // More succinct solution via StringIterator, see below. +/* unsigned* maptexts = new unsigned[n+1]; // Map text chars to [0..255+numberOfTexts] unsigned count = 0; for (ulong i = 0; i < n; ++i) @@ -234,25 +241,49 @@ void CSA::MakeStatic() maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts] else { uchar c = (uchar)texts[i]; - maptexts[i] = (unsigned)c + numberOfTexts + 1; + maptexts[i] = (unsigned)c + numberOfTexts; } - maptexts[n] = '\0'; - texts.erase(); - texts.reserve(0); // Release capacity assert(count == numberOfTexts); - bwtEndPos = (ulong)compute_bwt(&maptexts[0], &maptexts[n], - &bwt[0], numberOfTexts + 1); + std::cout << "maptext: "; + for (ulong i = 0; i <= n; ++i) + std::cout << maptexts[i] << ", "; + std::cout << std::endl;*/ + + // Mark endmarker positions into bitvector + ulong * endmarkers = new ulong[n/W+1]; + for (ulong i = 0; i < n/W+1; ++i) + endmarkers[i] = 0; + for (ulong i = 0; i < n; ++i) + if (texts[i] == 0) + Tools::SetField(endmarkers, 1, i, 1); + BitRank* br = new BitRank(endmarkers, n, true); + assert(numberOfTexts == br->rank(n-1)); + + // Build iterators, FIXME clean up iterator construction. + StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br); + StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br); + + bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n], + &bwt[0], numberOfTexts); bwt[--bwtEndPos] = '\0'; - delete [] maptexts; + texts.erase(); + texts.reserve(0); // Release capacity + delete br; // bitvector of endmarkers } // End of bw transform +// std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl; + +/* fp = fopen("bwt.txt", "wb"); + std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl; + fclose(fp);*/ + #ifdef CSA_TEST_BWT { uchar *bwtTest = dynFMI->getBWT(); -/* printf("123456789012345678901234567890123456789\n"); + printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n && i < 100; i ++) if (bwt[i] < 50) printf("%d", (int)bwt[i]); @@ -264,7 +295,7 @@ void CSA::MakeStatic() printf("%d", (int)bwtTest[i]); else printf("%c", bwtTest[i]); - printf("\n");*/ + printf("\n"); // Sanity check assert(numberOfTexts == dynFMI->getCollectionSize()); @@ -376,6 +407,22 @@ bool CSA::IsPrefix(uchar const * pattern) const return false; } +bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep, begin, end)) + return true; + return false; +} + + bool CSA::IsSuffix(uchar const *pattern) const { // Here counting is as fast as IsSuffix(): @@ -384,6 +431,14 @@ bool CSA::IsSuffix(uchar const *pattern) const return false; } +bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const +{ + // Here counting is as fast as IsSuffix(): + if (CountSuffix(pattern, begin, end) > 0) + return true; + return false; +} + bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); @@ -404,6 +459,26 @@ bool CSA::IsEqual(uchar const *pattern) const return false; } +bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep)) + return true; + return false; +} + bool CSA::IsContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -419,10 +494,25 @@ bool CSA::IsContains(uchar const * pattern) const return false; } -bool CSA::IsLessThan(uchar const*) const +bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const { - // TODO - assert(0); + // Here counting is as fast as existential querying + if (CountContains(pattern, begin, end) > 0) + return true; // FIXME No need to filter result set + return false; +} + +bool CSA::IsLessThan(uchar const * pattern) const +{ + if (CountLessThan(pattern) > 0) + return true; + return false; +} + +bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + if (CountLessThan(pattern, begin, end) > 0) + return true; return false; } @@ -453,6 +543,19 @@ unsigned CSA::CountPrefix(uchar const * pattern) const return CountEndmarkers(sp, ep); } +unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfAllTexts; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); +} + unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -466,6 +569,19 @@ unsigned CSA::CountSuffix(uchar const * pattern) const return count; } +unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfAllTexts; + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end); + + return count; +} + unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); @@ -480,6 +596,20 @@ unsigned CSA::CountEqual(uchar const *pattern) const return CountEndmarkers(sp, ep); } +unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); // Already within [begin, end] +} + unsigned CSA::CountContains(uchar const * pattern) const { TextPosition m = strlen((char const *)pattern); @@ -487,16 +617,48 @@ unsigned CSA::CountContains(uchar const * pattern) const return numberOfAllTexts; // Total number of texts. // Here counting is as slow as fetching the result set - // because we would have to filter out occ's that fall within same document. + // because we have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); return result.size(); } +unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + + // Here counting is as slow as fetching the result set + // because we have to filter out occ's that fall within same document. + TextCollection::document_result result = Contains(pattern, begin, end); + return result.size(); +} + +// Less than or equal unsigned CSA::CountLessThan(uchar const * pattern) const { - // TODO - assert(0); - return 0; + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); +} + +unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); } /** @@ -539,6 +701,44 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep] and [begin, end]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; +} + TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); @@ -589,6 +789,57 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + + ulong sampled_rank_i = 0; + // Check each occurrence, already within [begin, end] + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + } + // Assert: c == '\0' OR sampled->IsBitSet(i) + + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + } + else // Sampled position + { + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + } + } + + return result; +} + + TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); @@ -626,6 +877,43 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const return result; } +TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all empty texts + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); // already within [begin, end] + + -- resultSize; + ++ i; + } + + return result; +} + TextCollection::document_result CSA::Contains(uchar const * pattern) const { @@ -676,11 +964,130 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const return result; } +TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + + ulong sampled_rank_i = 0; + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->access(i); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + else + { + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)docId < numberOfTexts); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + } + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); + return result; +} + TextCollection::document_result CSA::LessThan(uchar const * pattern) const { - // TODO - assert(0); - return document_result(); + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; +} + +TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep] and [begin, end]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) + { + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(docId); + + -- resultSize; + ++ i; + } + + return result; } /** @@ -738,6 +1145,60 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const return result; } +TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); // FIXME Throw exception? + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + + ulong sampled_rank_i = 0; + // Report each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + result.push_back(make_pair(docId, textPos)); + } + } + + return result; +} + /** * Save index to a file handle @@ -869,6 +1330,38 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ + +// FIXME Use 2D-search structure +unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const +{ + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + unsigned resultSize = alphabetrank->rank(0, ep) - ranksp; + if (resultSize == 0) + return 0; + + // Count end-markers in result interval and within [begin, end] + unsigned count = 0; + unsigned i = ranksp; + while (resultSize) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + if (docId >= begin && docId <= end) + ++ count; + + -- resultSize; + ++ i; + } + return count; +} ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { @@ -892,6 +1385,56 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; } +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + int c = (int)pattern[m-1]; + assert(c == 0); // Start from endmarkers + TextPosition i=m-1; + TextPosition sp = begin; + TextPosition ep = end; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + + +ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + uint c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = 1; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + uint result = alphabetrank->rank(c,ep); + if (result == ~0u) + ep = 0; + else + ep = C[c]+result-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + + CSA::~CSA() { #ifdef CSA_TEST_BWT delete dynFMI; @@ -930,21 +1473,21 @@ void CSA::makewavelet(uchar *bwt) // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); // delete [] bwt; //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! +// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - // FIXME: static_sequence_wvtree accepts only uint arrays - uint *text = new uint[n]; - for (i = 0; i < n; ++i) // Silly - text[i] = bwt[i]; - delete [] bwt; - bwt = 0; +// HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - wt_coder * wtc = new wt_coder_huff(text,n,am); - alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate? +// static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? + wt_coder * wtc = new wt_coder_binary(bwt,n,am); + alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); delete bmb; - delete [] text; - text = 0; + bwt = 0; // already deleted + +// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; } void CSA::maketables() @@ -953,17 +1496,24 @@ void CSA::maketables() // Note: bwtEndPos already set! // and the length of the first text (counter l): #ifdef CSA_TEST_BWT - ulong i = 0; - ulong l = 1; - - uchar c = alphabetrank->access(i); - while (c != '\0') { - i = C[c]+alphabetrank->rank(c, i)-1; - ++l; - c = alphabetrank->access(i); + ulong i = 0; + ulong l = 1; + uint alphabetrank_i_tmp = 0; + uchar c = alphabetrank->access(i, alphabetrank_i_tmp); + while (c != '\0') + { + i = C[c]+alphabetrank_i_tmp-1; + ++l; + c = alphabetrank->access(i, alphabetrank_i_tmp); + } + + if (i != bwtEndPos) // compare result + { + std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl; + exit(0); + } } - assert(i == bwtEndPos); // compare result #endif // Build up arrays for text length and starting positions @@ -993,6 +1543,7 @@ void CSA::maketables() DocId textId = numberOfTexts; ulong ulongmax = 0; ulongmax--; + uint alphabetrank_i_tmp =0; //positions: for (ulong i=n-1;iaccess(p); + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); if (c == '\0') { --textId; // Record the order of end-markers in BWT: - ulong endmarkerRank = alphabetrank->rank(0, p) - 1; + ulong endmarkerRank = alphabetrank_i_tmp - 1; (*endmarkerDocId)[endmarkerRank] = textId; // Store text length and text start position: @@ -1027,7 +1578,7 @@ void CSA::maketables() p = textId; // Correct LF-mapping to the last char of the previous text. } else - p = C[c]+alphabetrank->rank(c, p)-1; + p = C[c]+alphabetrank_i_tmp-1; } assert(textId == 0); @@ -1154,3 +1705,5 @@ unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) { return x | (bit << pos); } +} // namespace SXSI +