From: nvalimak Date: Mon, 20 Apr 2009 11:27:42 +0000 (+0000) Subject: Small fixes X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=2ff99d618e7f031bcfd95895c982e0958ae58d8c;p=SXSI%2FTextCollection.git Small fixes git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@333 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/TCImplementation.cpp b/TCImplementation.cpp index 1e0b2b8..211849e 100644 --- a/TCImplementation.cpp +++ b/TCImplementation.cpp @@ -19,7 +19,9 @@ *****************************************************************************/ #include "TCImplementation.h" -//#include "HeapProfiler.h" // FIXME remove +#ifdef DEBUG_MEMUSAGE +#include "HeapProfiler.h" // FIXME remove +#endif #include #include @@ -409,14 +411,13 @@ TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) TextCollection::document_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. - ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; c = alphabetrank->access(i); @@ -431,7 +432,7 @@ TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) } else // Sampled position { - DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; result.push_back(docId); } } @@ -452,14 +453,13 @@ TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, TextCollection::document_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. - ulong sampled_rank_i = 0; // Check each occurrence, already within [begin, end] for (; sp <= ep; ++sp) { TextPosition i = sp; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; c = alphabetrank->access(i); @@ -474,7 +474,7 @@ TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, } else // Sampled position { - DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; result.push_back(docId); } } @@ -525,13 +525,12 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern // We want unique document indentifiers, using std::set to collect them std::set resultSet; - ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping c = alphabetrank->access(i); @@ -544,7 +543,7 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern } else { - DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + DocId di = (*suffixDocId)[sampled->rank1(i)-1]; assert((unsigned)di < numberOfTexts); resultSet.insert(di); } @@ -568,13 +567,12 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern // We want unique document indentifiers, using std::set to collect them std::set resultSet; - ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping c = alphabetrank->access(i); @@ -589,7 +587,7 @@ TextCollection::document_result TCImplementation::Contains(uchar const * pattern } else { - DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; assert((unsigned)docId < numberOfTexts); if (docId >= begin && docId <= end) resultSet.insert(docId); @@ -643,14 +641,13 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern full_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. - ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; c = alphabetrank->access(i); @@ -665,9 +662,8 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern } else { - TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; -// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; result.push_back(make_pair(docId, textPos)); } @@ -689,14 +685,13 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern full_result result; result.reserve(ep-sp+1); // Try to avoid reallocation. - ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) + while (c != '\0' && !sampled->access(i)) { i = C[c]+alphabetrank->rank(c,i)-1; c = alphabetrank->access(i); @@ -714,9 +709,8 @@ TextCollection::full_result TCImplementation::FullContains(uchar const * pattern } else { - TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; -// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; if (docId >= begin && docId <= end) result.push_back(make_pair(docId, textPos)); @@ -765,7 +759,7 @@ void TCImplementation::Save(FILE *file) const throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position)."); alphabetrank->save(file); - sampled->Save(file); + sampled->save(file); suffixes->Save(file); suffixDocId->Save(file); @@ -811,7 +805,7 @@ TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position)."); alphabetrank = static_sequence::load(file); - sampled = new BSGAP(file); + sampled = static_bitsequence::load(file); suffixes = new BlockArray(file); suffixDocId = new BlockArray(file); @@ -940,8 +934,10 @@ void TCImplementation::makewavelet(uchar *bwt) //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; -// std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; -// HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove +#ifdef DEBUG_MEMUSAGE + std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove +#endif alphabet_mapper * am = new alphabet_mapper_none(); static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate? @@ -950,8 +946,10 @@ void TCImplementation::makewavelet(uchar *bwt) delete bmb; bwt = 0; // already deleted -// std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; -// std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +#endif } void TCImplementation::maketables() @@ -984,9 +982,9 @@ void TCImplementation::maketables() // Mapping from end-markers to doc ID's: BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); - ulong *sampledpositions = new ulong[n/W+1]; - for (ulong i=0;i samplerate) { - Tools::SetField(sampledpositions,1,p,1); + set_field(sampledpositions,1,p,1); (*positions)[sampleCount] = p; (*tmpSuffix)[sampleCount] = x; // FIXME remove sampleCount ++; @@ -1032,8 +1030,9 @@ void TCImplementation::maketables() } assert(textId == 0); - sampled = new BSGAP(sampledpositions,n,true); - sampleLength = sampled->rank(n-1); + sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); + delete [] sampledpositions; + sampleLength = sampled->rank1(n-1); assert(sampleCount == sampleLength); // Suffixes store an offset from the text start position @@ -1042,7 +1041,8 @@ void TCImplementation::maketables() for(ulong i=0; irank((*positions)[i]); + ulong j = sampled->rank1((*positions)[i]); + if (j==0) j=sampleLength; TextPosition textPos = (*tmpSuffix)[i]; (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos); @@ -1057,8 +1057,10 @@ void TCImplementation::maketables() delete positions; delete textStartPos; - // std::cerr << "heap usage before Doc: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - +#ifdef DEBUG_MEMUSAGE + std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif uint *tmp = new uint[numberOfTexts]; // FIXME Silly... for (unsigned i = 0; i < numberOfTexts; ++i) @@ -1073,9 +1075,11 @@ void TCImplementation::maketables() static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? Doc = new static_sequence_wvtree_noptrs(tmp, numberOfTexts, bmb, am); delete bmb; - delete [] tmp; + // delete [] tmp; // already deleted! -// std::cerr << "heap usage after Doc: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +#ifdef DEBUG_MEMUSAGE + std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +#endif /*document_result res = Doc->access(3, 3, 0, 3); cout << "Access result: "; diff --git a/TCImplementation.h b/TCImplementation.h index 0f0cab4..5430f21 100644 --- a/TCImplementation.h +++ b/TCImplementation.h @@ -23,7 +23,6 @@ #include "BitRank.h" #include "TextCollection.h" #include "BlockArray.h" -#include "BSGAP.h" // Include from XMLTree/libcds #include // Defines W == 32 @@ -118,7 +117,7 @@ private: static_sequence * alphabetrank; // Sample structures for texts longer than samplerate - BSGAP *sampled; // FIXME Replace with RRR02 + static_bitsequence * sampled; // FIXME Replace with RRR02 BlockArray *suffixes; BlockArray *suffixDocId;