From 1c0ccf5923b31eceae88afc64f1b4727cd488643 Mon Sep 17 00:00:00 2001
From: nvalimak
Date: Sat, 12 Dec 2009 21:21:39 +0000
Subject: [PATCH] Added support for non-indexed texts

git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@623 3cdefd35-fc62-479d-8e8d-bae585ffb9ca
---
 TCImplementation.cpp | 123 ++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 47 deletions(-)

diff --git a/TCImplementation.cpp b/TCImplementation.cpp
index 09f6d24..4df5c69 100644
--- a/TCImplementation.cpp
+++ b/TCImplementation.cpp
@@ -48,7 +48,8 @@ const uchar TCImplementation::versionFlag = 8;
  * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
  */
 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_,
-    unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType)
+    unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_,
+    CSA::DeltaVector & notIndexed, const string & niText, char tsType)
     : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
       suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
 {
@@ -56,7 +57,7 @@ TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerat
     bwt = 0;
 
     // Make sampling tables
-    maketables(numberOfSamples_, tsType);
+    maketables(numberOfSamples_, tsType, notIndexed, niText);
 }
 
 bool TCImplementation::EmptyText(DocId k) const
@@ -93,8 +94,9 @@ uchar * TCImplementation::GetText(DocId k) const
         res[i-j-1] = result[j];
     return res;*/
 }
+
 /*
- * Not supported
+ * Substring queries are supported via the pointer returned by TextStorage::GetText
 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
 {
     assert(k < (DocId)numberOfTexts);
@@ -109,6 +111,8 @@ uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
 
     return Substring(i + start, j-i+1);
 }*/
+
+
 /******************************************************************
  * Existential queries
  */
@@ -1036,7 +1040,7 @@ void TCImplementation::makewavelet(uchar *bwt)
 #endif
 }
 
-void TCImplementation::maketables(ulong sampleLength, char tsType)
+void TCImplementation::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText)
 {
     // Calculate BWT end-marker position (of last inserted text)
     {
@@ -1080,15 +1084,18 @@ void TCImplementation::maketables(ulong sampleLength, char tsType)
     ulongmax--;
     uint alphabetrank_i_tmp =0;
 
-    TextStorageBuilder tsbuilder(n);
-    Tools::StartTimer();
+    // Text length = n + number of bytes not indexed.
+    TextStorageBuilder tsbuilder(n + niText.length());
+    ulong tsb_i = n + niText.length(); // Iterator from text length to 0.
+    string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts
 
     for (ulong i=n-1;i<n;--i) {
         // i substitutes SA->GetPos(i)
         x=(i==n-1)?0:i+1;
 
         uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
-        tsbuilder[i] = c;
+
+        tsbuilder[--tsb_i] = c; // Build TextStorage
 
         if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
         {
@@ -1099,27 +1106,71 @@ void TCImplementation::maketables(ulong sampleLength, char tsType)
 
         if (c == '\0')
         {
-            --textId;
+            unsigned prevTextId = textId; // Cache textId value.
+            --textId;
+            /**
+             * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus,
+             * we have to search for the first text that is actually *indexed*
+             * to get correct prevTextId.
+             */
+            if (prevTextId == numberOfTexts)
+            {
+                prevTextId = 0;
+                while (notIndexed.isSet(prevTextId))
+                    ++ prevTextId;
+                // Now prevTextId points to the first indexed Doc ID.
+            }
+
+            /**
+             * Insert non-indexed texts
+             */
+            while (notIndexed.isSet(textId))
+            {
+                do {
+                    tsbuilder[tsb_i] = *nit_i;
+                    -- tsb_i;
+                    ++ nit_i;
+                } while (nit_i != niText.rend() && *nit_i != '\0');
+
+                tsbuilder[tsb_i] = '\0';
+
+                if (textId == 0)
+                    break;
+                --textId;
+            }
 
             // Record the order of end-markers in BWT:
             ulong endmarkerRank = alphabetrank_i_tmp - 1;
             //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
-            (*endmarkerDocId)[endmarkerRank] = (textId + 1) % numberOfTexts;
+            (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts;
 
             // Store text length and text start position:
             if (textId < (DocId)numberOfTexts - 1)
             {
                 // (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker.
+                posOfSuccEndmarker = i;
             }
 
-            // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
-            p = textId; // Correct LF-mapping to the last char of the previous text.
+            // LF-mapping from '\0' does not work with this (pseudo) BWT.
+            // Correct LF-mapping to the last char of the previous text:
+            p = textId - notIndexed.rank(textId);
         }
         else // Now c != '\0', do LF-mapping:
             p = C[c]+alphabetrank_i_tmp-1;
     }
 
+    while (textId > 0 && notIndexed.isSet(textId-1))
+    {
+        do {
+            -- tsb_i;
+            tsbuilder[tsb_i] = *nit_i;
+            ++ nit_i;
+        } while (nit_i != niText.rend() && *nit_i != '\0');
+        --textId;
+    }
     assert(textId == 0);
+    assert(tsb_i == 0);
+    assert(nit_i == niText.rend());
 
 #ifdef DEBUG_MEMUSAGE
     std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
@@ -1147,16 +1198,21 @@ void TCImplementation::maketables(ulong sampleLength, char tsType)
     suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
     suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
 
-    x = n - 2;
-    posOfSuccEndmarker = n-1;
-    for(ulong i=0; i<sampleLength; i++)
+    x = n - 2;
+    posOfSuccEndmarker = n-1;
+    textId = numberOfTexts - 1;
+
+    for(ulong i=0; i<sampleLength; i++)
     {
         // Find next sampled text position
         while ((*positions)[i] < x)
         {
             --x;
             if (textStorage->IsEndmarker(x))
+            {
                 posOfSuccEndmarker = x--;
+                -- textId;
+            }
         }
         assert((*positions)[i] < n);
         ulong j = sampled->rank1((*positions)[i]);
@@ -1164,14 +1220,18 @@ void TCImplementation::maketables(ulong sampleLength, char tsType)
         assert(j != 0); // if (j==0) j=sampleLength;
 
         TextPosition textPos = (x==n-1)?0:x+1;
-        (*suffixDocId)[j-1] = textStorage->DocIdAtTextPos(textPos);
+        (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos);
+        assert(textStorage->DocIdAtTextPos(textPos) == textId);
         assert((*suffixDocId)[j-1] < numberOfTexts);
         // calculate offset from text start:
         (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
 
         --x;
         if (x != ~0lu && textStorage->IsEndmarker(x))
+        {
             posOfSuccEndmarker = x--;
+            -- textId;
+        }
     }
 
     delete positions;
@@ -1181,37 +1241,6 @@ void TCImplementation::maketables(ulong sampleLength, char tsType)
     HeapProfiler::ResetMaxHeapConsumption();
 #endif
-    /**
-     * Second pass: check tables
-     */
-/* p=bwtEndPos;
-    textId = numberOfTexts;
-    for (ulong i=n-1;i<n;--i) {
-        x=(i==n-1)?0:i+1;
-
-        if (sampled->access(p)) {
-            ulong j = sampled->rank1(p)-1;
-            assert((*suffixDocId)[j] == DocIdAtTextPos(textStartPos, x));
-
-            // calculate offset from text start:
-            assert((*suffixes)[j] == x - (*textStartPos)[(*suffixDocId)[j]]);
-        }
-
-        uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
-
-        if (c == '\0')
-        {
-            --textId;
-            // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
-            p = textId; // Correct LF-mapping to the last char of the previous text.
-        }
-        else // Now c != '\0', do LF-mapping:
-            p = C[c]+alphabetrank_i_tmp-1;
-    }
-    assert(textId == 0);
-    delete textStartPos
-*/
-
 #ifdef DEBUG_MEMUSAGE
     std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
     HeapProfiler::ResetMaxHeapConsumption();
 #endif
-- 
2.17.1
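
Note: the bookkeeping implemented by the maketables() changes above can be summarized apart from the BWT traversal. Documents flagged in notIndexed stay out of the index but are still spliced into the text storage at their original positions, so GetText() and DocIdAtTextPos() keep the original document numbering, while anything that maps a global document ID into the indexed-only numbering has to subtract the non-indexed documents in front of it (the p = textId - notIndexed.rank(textId) correction). The sketch below is a minimal standalone model of that idea, not the library code: std::vector<bool> and a counting loop stand in for CSA::DeltaVector and its rank() (whose exact counting convention may differ), and the function names are illustrative only.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Stand-in for the rank query on the not-indexed bitvector: how many
// non-indexed documents have an id strictly smaller than docId.
// (Assumption: the real CSA::DeltaVector::rank() may count inclusively.)
static std::size_t notIndexedBefore(const std::vector<bool> &notIndexed,
                                    std::size_t docId)
{
    std::size_t r = 0;
    for (std::size_t k = 0; k < docId; ++k)
        if (notIndexed[k])
            ++r;
    return r;
}

// Position of an *indexed* document among the indexed documents only;
// the same correction the patch applies to the LF-mapping target.
static std::size_t indexedId(const std::vector<bool> &notIndexed,
                             std::size_t docId)
{
    assert(!notIndexed[docId]);
    return docId - notIndexedBefore(notIndexed, docId);
}

// The text storage keeps every document, '\0'-terminated, at its
// original position, so document ids and start positions are the same
// whether or not a document was indexed.
static std::string buildStorage(const std::vector<std::string> &texts)
{
    std::string storage;
    for (const std::string &t : texts)
    {
        storage += t;
        storage += '\0';
    }
    return storage;
}

int main()
{
    //                            doc 0   doc 1 (not indexed)  doc 2
    std::vector<std::string> texts      = { "abra", "cadabra", "foo" };
    std::vector<bool>        notIndexed = { false,  true,      false };

    // doc 2 keeps id 2 in the text storage, but inside the index it is
    // only the second indexed document, i.e. id 1.
    assert(indexedId(notIndexed, 0) == 0);
    assert(indexedId(notIndexed, 2) == 1);

    // All three documents, indexed or not, end up in the storage.
    assert(buildStorage(texts) == std::string("abra\0cadabra\0foo\0", 17));
    return 0;
}

In the patch itself the same two facts drive the new loops in maketables(): the tsbuilder/nit_i loops splice the non-indexed bytes into the text storage, and the rank() subtraction shrinks global document ids down to the indexed-only numbering used by the (pseudo) BWT.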