From: nvalimak Date: Fri, 6 Mar 2009 13:39:35 +0000 (+0000) Subject: BWT via Difference cover X-Git-Url: http://git.nguyen.vg/gitweb/?p=SXSI%2FTextCollection.git;a=commitdiff_plain;h=35c8f7ba17cd67f73c3ecb933fa9ea0becaf16f1 BWT via Difference cover git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@211 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/CSA.cpp b/CSA.cpp index 04ace79..5b2b190 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -19,9 +19,16 @@ *****************************************************************************/ #include "CSA.h" #include "TextCollection.h" + +// Conflicts with std::min and std::max +#undef min +#undef max +#include "dcover/bwt.hpp" + +#include "HeapProfiler.h" // FIXME remove + #include #include -#include #include #include #include @@ -131,12 +138,13 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ CSA::CSA(unsigned samplerate) - : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0), - codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), - textLength(0), textStartPos(0), emptyTextRank(0) + : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0), + suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + emptyTextRank(0) { this->samplerate = samplerate; - + +#ifdef CSA_TEST_BWT // FIXME TODO : DynFMI needs distribution of characters before hand // This will create fully balanced wavelet tree for all chars [0, 255]. uchar temp[256]; @@ -149,6 +157,7 @@ CSA::CSA(unsigned samplerate) uchar *temp = Tools::GetFileContents("data.txt", 0); dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false); delete [] temp;*/ +#endif } /** @@ -160,7 +169,9 @@ CSA::CSA(unsigned samplerate) void CSA::InsertText(uchar const * text) { // Sanity check: +#ifdef CSA_TEST_BWT assert(dynFMI != 0); +#endif TextPosition m = std::strlen((char *)text) + 1; if (m > maxTextLength) @@ -171,20 +182,27 @@ void CSA::InsertText(uchar const * text) this->n += m; this->numberOfTexts ++; this->numberOfAllTexts ++; +#ifdef CSA_TEST_BWT dynFMI->addText(text, m); +#endif + + texts.append((char *) text); + texts.append(1, '\0'); } else { emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here - this->numberOfAllTexts ++; // Replace with dynamic bitvector + this->numberOfAllTexts ++; } } void CSA::MakeStatic() { // Sanity check: +#ifdef CSA_TEST_BWT if (dynFMI == 0) throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); +#endif // Bitvector of empty texts { @@ -197,53 +215,74 @@ void CSA::MakeStatic() emptyTextId.clear(); emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); } - - uchar *bwt = dynFMI->getBWT(); - /*printf("123456789012345678901234567890123456789\n"); - for (TextPosition i = 0; i < n; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); + +/* std::cout << "texts: "; + for (ulong i = 0; i < n; i++) + if (texts[i] > 50) + std::cout << texts[i]; else - printf("%c", bwt[i]); - printf("\n");*/ - - -/* for (TextPosition i = 1; i <= n; i ++) + std::cout << (int)texts[i]; + std::cout << std::endl;*/ + uchar *bwt = new uchar[n]; { - printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i)); - }*/ - - // Sanity check - assert(numberOfTexts == dynFMI->getCollectionSize()); + // FIXME More succinct solution needed + unsigned* maptexts = new unsigned[n+1]; + // Map text chars to [0..255+numberOfTexts] + unsigned count = 0; + for (ulong i = 0; i < n; ++i) + if (texts[i] == 0) + maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts] + else { + uchar c = (uchar)texts[i]; + maptexts[i] = (unsigned)c + numberOfTexts + 1; + } - delete dynFMI; - dynFMI = 0; + maptexts[n] = '\0'; + texts.erase(); + texts.reserve(0); // Release capacity + assert(count == numberOfTexts); - makewavelet(bwt); // Deletes bwt! - bwt = 0; + bwtEndPos = (ulong)compute_bwt(&maptexts[0], &maptexts[n], + &bwt[0], numberOfTexts + 1); - // Calculate BWT end-marker position (of last inserted text) - // and the length of the first text (counter l): - ulong i = 0; - ulong l = 1; + bwt[--bwtEndPos] = '\0'; + delete [] maptexts; + } // End of bw transform - uchar c = alphabetrank->access(i); - while (c != '\0') - { - i = C[c]+alphabetrank->rank(c, i)-1; - ++l; - c = alphabetrank->access(i); +#ifdef CSA_TEST_BWT + { + uchar *bwtTest = dynFMI->getBWT(); +/* printf("123456789012345678901234567890123456789\n"); + for (TextPosition i = 0; i < n && i < 100; i ++) + if (bwt[i] < 50) + printf("%d", (int)bwt[i]); + else + printf("%c", bwt[i]); + printf("\n"); + for (TextPosition i = 0; i < n && i < 100; i ++) + if (bwtTest[i] < 50) + printf("%d", (int)bwtTest[i]); + else + printf("%c", bwtTest[i]); + printf("\n");*/ + + // Sanity check + assert(numberOfTexts == dynFMI->getCollectionSize()); + + delete dynFMI; + dynFMI = 0; + for (ulong i = 0; i < n; ++i) + if (bwt[i] != bwtTest[i]) + { + std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; + assert(0); + } + delete [] bwtTest; } - bwtEndPos = i; - assert(bwtEndPos < n); - //printf("bwtEndPos = %lu\n", bwtEndPos); +#endif // CSA_TEST_BWT - // Build up arrays for text length and starting positions - // FIXME Remove, temp data - textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); - textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); - (*textLength)[0] = l; - (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables(). + makewavelet(bwt); // Deletes bwt! + bwt = 0; // Make sampling tables maketables(); @@ -766,19 +805,17 @@ void CSA::Save(FILE *file) const void CSA::Load(FILE *file, unsigned samplerate) { // Delete everything +#ifdef CSA_TEST_BWT delete dynFMI; dynFMI = 0; +#endif delete alphabetrank; alphabetrank = 0; delete sampled; sampled = 0; delete suffixes; suffixes = 0; delete suffixDocId; suffixDocId = 0; - delete positions; positions = 0; delete [] codetable; codetable = 0; delete endmarkerDocId; endmarkerDocId = 0; delete emptyTextRank; emptyTextRank = 0; - // FIXME Remove following: - delete textLength; textLength = 0; - delete textStartPos; textStartPos = 0; this->maxTextLength = 0; this->numberOfTexts = 0; @@ -823,7 +860,7 @@ void CSA::Load(FILE *file, unsigned samplerate) emptyTextRank = new BSGAP(file); // FIXME Construct data structures with new samplerate - //maketables(); // FIXME: this will redo text length tables + //maketables(); } @@ -832,80 +869,6 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ -/* - * Not supported -uchar * CSA::Substring(TextPosition i, TextPosition l) const -{ - uchar *result = new uchar[l + 1]; - if (l == 0) - { - result[0] = 0u; - return result; - } - - TextPosition dist; - TextPosition k = i + l - 1; - // Check for end of the string - if (k > n - 1) - { - l -= k - n + 1; - k = n - 1; - } - - TextPosition skip = samplerate - k % samplerate - 1; - TextPosition j; - if (k / samplerate + 1 >= n / samplerate) - { - j = bwtEndPos; - skip = n - k - 1; - } - else - { - j = (*positions)[k/samplerate+1]; - //cout << samplerate << ' ' << j << '\n'; - } - - for (dist = 0; dist < skip + l; dist++) - { - int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp); - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, j) - 1; - j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker - } - else - j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping - if (dist >= skip) - result[l + skip - dist - 1] = c; - } - result[l] = 0u; - return result; -} -*/ - /*TextPosition CSA::inverse(TextPosition i) -{ - TextPosition skip = samplerate - i % samplerate; - TextPosition j; - if (i / samplerate + 1 >= n / samplerate) - { - j = bwtEndPos; - skip = n - i; - } - else - { - j = (*positions)[i/samplerate+1]; - //cout << samplerate << ' ' << j << '\n'; - } - - while (skip > 0) - { - int c = alphabetrank->charAtPos(j); - j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping - skip --; - } - return j; - }*/ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { @@ -929,50 +892,17 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; } - /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) -{ - sp = C[(int)c]+alphabetrank->rank(c,sp-1); - ep = C[(int)c]+alphabetrank->rank(c,ep)-1; - - if (sp<=ep) - return ep - sp + 1; - else - return 0; - }*/ - -CSA::TextPosition CSA::Lookup(TextPosition i) const -{ - TextPosition dist=0; - while (!sampled->IsBitSet(i)) - { - - int c = alphabetrank->access(i); - if (c == '\0') - { - // End-markers are sampled - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - DocId docId = (*endmarkerDocId)[endmarkerRank]; - return (*textStartPos)[docId] + dist; - } - i = C[c]+alphabetrank->rank(c,i)-1; - ++dist; - } - - return (*suffixes)[sampled->rank(i)-1]+dist; -} - CSA::~CSA() { +#ifdef CSA_TEST_BWT delete dynFMI; +#endif delete alphabetrank; delete sampled; delete suffixes; delete suffixDocId; - delete positions; - delete [] codetable; + delete [] codetable; // FIXME remove delete endmarkerDocId; - delete textLength; - delete textStartPos; delete emptyTextRank; } @@ -989,11 +919,6 @@ void CSA::makewavelet(uchar *bwt) for (i=255;i>=min;--i) if (C[i]>0) {max = i; break;} - // Print frequencies - /*for(i = 0; i < 256; i++) - if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); - fflush(stdout);*/ - ulong prev=C[0], temp; C[0]=0; for (i=1;i<256;i++) { @@ -1006,6 +931,7 @@ void CSA::makewavelet(uchar *bwt) // delete [] bwt; //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + // FIXME: static_sequence_wvtree accepts only uint arrays uint *text = new uint[n]; for (i = 0; i < n; ++i) // Silly text[i] = bwt[i]; @@ -1014,52 +940,62 @@ void CSA::makewavelet(uchar *bwt) alphabet_mapper * am = new alphabet_mapper_none(); static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? - - //cout << "Building Huffman table..."; cout.flush(); - wt_coder * wtc = new wt_coder_huff(text,n,am); - - //cout << "done" << endl; cout.flush(); - //cout << "Building static_sequence..."; cout.flush(); - alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + delete bmb; delete [] text; text = 0; - -/* for (i = 0; i < n; ++i) - { - uchar c = alphabetrank->charAtPos(i); - TextPosition j = C[c]+alphabetrank->rank(c, i)-1; - printf("LF[%lu] = %lu\n", i, j); - }*/ } void CSA::maketables() { + // Calculate BWT end-marker position (of last inserted text) + // Note: bwtEndPos already set! + // and the length of the first text (counter l): +#ifdef CSA_TEST_BWT + ulong i = 0; + ulong l = 1; + + uchar c = alphabetrank->access(i); + while (c != '\0') + { + i = C[c]+alphabetrank->rank(c, i)-1; + ++l; + c = alphabetrank->access(i); + } + assert(i == bwtEndPos); // compare result +#endif + + // Build up arrays for text length and starting positions + // FIXME Temp, remove + //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); + BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); + //(*textLength)[0] = l; + (*textStartPos)[0] = 0; + + // Construct samples ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; - - ulong *sampledpositions = new ulong[n/W+1]; unsigned ceilLog2n = Tools::CeilLog2(n); - positions = new BlockArray(sampleLength, ceilLog2n); + BlockArray* positions = new BlockArray(sampleLength, ceilLog2n); BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); // Mapping from end-markers to doc ID's: endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); - ulong i,j=0; - for (i=0;iGetPos(i) x=(i==n-1)?0:i+1; @@ -1082,7 +1018,7 @@ void CSA::maketables() // Store text length and text start position: if (textId < (DocId)numberOfTexts - 1) { - (*textLength)[textId + 1] = posOfSuccEndmarker - x; + //(*textLength)[textId + 1] = posOfSuccEndmarker - x; (*textStartPos)[textId + 1] = x; // x is the position of end-marker. posOfSuccEndmarker = x; } @@ -1095,51 +1031,31 @@ void CSA::maketables() } assert(textId == 0); - /*i = 0; - for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i) - { - int docc = (*endmarkerDocId)[i]; - ulong poss = (*endmarkerPos)[i]; - printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss); - }*/ -/* - for (i = 0; i < numberOfTexts; ++ i) - { - //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl; - assert(dynTextLength[i].first == (*textLength)[i]); - assert(dynTextLength[i].second == (*textStartPos)[i]); - } -*/ sampled = new BSGAP(sampledpositions,n,true); sampleLength = sampled->rank(n-1); assert(sampleCount == sampleLength); -// std::cout << ";sampleLength;" << sampleLength << std::endl; - // Suffixes == offset from text start position + + // Suffixes store an offset from the text start position suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - //suffixes: - for(i=0; irank((*positions)[i]); + ulong j = sampled->rank((*positions)[i]); if (j==0) j=sampleLength; - TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate; - (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]); + TextPosition textPos = (*tmpSuffix)[i]; + (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos); - assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts); + assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts); assert((*suffixDocId)[j-1] < numberOfTexts); // calculate offset from text start: (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; } // FIXME Temp, remove delete tmpSuffix; - tmpSuffix=0; delete positions; - positions =0; - delete textLength; - textLength = 0; +// delete textLength; delete textStartPos; - textStartPos = 0; } @@ -1149,7 +1065,7 @@ void CSA::maketables() * Starting text position of the document is stored into second parameter. * Binary searching on text starting positions. */ -TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const { assert(i < n);