From 8bda359694b7233cd6154df9ac691eccaabfbb92 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Fri, 9 Jan 2009 13:37:00 +0000 Subject: [PATCH] Rewrote some data structures and algorithm parts git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@48 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- CSA.cpp | 384 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 201 insertions(+), 183 deletions(-) diff --git a/CSA.cpp b/CSA.cpp index db96b4b..5719154 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -121,7 +121,7 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { } -//////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////f/////////////////////////////// // Class CSA /** @@ -130,7 +130,8 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { */ CSA::CSA(unsigned samplerate) : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), - codetable(0), dynFMI(0) + codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0), + textLength(0), textStartPos(0) { this->samplerate = samplerate; @@ -141,6 +142,11 @@ CSA::CSA(unsigned samplerate) temp[i] = i+1; temp[255] = 0; dynFMI = new DynFMI(temp, 1, 255, false); + + /* Debug code: take char distribution from data.txt. + uchar *temp = Tools::GetFileContents("data.txt", 0); + dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false); + delete [] temp;*/ } /** @@ -152,12 +158,14 @@ CSA::CSA(unsigned samplerate) void CSA::InsertText(uchar const * text) { // Sanity check: - assert(dynFMI != 0); + assert(dynFMI != 0); TextPosition m = std::strlen((char *)text) + 1; - // Store text length and starting position - textLength.push_back(make_pair(m, n)); + if (m > maxTextLength) + maxTextLength = m; // Store length of the longest text seen so far. + this->n += m; + this->numberOfTexts ++; dynFMI->addText(text, m); } @@ -171,14 +179,14 @@ void CSA::MakeStatic() } uchar *bwt = dynFMI->getBWT(); - /*printf("1234567890123456789\n"); + /*printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n; i ++) if (bwt[i] < 50) printf("%d", (int)bwt[i]); else printf("%c", bwt[i]); - printf("\n"); - */ + printf("\n");*/ + /* for (TextPosition i = 1; i <= n; i ++) { @@ -186,42 +194,51 @@ void CSA::MakeStatic() }*/ // Sanity check - assert(textLength.size() == dynFMI->getCollectionSize()); + assert(numberOfTexts == dynFMI->getCollectionSize()); delete dynFMI; dynFMI = 0; - makewavelet(bwt); + makewavelet(bwt); // Calculate BWT end-marker position (of last inserted text) + // and the length of the first text (counter l): ulong i = 0; + ulong l = 1; while (bwt[i] != '\0') { uchar c = bwt[i]; i = C[c]+alphabetrank->rank(c, i)-1; + ++l; } bwtEndPos = i; //printf("bwtEndPos = %lu\n", bwtEndPos); delete [] bwt; + // Build up arrays for text length and starting positions + textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); + textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); + (*textLength)[0] = l; + (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables(). + // Make sampling tables maketables(); } bool CSA::EmptyText(DocId k) const { - assert(k < (DocId)textLength.size()); - return (1 == textLength[k].first); + assert(k < (DocId)numberOfTexts); + return (1 == (*textLength)[k]); } uchar* CSA::GetText(DocId k) const { - assert(k < (DocId)textLength.size()); + assert(k < (DocId)numberOfTexts); - uchar* result = new uchar[textLength[k].first]; + uchar* result = new uchar[(*textLength)[k]]; TextPosition i = k; - TextPosition j = textLength[k].first-1; + TextPosition j = (*textLength)[k] - 1; result[j] = '\0'; uchar c = alphabetrank->charAtPos(i); @@ -239,32 +256,30 @@ uchar* CSA::GetText(DocId k) const uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const { - assert(k < (DocId)textLength.size()); - assert(j < textLength[k].first); + assert(k < (DocId)numberOfTexts); + assert(j < (*textLength)[k]); assert(i <= j); // Start position of k'th text - ulong start = textLength[k].second; + ulong start = (*textStartPos)[k]; return Substring(i + start, j-i+1); } -/** +/****************************************************************** * Existential queries */ bool CSA::IsPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return true; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } @@ -280,17 +295,15 @@ bool CSA::IsSuffix(uchar const *pattern) const bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); - if (m == 0) - return textLength.size(); + //if (m == 0) + // return false; // FIXME Check for empty texts! TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } @@ -313,39 +326,31 @@ bool CSA::IsContains(uchar const * pattern) const bool CSA::IsLessThan(uchar const*) const { // TODO + assert(0); return false; } -/** +/****************************************************************** * Counting queries */ unsigned CSA::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfTexts; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfTexts; TextPosition sp = 0, ep = 0; // Search with end-marker @@ -357,24 +362,15 @@ unsigned CSA::CountSuffix(uchar const * pattern) const unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); - if (m == 0) - return textLength.size(); +// if (m == 0) +// return ; // FIXME Test for empty texts. TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountContains(uchar const * pattern) const @@ -388,6 +384,7 @@ unsigned CSA::CountContains(uchar const * pattern) const unsigned CSA::CountLessThan(uchar const * pattern) const { // TODO + assert(0); return 0; } @@ -406,18 +403,22 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker we found belongs to the "previous" doc in the collection - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; result.push_back(docId); - ++ count; - ++ it; + -- resultSize; + ++ i; } - + return result; } @@ -445,15 +446,16 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; result.push_back(docId); } - else + else // Sampled position { - TextPosition textPos = suffixes[sampled->rank(i)-1]; + TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; result.push_back(DocIdAtTextPos(textPos)); } } @@ -474,18 +476,22 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; result.push_back(docId); - ++ count; - ++ it; + -- resultSize; + ++ i; } - + return result; } @@ -515,15 +521,16 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in collection - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; resultSet.insert(docId); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1]; + TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; resultSet.insert(DocIdAtTextPos(textPos)); } } @@ -538,6 +545,7 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextCollection::document_result CSA::LessThan(uchar const * pattern) const { // TODO + assert(0); return document_result(); } @@ -570,18 +578,19 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in the collection: - DocId docId = (endm.first+1)%textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; result.push_back(make_pair(docId, dist)); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1] + dist; + TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist; // Read document identifier and its starting position in text order DocId docId = DocIdAtTextPos(textPos); - result.push_back(make_pair(docId, textPos - textLength[docId].second)); + result.push_back(make_pair(docId, textPos - (*textStartPos)[docId])); } } @@ -600,24 +609,39 @@ TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const assert(i < n); DocId a = 0; - DocId b = textLength.size() - 1; + DocId b = numberOfTexts - 1; while (a < b) { DocId c = a + (b - a)/2; - if (textLength[c].second > i) + if ((*textStartPos)[c] > i) b = c - 1; - else if (textLength[c+1].second > i) + else if ((*textStartPos)[c+1] > i) return c; else a = c + 1; } - assert(a < (DocId)textLength.size()); - assert(i > textLength[a].second); - assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second)); + assert(a < (DocId)numberOfTexts); + assert(i > (*textStartPos)[a]); + assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); return a; } +/** + * Count end-markers in given interval + */ +unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const +{ + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + return alphabetrank->rank(0, ep) - ranksp; +} + /** * Save index to a file handle * @@ -630,10 +654,12 @@ TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const * end marker position in BWT; * BWT string of length n; * number of texts; + * length of longest text; * * array of pairs. * * TODO: Save the data structures instead of BWT sequence? + * TODO: Don't save textLength and textStartPos arrays. */ void CSA::Save(FILE *file) const { @@ -658,15 +684,18 @@ void CSA::Save(FILE *file) const throw std::runtime_error("CSA::Save(): file write error (bwt sequence)."); } - unsigned r = textLength.size(); + unsigned r = numberOfTexts; if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (r)."); + + if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); - for (r = 0; r < textLength.size(); ++ r) + for (r = 0; r < numberOfTexts; ++ r) { - if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1) + if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (text length)."); - if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1) + if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (text start)."); } } @@ -688,9 +717,12 @@ void CSA::Load(FILE *file, unsigned samplerate) delete [] positions; positions = 0; delete [] codetable; codetable = 0; - endmarkers.clear(); - textLength.clear(); + delete endmarkerDocId; endmarkerDocId = 0; + delete textLength; textLength = 0; + delete textStartPos; textStartPos = 0; + this->maxTextLength = 0; + this->numberOfTexts = 0; this->samplerate = samplerate; this->n = 0; @@ -719,7 +751,14 @@ void CSA::Load(FILE *file, unsigned samplerate) unsigned r = 0; if (std::fread(&r, sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (r)."); + + if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); + // Build up arrays for text length and starting positions + textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength)); + textStartPos = new BlockArray(r, Tools::CeilLog2(this->n)); + while (r > 0) { TextPosition length = 0, start = 0; @@ -728,14 +767,16 @@ void CSA::Load(FILE *file, unsigned samplerate) if (std::fread(&start, sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (text start)."); - textLength.push_back(make_pair(length, start)); + (*textLength)[numberOfTexts] = length; + (*textStartPos)[numberOfTexts] = start; + ++numberOfTexts; --r; } // Construct data structures makewavelet(bwt); delete [] bwt; - maketables(); + maketables(); // FIXME: this will redo text length tables } @@ -771,7 +812,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[k/samplerate+1]; + j = (*positions)[k/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } @@ -780,9 +821,9 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const int c = alphabetrank->charAtPos(j); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(j))->second; - j = endm.first; // LF-mapping for end-marker + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, j) - 1; + j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker } else j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping @@ -804,7 +845,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[i/samplerate+1]; + j = (*positions)[i/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } @@ -857,24 +898,29 @@ CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(sample int c = alphabetrank->charAtPos(i); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - return endm.second + dist; // returns text position. + // End-markers are sampled + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + DocId docId = (*endmarkerDocId)[endmarkerRank]; + return (*textStartPos)[docId] + dist; } i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping ++dist; } - return suffixes[sampled->rank(i)-1]+dist; + return (*suffixes)[sampled->rank(i)-1]+dist; } CSA::~CSA() { delete dynFMI; delete alphabetrank; delete sampled; - delete [] suffixes; - delete [] positions; + delete suffixes; + delete positions; delete [] codetable; + + delete endmarkerDocId; + delete textLength; + delete textStartPos; } void CSA::makewavelet(uchar *bwt) @@ -891,7 +937,7 @@ void CSA::makewavelet(uchar *bwt) if (C[i]>0) {max = i; break;} // Print frequencies -/* for(i = 0; i < 256; i++) + /*for(i = 0; i < 256; i++) if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); fflush(stdout);*/ @@ -919,34 +965,51 @@ void CSA::maketables() ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; ulong *sampledpositions = new ulong[n/W+1]; - suffixes = new ulong[sampleLength]; - positions = new ulong[sampleLength]; - + unsigned ceilLog2n = Tools::CeilLog2(n); + suffixes = new BlockArray(sampleLength, ceilLog2n); + positions = new BlockArray(sampleLength, ceilLog2n); + + // Mapping from end-markers to doc ID's: + endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); + ulong i,j=0; for (i=0;iGetPos(i) x=(i==n-1)?0:i+1; if (x % samplerate == 0) { Tools::SetField(sampledpositions,1,p,1); - positions[x/samplerate] = p; + (*positions)[x/samplerate] = p; } uchar c = alphabetrank->charAtPos(p); if (c == '\0') { - // Record the order of end-markers in BWT: --textId; - endmarkers[p] = make_pair(textId, (TextPosition)x); + + // Record the order of end-markers in BWT: + ulong endmarkerRank = alphabetrank->rank(0, p) - 1; + (*endmarkerDocId)[endmarkerRank] = textId; + + // Store text length and text start position: + if (textId < (DocId)numberOfTexts - 1) + { + (*textLength)[textId + 1] = posOfSuccEndmarker - x; + (*textStartPos)[textId + 1] = x; // x is the position of end-marker. + posOfSuccEndmarker = x; + } + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). p = textId; // Correct LF-mapping to the last char of the previous text. } @@ -954,77 +1017,32 @@ void CSA::maketables() p = C[c]+alphabetrank->rank(c, p)-1; } assert(textId == 0); - /*for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it) + + /*i = 0; + for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i) { - printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second); + int docc = (*endmarkerDocId)[i]; + ulong poss = (*endmarkerPos)[i]; + printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss); }*/ - +/* + for (i = 0; i < numberOfTexts; ++ i) + { + //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl; + assert(dynTextLength[i].first == (*textLength)[i]); + assert(dynTextLength[i].second == (*textStartPos)[i]); + } +*/ sampled = new BitRank(sampledpositions,n,true); //suffixes: for(i=0; irank(positions[i]); + j = sampled->rank((*positions)[i]); if (j==0) j=sampleLength; - suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate; + (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate; } } -uchar * CSA::LoadFromFile(const char *filename) -{ - uchar *s; - std::ifstream file (filename, std::ios::in|std::ios::binary); - if (file.is_open()) - { - std::cerr << "Loading CSA from file: " << filename << std::endl; - file.read((char *)&bwtEndPos, sizeof(TextPosition)); - s = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - file.read((char *)(s + offset), sizeof(char)); - file.close(); - } - else - { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); - } - return s; -} - -void CSA::SaveToFile(const char *filename, uchar *bwt) -{ - std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc); - if (file.is_open()) - { - std::cerr << "Writing CSA to file: " << filename << std::endl; - file.write((char *)&bwtEndPos, sizeof(TextPosition)); - std::cerr << "Writing BWT of " << n << " bytes." << std::endl; - for (ulong offset = 0; offset < n; offset ++) - file.write((char *)(bwt + offset), sizeof(char)); - file.close(); - } - else - { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); - } -} - -/*uchar * CSA::BWT(uchar *text) -{ - uchar *s; - - DynFMI *wt = new DynFMI((uchar *) text, n); - s = wt->getBWT(); - for (ulong i=0;i