X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.cpp;h=04ace79dd25e8b21f737198b0b7c892af887db1a;hb=663cd2f6cc5e3796d001e8c527de0aea8c8bbf68;hp=754dfa8eecb797ea2ef159311596ef904e878fdd;hpb=ccc18959e93d8986c0ce81209865a0a2b5f42be6;p=SXSI%2FTextCollection.git diff --git a/CSA.cpp b/CSA.cpp index 754dfa8..04ace79 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -35,10 +35,12 @@ using std::map; using SXSI::TextCollection; +// Save file version info +const uchar CSA::versionFlag = 2; //////////////////////////////////////////////////////////////////////////// // Class CSA::THuffAlphabetRank - +// FIXME Unused code CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) { left = NULL; right = NULL; @@ -129,8 +131,9 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ CSA::CSA(unsigned samplerate) - : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), - codetable(0), dynFMI(0) + : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0), + codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + textLength(0), textStartPos(0), emptyTextRank(0) { this->samplerate = samplerate; @@ -141,6 +144,11 @@ CSA::CSA(unsigned samplerate) temp[i] = i+1; temp[255] = 0; dynFMI = new DynFMI(temp, 1, 255, false); + + /* Debug code: take char distribution from data.txt. + uchar *temp = Tools::GetFileContents("data.txt", 0); + dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false); + delete [] temp;*/ } /** @@ -152,33 +160,53 @@ CSA::CSA(unsigned samplerate) void CSA::InsertText(uchar const * text) { // Sanity check: - assert(dynFMI != 0); + assert(dynFMI != 0); TextPosition m = std::strlen((char *)text) + 1; - // Store text length and starting position - textLength.push_back(make_pair(m, n)); - this->n += m; - dynFMI->addText(text, m); + if (m > maxTextLength) + maxTextLength = m; // Store length of the longest text seen so far. + + if (m > 1) + { + this->n += m; + this->numberOfTexts ++; + this->numberOfAllTexts ++; + dynFMI->addText(text, m); + } + else + { + emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here + this->numberOfAllTexts ++; // Replace with dynamic bitvector + } } void CSA::MakeStatic() { // Sanity check: if (dynFMI == 0) + throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); + + // Bitvector of empty texts { - std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; - std::exit(0); + //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) + Tools::SetField(tempB, 1, (*it), 1); + emptyTextId.clear(); + emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); } - + uchar *bwt = dynFMI->getBWT(); - /*printf("1234567890123456789\n"); + /*printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n; i ++) if (bwt[i] < 50) printf("%d", (int)bwt[i]); else printf("%c", bwt[i]); - printf("\n"); - */ + printf("\n");*/ + /* for (TextPosition i = 1; i <= n; i ++) { @@ -186,87 +214,132 @@ void CSA::MakeStatic() }*/ // Sanity check - assert(textLength.size() == dynFMI->getCollectionSize()); + assert(numberOfTexts == dynFMI->getCollectionSize()); delete dynFMI; dynFMI = 0; - makewavelet(bwt); + makewavelet(bwt); // Deletes bwt! + bwt = 0; // Calculate BWT end-marker position (of last inserted text) + // and the length of the first text (counter l): ulong i = 0; - while (bwt[i] != '\0') + ulong l = 1; + + uchar c = alphabetrank->access(i); + while (c != '\0') { - uchar c = bwt[i]; i = C[c]+alphabetrank->rank(c, i)-1; + ++l; + c = alphabetrank->access(i); } bwtEndPos = i; + assert(bwtEndPos < n); //printf("bwtEndPos = %lu\n", bwtEndPos); - delete [] bwt; - + // Build up arrays for text length and starting positions + // FIXME Remove, temp data + textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); + textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); + (*textLength)[0] = l; + (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables(). + // Make sampling tables maketables(); } +bool CSA::EmptyText(DocId k) const +{ + assert(k < (DocId)numberOfTexts); + if (emptyTextRank->IsBitSet(k)) + return true; + return false; +} uchar* CSA::GetText(DocId k) const { - assert(k < (DocId)textLength.size()); + assert(k < (DocId)numberOfTexts); - uchar* result = new uchar[textLength[k].first]; + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + // Map to non-empty text + k -= textRank; //emptyTextRank->rank(k); + TextPosition i = k; - TextPosition j = textLength[k].first-1; - result[j] = '\0'; - uchar c = alphabetrank->charAtPos(i); + string result; + // Reserve average string length to avoid reallocs + result.reserve(n/numberOfTexts); + + uchar c = alphabetrank->access(i); while (c != '\0') { - --j; - result[j] = c; + result.push_back(c); i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->charAtPos(i); // "next" char. + c = alphabetrank->access(i); // "next" char. } - assert(j == 0); - return result; + + // Convert to uchar (FIXME return string?) + i = result.size(); + uchar* res = new uchar[i+1]; + res[i] = '\0'; + for (ulong j = 0; j < i; ++j) + res[i-j-1] = result[j]; + return res; } - +/* + * Not supported uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const { - assert(k < (DocId)textLength.size()); - assert(j < textLength[k].first); + assert(k < (DocId)numberOfTexts); + assert(j < (*textLength)[k]); assert(i <= j); + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + + // Map to non-empty text + k -= textRank; // emptyTextRank->rank(k); + // Start position of k'th text - ulong start = textLength[k].second; + ulong start = (*textStartPos)[k]; return Substring(i + start, j-i+1); -} + }*/ -/** +/****************************************************************** * Existential queries */ bool CSA::IsPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return true; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } bool CSA::IsSuffix(uchar const *pattern) const { - // Here counting is as fast as isSuffix(): + // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern) > 0) return true; return false; @@ -276,16 +349,18 @@ bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); if (m == 0) - return textLength.size(); + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } @@ -308,39 +383,42 @@ bool CSA::IsContains(uchar const * pattern) const bool CSA::IsLessThan(uchar const*) const { // TODO + assert(0); return false; } -/** +/****************************************************************** * Counting queries */ +ulong CSA::Count(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return 0; + + TextPosition sp = 0, ep = 0; + unsigned count = (unsigned) Search(pattern, m, &sp, &ep); + return count; +} + unsigned CSA::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts; TextPosition sp = 0, ep = 0; // Search with end-marker @@ -353,27 +431,22 @@ unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts - numberOfTexts; // Empty texts. TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountContains(uchar const * pattern) const { + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + // Here counting is as slow as fetching the result set // because we would have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); @@ -383,6 +456,7 @@ unsigned CSA::CountContains(uchar const * pattern) const unsigned CSA::CountLessThan(uchar const * pattern) const { // TODO + assert(0); return 0; } @@ -393,7 +467,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -401,18 +475,28 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker we found belongs to the "previous" doc in the collection - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); - ++ count; - ++ it; + + -- resultSize; + ++ i; } - + return result; } @@ -420,36 +504,46 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; // Search with end-marker Search(pattern, m+1, &sp, &ep); TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); } + // Assert: c == '\0' OR sampled->IsBitSet(i) + if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); } - else + else // Sampled position { - TextPosition textPos = suffixes[sampled->rank(i)-1]; - result.push_back(DocIdAtTextPos(textPos)); + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); } } @@ -460,7 +554,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all empty texts TextPosition sp = 0, ep = 0; // Match including end-marker @@ -469,18 +563,27 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); - ++ count; - ++ it; + + -- resultSize; + ++ i; } - + return result; } @@ -494,45 +597,50 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); - + // We want unique document indentifiers, using std::set to collect them std::set resultSet; + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + c = alphabetrank->access(i); } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in collection - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; resultSet.insert(docId); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1]; - resultSet.insert(DocIdAtTextPos(textPos)); + DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); } } - // Convert std::set to std::vector (TODO better way to construct result vector?) - TextCollection::document_result result; - for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) - result.push_back(*it); + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); return result; } TextCollection::document_result CSA::LessThan(uchar const * pattern) const { // TODO + assert(0); return document_result(); } @@ -543,40 +651,48 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return full_result(); + return full_result(); // FIXME Throw exception? TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); ++ dist; } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in the collection: - DocId docId = (endm.first+1)%textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(make_pair(docId, dist)); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1] + dist; - // Read document identifier and its starting position in text order - DocId docId = DocIdAtTextPos(textPos); - result.push_back(make_pair(docId, textPos - textLength[docId].second)); + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(make_pair(docId, textPos)); } } @@ -584,86 +700,60 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const } -/** - * Finds document identifier for given text position - * - * Starting text position of the document is stored into second parameter. - * Binary searching on text starting positions. - */ -TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const -{ - assert(i < n); - - DocId a = 0; - DocId b = textLength.size() - 1; - while (a < b) - { - DocId c = a + (b - a)/2; - if (textLength[c].second > i) - b = c - 1; - else if (textLength[c+1].second > i) - return c; - else - a = c + 1; - } - - assert(a < (DocId)textLength.size()); - assert(i > textLength[a].second); - assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second)); - return a; -} - /** * Save index to a file handle * * Throws a std::runtime_error exception on i/o error. * First byte that is saved represents the version number of the save file. - * In version 1 files, the data fields are: - * version info; - * samplerate; - * length of the BWT; - * end marker position in BWT; - * BWT string of length n; - * number of texts; - * - * array of pairs. - * - * TODO: Save the data structures instead of BWT sequence? + * In version 2 files, the data fields are: + * uchar versionFlag; + TextPosition n; + unsigned samplerate; + unsigned C[256]; + TextPosition bwtEndPos; + static_sequence * alphabetrank; + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + unsigned numberOfTexts; + unsigned numberOfAllTexts; + ulong maxTextLength; + BlockArray *endmarkerDocId; + BSGAP *emptyTextRank; */ void CSA::Save(FILE *file) const { - // Saving version 1 data: - uchar versionFlag = 1; + // Saving version info: if (std::fwrite(&versionFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (version flag)."); + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (n)."); if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (samplerate)."); - if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (C table)."); if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); - for (ulong offset = 0; offset < n; offset ++) - { - uchar c = alphabetrank->charAtPos(offset); - if (std::fwrite(&c, sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (bwt sequence)."); - } - - unsigned r = textLength.size(); - if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (r)."); + alphabetrank->save(file); + sampled->Save(file); + suffixes->Save(file); + suffixDocId->Save(file); - for (r = 0; r < textLength.size(); ++ r) - { - if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text length)."); - if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text start)."); - } + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfTexts)."); + if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts)."); + if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); + + endmarkerDocId->Save(file); + emptyTextRank->Save(file); + fflush(file); } @@ -679,58 +769,61 @@ void CSA::Load(FILE *file, unsigned samplerate) delete dynFMI; dynFMI = 0; delete alphabetrank; alphabetrank = 0; delete sampled; sampled = 0; - delete [] suffixes; suffixes = 0; - delete [] positions; positions = 0; + delete suffixes; suffixes = 0; + delete suffixDocId; suffixDocId = 0; + delete positions; positions = 0; delete [] codetable; codetable = 0; - endmarkers.clear(); - textLength.clear(); + delete endmarkerDocId; endmarkerDocId = 0; + delete emptyTextRank; emptyTextRank = 0; + // FIXME Remove following: + delete textLength; textLength = 0; + delete textStartPos; textStartPos = 0; + this->maxTextLength = 0; + this->numberOfTexts = 0; + this->numberOfAllTexts = 0; this->samplerate = samplerate; this->n = 0; - uchar versionFlag = 0; - if (std::fread(&versionFlag, 1, 1, file) != 1) + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (version flag)."); - if (versionFlag != 1) - throw std::runtime_error("CSA::Load(): invalid start byte."); + if (verFlag != CSA::versionFlag) + throw std::runtime_error("CSA::Load(): invalid save file version."); + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (n)."); if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (samplerate)."); - if (this->samplerate == 0) - this->samplerate = samplerate; +// FIXME samplerate can not be changed during load. +// if (this->samplerate == 0) +// this->samplerate = samplerate; - if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (C table)."); if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); - uchar *bwt = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (bwt sequence)."); + alphabetrank = static_sequence::load(file); + sampled = new BSGAP(file); + suffixes = new BlockArray(file); + suffixDocId = new BlockArray(file); - unsigned r = 0; - if (std::fread(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (r)."); - - while (r > 0) - { - TextPosition length = 0, start = 0; - if (std::fread(&length, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text length)."); - if (std::fread(&start, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text start)."); - - textLength.push_back(make_pair(length, start)); - --r; - } + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfTexts)."); + if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts)."); + if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); - // Construct data structures - makewavelet(bwt); - delete [] bwt; - maketables(); + endmarkerDocId = new BlockArray(file); + emptyTextRank = new BSGAP(file); + + // FIXME Construct data structures with new samplerate + //maketables(); // FIXME: this will redo text length tables } @@ -739,6 +832,8 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ +/* + * Not supported uchar * CSA::Substring(TextPosition i, TextPosition l) const { uchar *result = new uchar[l + 1]; @@ -766,18 +861,18 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[k/samplerate+1]; + j = (*positions)[k/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } for (dist = 0; dist < skip + l; dist++) { - int c = alphabetrank->charAtPos(j); + int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(j))->second; - j = endm.first; // LF-mapping for end-marker + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, j) - 1; + j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker } else j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping @@ -787,7 +882,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const result[l] = 0u; return result; } - +*/ /*TextPosition CSA::inverse(TextPosition i) { TextPosition skip = samplerate - i % samplerate; @@ -799,7 +894,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[i/samplerate+1]; + j = (*positions)[i/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } @@ -812,7 +907,8 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const return j; }*/ -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; TextPosition i=m-1; @@ -831,7 +927,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return ep - sp + 1; else return 0; - } +} /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) { @@ -844,32 +940,40 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; }*/ -CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +CSA::TextPosition CSA::Lookup(TextPosition i) const { TextPosition dist=0; while (!sampled->IsBitSet(i)) { - int c = alphabetrank->charAtPos(i); + + int c = alphabetrank->access(i); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - return endm.second + dist; // returns text position. + // End-markers are sampled + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + DocId docId = (*endmarkerDocId)[endmarkerRank]; + return (*textStartPos)[docId] + dist; } - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + i = C[c]+alphabetrank->rank(c,i)-1; ++dist; } - return suffixes[sampled->rank(i)-1]+dist; + return (*suffixes)[sampled->rank(i)-1]+dist; } CSA::~CSA() { delete dynFMI; delete alphabetrank; delete sampled; - delete [] suffixes; - delete [] positions; + delete suffixes; + delete suffixDocId; + delete positions; delete [] codetable; + + delete endmarkerDocId; + delete textLength; + delete textStartPos; + delete emptyTextRank; } void CSA::makewavelet(uchar *bwt) @@ -886,7 +990,7 @@ void CSA::makewavelet(uchar *bwt) if (C[i]>0) {max = i; break;} // Print frequencies -/* for(i = 0; i < 256; i++) + /*for(i = 0; i < 256; i++) if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); fflush(stdout);*/ @@ -897,9 +1001,30 @@ void CSA::makewavelet(uchar *bwt) C[i]=C[i-1]+prev; prev = temp; } - this->codetable = node::makecodetable(bwt,n); - alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); - //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); +// this->codetable = node::makecodetable(bwt,n); +// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); +// delete [] bwt; + //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + + uint *text = new uint[n]; + for (i = 0; i < n; ++i) // Silly + text[i] = bwt[i]; + delete [] bwt; + bwt = 0; + + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? + + //cout << "Building Huffman table..."; cout.flush(); + + wt_coder * wtc = new wt_coder_huff(text,n,am); + + //cout << "done" << endl; cout.flush(); + //cout << "Building static_sequence..."; cout.flush(); + + alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + delete [] text; + text = 0; /* for (i = 0; i < n; ++i) { @@ -914,34 +1039,54 @@ void CSA::maketables() ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; ulong *sampledpositions = new ulong[n/W+1]; - suffixes = new ulong[sampleLength]; - positions = new ulong[sampleLength]; - + unsigned ceilLog2n = Tools::CeilLog2(n); + positions = new BlockArray(sampleLength, ceilLog2n); + BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); + + // Mapping from end-markers to doc ID's: + endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); + ulong i,j=0; for (i=0;iGetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0) { + if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { Tools::SetField(sampledpositions,1,p,1); - positions[x/samplerate] = p; + (*positions)[sampleCount] = p; + (*tmpSuffix)[sampleCount] = x; // FIXME remove + sampleCount ++; } - uchar c = alphabetrank->charAtPos(p); + uchar c = alphabetrank->access(p); if (c == '\0') { - // Record the order of end-markers in BWT: --textId; - endmarkers[p] = make_pair(textId, (TextPosition)x); + + // Record the order of end-markers in BWT: + ulong endmarkerRank = alphabetrank->rank(0, p) - 1; + (*endmarkerDocId)[endmarkerRank] = textId; + + // Store text length and text start position: + if (textId < (DocId)numberOfTexts - 1) + { + (*textLength)[textId + 1] = posOfSuccEndmarker - x; + (*textStartPos)[textId + 1] = x; // x is the position of end-marker. + posOfSuccEndmarker = x; + } + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). p = textId; // Correct LF-mapping to the last char of the previous text. } @@ -949,76 +1094,83 @@ void CSA::maketables() p = C[c]+alphabetrank->rank(c, p)-1; } assert(textId == 0); - /*for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it) + + /*i = 0; + for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i) { - printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second); + int docc = (*endmarkerDocId)[i]; + ulong poss = (*endmarkerPos)[i]; + printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss); }*/ +/* + for (i = 0; i < numberOfTexts; ++ i) + { + //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl; + assert(dynTextLength[i].first == (*textLength)[i]); + assert(dynTextLength[i].second == (*textStartPos)[i]); + } +*/ + sampled = new BSGAP(sampledpositions,n,true); + sampleLength = sampled->rank(n-1); + assert(sampleCount == sampleLength); +// std::cout << ";sampleLength;" << sampleLength << std::endl; + // Suffixes == offset from text start position + suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); + suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - sampled = new BitRank(sampledpositions,n,true); - //suffixes: for(i=0; irank(positions[i]); + assert((*positions)[i] < n); + j = sampled->rank((*positions)[i]); if (j==0) j=sampleLength; - suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate; - } -} + TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate; + (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]); -uchar * CSA::LoadFromFile(const char *filename) -{ - uchar *s; - std::ifstream file (filename, std::ios::in|std::ios::binary); - if (file.is_open()) - { - std::cerr << "Loading CSA from file: " << filename << std::endl; - file.read((char *)&bwtEndPos, sizeof(TextPosition)); - s = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - file.read((char *)(s + offset), sizeof(char)); - file.close(); - } - else - { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); + assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts); + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; } - return s; + // FIXME Temp, remove + delete tmpSuffix; + tmpSuffix=0; + delete positions; + positions =0; + delete textLength; + textLength = 0; + delete textStartPos; + textStartPos = 0; } -void CSA::SaveToFile(const char *filename, uchar *bwt) + +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const { - std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc); - if (file.is_open()) - { - std::cerr << "Writing CSA to file: " << filename << std::endl; - file.write((char *)&bwtEndPos, sizeof(TextPosition)); - std::cerr << "Writing BWT of " << n << " bytes." << std::endl; - for (ulong offset = 0; offset < n; offset ++) - file.write((char *)(bwt + offset), sizeof(char)); - file.close(); - } - else + assert(i < n); + + DocId a = 0; + DocId b = numberOfTexts - 1; + while (a < b) { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); + DocId c = a + (b - a)/2; + if ((*textStartPos)[c] > i) + b = c - 1; + else if ((*textStartPos)[c+1] > i) + return c; + else + a = c + 1; } -} -/*uchar * CSA::BWT(uchar *text) -{ - uchar *s; - - DynFMI *wt = new DynFMI((uchar *) text, n); - s = wt->getBWT(); - for (ulong i=0;i= (*textStartPos)[a]); + assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); + return a; +} CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n) {