X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.cpp;h=04ace79dd25e8b21f737198b0b7c892af887db1a;hb=368c3edc500ee74859732705e46b7c346e8a6d65;hp=a15c733abe1ae300225ea9df20a714cf99d1ad80;hpb=7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31;p=SXSI%2FTextCollection.git diff --git a/CSA.cpp b/CSA.cpp index a15c733..04ace79 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include // For strlen() using std::vector; @@ -34,10 +35,12 @@ using std::map; using SXSI::TextCollection; +// Save file version info +const uchar CSA::versionFlag = 2; //////////////////////////////////////////////////////////////////////////// // Class CSA::THuffAlphabetRank - +// FIXME Unused code CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) { left = NULL; right = NULL; @@ -128,8 +131,9 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ CSA::CSA(unsigned samplerate) - : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), - codetable(0), dynFMI(0) + : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0), + codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + textLength(0), textStartPos(0), emptyTextRank(0) { this->samplerate = samplerate; @@ -140,6 +144,11 @@ CSA::CSA(unsigned samplerate) temp[i] = i+1; temp[255] = 0; dynFMI = new DynFMI(temp, 1, 255, false); + + /* Debug code: take char distribution from data.txt. + uchar *temp = Tools::GetFileContents("data.txt", 0); + dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false); + delete [] temp;*/ } /** @@ -151,33 +160,53 @@ CSA::CSA(unsigned samplerate) void CSA::InsertText(uchar const * text) { // Sanity check: - assert(dynFMI != 0); + assert(dynFMI != 0); TextPosition m = std::strlen((char *)text) + 1; - // Store text length and starting position - textLength.push_back(make_pair(m, n)); - this->n += m; - dynFMI->addText(text, m); + if (m > maxTextLength) + maxTextLength = m; // Store length of the longest text seen so far. + + if (m > 1) + { + this->n += m; + this->numberOfTexts ++; + this->numberOfAllTexts ++; + dynFMI->addText(text, m); + } + else + { + emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here + this->numberOfAllTexts ++; // Replace with dynamic bitvector + } } void CSA::MakeStatic() { // Sanity check: if (dynFMI == 0) + throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); + + // Bitvector of empty texts { - std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; - std::exit(0); + //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) + Tools::SetField(tempB, 1, (*it), 1); + emptyTextId.clear(); + emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); } - + uchar *bwt = dynFMI->getBWT(); - /*printf("1234567890123456789\n"); + /*printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n; i ++) if (bwt[i] < 50) printf("%d", (int)bwt[i]); else printf("%c", bwt[i]); - printf("\n"); - */ + printf("\n");*/ + /* for (TextPosition i = 1; i <= n; i ++) { @@ -185,123 +214,132 @@ void CSA::MakeStatic() }*/ // Sanity check - assert(textLength.size() == dynFMI->getCollectionSize()); + assert(numberOfTexts == dynFMI->getCollectionSize()); delete dynFMI; dynFMI = 0; - ulong i, min = 0, - max; - for (i=0;i<256;i++) - C[i]=0; - for (i=0;i0) {min = i; break;} - for (i=255;i>=min;--i) - if (C[i]>0) {max = i; break;} - - // Print frequencies -/* for(i = 0; i < 256; i++) - if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); - fflush(stdout);*/ - - ulong prev=C[0], temp; - C[0]=0; - for (i=1;i<256;i++) { - temp = C[i]; - C[i]=C[i-1]+prev; - prev = temp; - } - this->codetable = node::makecodetable(bwt,n); - alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); - //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); - -/* for (i = 0; i < n; ++i) - { - uchar c = alphabetrank->charAtPos(i); - TextPosition j = C[c]+alphabetrank->rank(c, i)-1; - printf("LF[%lu] = %lu\n", i, j); - }*/ + makewavelet(bwt); // Deletes bwt! + bwt = 0; // Calculate BWT end-marker position (of last inserted text) - i = 0; - while (bwt[i] != '\0') + // and the length of the first text (counter l): + ulong i = 0; + ulong l = 1; + + uchar c = alphabetrank->access(i); + while (c != '\0') { - uchar c = alphabetrank->charAtPos(i); i = C[c]+alphabetrank->rank(c, i)-1; + ++l; + c = alphabetrank->access(i); } bwtEndPos = i; + assert(bwtEndPos < n); //printf("bwtEndPos = %lu\n", bwtEndPos); - - delete [] bwt; + // Build up arrays for text length and starting positions + // FIXME Remove, temp data + textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); + textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); + (*textLength)[0] = l; + (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables(). + // Make sampling tables maketables(); - // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small - // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+... - //printf("FMindex takes %d B\n", - // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8); } +bool CSA::EmptyText(DocId k) const +{ + assert(k < (DocId)numberOfTexts); + if (emptyTextRank->IsBitSet(k)) + return true; + return false; +} uchar* CSA::GetText(DocId k) const { - assert(k < (DocId)textLength.size()); + assert(k < (DocId)numberOfTexts); - uchar* result = new uchar[textLength[k].first]; + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + // Map to non-empty text + k -= textRank; //emptyTextRank->rank(k); + TextPosition i = k; - TextPosition j = textLength[k].first-1; - result[j] = '\0'; - uchar c = alphabetrank->charAtPos(i); + string result; + // Reserve average string length to avoid reallocs + result.reserve(n/numberOfTexts); + + uchar c = alphabetrank->access(i); while (c != '\0') { - --j; - result[j] = c; + result.push_back(c); i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->charAtPos(i); // "next" char. + c = alphabetrank->access(i); // "next" char. } - assert(j == 0); - return result; + + // Convert to uchar (FIXME return string?) + i = result.size(); + uchar* res = new uchar[i+1]; + res[i] = '\0'; + for (ulong j = 0; j < i; ++j) + res[i-j-1] = result[j]; + return res; } - +/* + * Not supported uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const { - assert(k < (DocId)textLength.size()); - assert(j < textLength[k].first); + assert(k < (DocId)numberOfTexts); + assert(j < (*textLength)[k]); assert(i <= j); + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + + // Map to non-empty text + k -= textRank; // emptyTextRank->rank(k); + // Start position of k'th text - ulong start = textLength[k].second; + ulong start = (*textStartPos)[k]; return Substring(i + start, j-i+1); -} + }*/ -/** +/****************************************************************** * Existential queries */ bool CSA::IsPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return true; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } bool CSA::IsSuffix(uchar const *pattern) const { - // Here counting is as fast as isSuffix(): + // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern) > 0) return true; return false; @@ -311,16 +349,18 @@ bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); if (m == 0) - return textLength.size(); + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Check for end-marker(s) in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - if (it != endmarkers.end() && it->first <= ep) + if (CountEndmarkers(sp, ep)) return true; return false; } @@ -343,39 +383,42 @@ bool CSA::IsContains(uchar const * pattern) const bool CSA::IsLessThan(uchar const*) const { // TODO + assert(0); return false; } -/** +/****************************************************************** * Counting queries */ +ulong CSA::Count(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return 0; + + TextPosition sp = 0, ep = 0; + unsigned count = (unsigned) Search(pattern, m, &sp, &ep); + return count; +} + unsigned CSA::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts; TextPosition sp = 0, ep = 0; // Search with end-marker @@ -388,27 +431,22 @@ unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return textLength.size(); + return numberOfAllTexts - numberOfTexts; // Empty texts. TextPosition sp = 0, ep = 0; // Match including end-marker Search(pattern, m+1, &sp, &ep); // Count end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) - { - ++ count; - ++ it; - } - - return count; + return CountEndmarkers(sp, ep); } unsigned CSA::CountContains(uchar const * pattern) const { + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + // Here counting is as slow as fetching the result set // because we would have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); @@ -418,6 +456,7 @@ unsigned CSA::CountContains(uchar const * pattern) const unsigned CSA::CountLessThan(uchar const * pattern) const { // TODO + assert(0); return 0; } @@ -428,7 +467,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -436,18 +475,28 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker we found belongs to the "previous" doc in the collection - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "preceeding" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); - ++ count; - ++ it; + + -- resultSize; + ++ i; } - + return result; } @@ -455,36 +504,46 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; // Search with end-marker Search(pattern, m+1, &sp, &ep); TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); } + // Assert: c == '\0' OR sampled->IsBitSet(i) + if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); } - else + else // Sampled position { - TextPosition textPos = suffixes[sampled->rank(i)-1]; - result.push_back(DocIdAtTextPos(textPos)); + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); } } @@ -495,7 +554,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all empty texts TextPosition sp = 0, ep = 0; // Match including end-marker @@ -504,18 +563,27 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const TextCollection::document_result result; // Report end-markers in result interval - map >::const_iterator it; - it = endmarkers.lower_bound(sp); - unsigned count = 0; - while (it != endmarkers.end() && it->first <= ep) + unsigned resultSize = CountEndmarkers(sp, ep); + if (resultSize == 0) + return result; + + result.reserve(resultSize); // Try to avoid reallocation. + + unsigned i = 0; + if (sp > 0) + i = alphabetrank->rank(0, sp - 1); + while (resultSize) { - // End-marker that we found belongs to the "previous" doc in collection: - DocId docId = ((it->second).first + 1) % textLength.size(); + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); - ++ count; - ++ it; + + -- resultSize; + ++ i; } - + return result; } @@ -529,45 +597,50 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); - + // We want unique document indentifiers, using std::set to collect them std::set resultSet; + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + c = alphabetrank->access(i); } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in collection - DocId docId = (endm.first + 1) % textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; resultSet.insert(docId); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1]; - resultSet.insert(DocIdAtTextPos(textPos)); + DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); } } - // Convert std::set to std::vector (TODO better way to construct result vector?) - TextCollection::document_result result; - for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) - result.push_back(*it); + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); return result; } TextCollection::document_result CSA::LessThan(uchar const * pattern) const { // TODO + assert(0); return document_result(); } @@ -578,40 +651,48 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return full_result(); + return full_result(); // FIXME Throw exception? TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); ++ dist; } if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - // End-marker we found belongs to the "previous" doc in the collection: - DocId docId = (endm.first+1)%textLength.size(); + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + + // End-marker that we found belongs to the "preceeding" doc in collection: + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(make_pair(docId, dist)); } else { - TextPosition textPos = suffixes[sampled->rank(i)-1] + dist; - // Read document identifier and its starting position in text order - DocId docId = DocIdAtTextPos(textPos); - result.push_back(make_pair(docId, textPos - textLength[docId].second)); + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(make_pair(docId, textPos)); } } @@ -620,50 +701,139 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const /** - * Finds document identifier for given text position + * Save index to a file handle * - * Starting text position of the document is stored into second parameter. - * Binary searching on text starting positions. + * Throws a std::runtime_error exception on i/o error. + * First byte that is saved represents the version number of the save file. + * In version 2 files, the data fields are: + * uchar versionFlag; + TextPosition n; + unsigned samplerate; + unsigned C[256]; + TextPosition bwtEndPos; + static_sequence * alphabetrank; + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + unsigned numberOfTexts; + unsigned numberOfAllTexts; + ulong maxTextLength; + BlockArray *endmarkerDocId; + BSGAP *emptyTextRank; */ -TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +void CSA::Save(FILE *file) const { - assert(i < n); - - DocId a = 0; - DocId b = textLength.size() - 1; - while (a < b) - { - DocId c = a + (b - a)/2; - if (textLength[c].second > i) - b = c - 1; - else if (textLength[c+1].second > i) - return c; - else - a = c + 1; - } + // Saving version info: + if (std::fwrite(&versionFlag, 1, 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (version flag)."); - assert(a < (DocId)textLength.size()); - assert(i > textLength[a].second); - assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second)); - return a; -} + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (n)."); + if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (samplerate)."); -void CSA::Load(FILE *filename, unsigned samplerate) -{ - // TODO + for(ulong i = 0; i < 256; ++i) + if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (C table)."); + if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); + + alphabetrank->save(file); + sampled->Save(file); + suffixes->Save(file); + suffixDocId->Save(file); + + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfTexts)."); + if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts)."); + if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); + + endmarkerDocId->Save(file); + emptyTextRank->Save(file); + fflush(file); } -void CSA::Save(FILE *filename) + +/** + * Load index from a file handle + * + * Throws a std::runtime_error exception on i/o error. + * For more info, see CSA::Save(). + */ +void CSA::Load(FILE *file, unsigned samplerate) { - // TODO + // Delete everything + delete dynFMI; dynFMI = 0; + delete alphabetrank; alphabetrank = 0; + delete sampled; sampled = 0; + delete suffixes; suffixes = 0; + delete suffixDocId; suffixDocId = 0; + delete positions; positions = 0; + delete [] codetable; codetable = 0; + + delete endmarkerDocId; endmarkerDocId = 0; + delete emptyTextRank; emptyTextRank = 0; + // FIXME Remove following: + delete textLength; textLength = 0; + delete textStartPos; textStartPos = 0; + + this->maxTextLength = 0; + this->numberOfTexts = 0; + this->numberOfAllTexts = 0; + this->samplerate = samplerate; + this->n = 0; + + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (version flag)."); + if (verFlag != CSA::versionFlag) + throw std::runtime_error("CSA::Load(): invalid save file version."); + + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (n)."); + if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (samplerate)."); +// FIXME samplerate can not be changed during load. +// if (this->samplerate == 0) +// this->samplerate = samplerate; + + for(ulong i = 0; i < 256; ++i) + if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (C table)."); + + if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); + + alphabetrank = static_sequence::load(file); + sampled = new BSGAP(file); + suffixes = new BlockArray(file); + suffixDocId = new BlockArray(file); + + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfTexts)."); + if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts)."); + if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); + + endmarkerDocId = new BlockArray(file); + emptyTextRank = new BSGAP(file); + + // FIXME Construct data structures with new samplerate + //maketables(); // FIXME: this will redo text length tables } + /** * Rest of the functions follow... */ +/* + * Not supported uchar * CSA::Substring(TextPosition i, TextPosition l) const { uchar *result = new uchar[l + 1]; @@ -691,18 +861,18 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[k/samplerate+1]; + j = (*positions)[k/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } for (dist = 0; dist < skip + l; dist++) { - int c = alphabetrank->charAtPos(j); + int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(j))->second; - j = endm.first; // LF-mapping for end-marker + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, j) - 1; + j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker } else j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping @@ -712,7 +882,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const result[l] = 0u; return result; } - +*/ /*TextPosition CSA::inverse(TextPosition i) { TextPosition skip = samplerate - i % samplerate; @@ -724,7 +894,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const } else { - j = positions[i/samplerate+1]; + j = (*positions)[i/samplerate+1]; //cout << samplerate << ' ' << j << '\n'; } @@ -737,7 +907,8 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const return j; }*/ -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; TextPosition i=m-1; @@ -756,7 +927,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return ep - sp + 1; else return 0; - } +} /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) { @@ -769,67 +940,153 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; }*/ -CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +CSA::TextPosition CSA::Lookup(TextPosition i) const { TextPosition dist=0; while (!sampled->IsBitSet(i)) { - int c = alphabetrank->charAtPos(i); + + int c = alphabetrank->access(i); if (c == '\0') { - // map::operator[] is not const, using find() instead: - pair endm = (endmarkers.find(i))->second; - return endm.second + dist; // returns text position. + // End-markers are sampled + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + DocId docId = (*endmarkerDocId)[endmarkerRank]; + return (*textStartPos)[docId] + dist; } - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + i = C[c]+alphabetrank->rank(c,i)-1; ++dist; } - return suffixes[sampled->rank(i)-1]+dist; + return (*suffixes)[sampled->rank(i)-1]+dist; } CSA::~CSA() { + delete dynFMI; delete alphabetrank; delete sampled; - delete [] suffixes; - delete [] positions; + delete suffixes; + delete suffixDocId; + delete positions; delete [] codetable; + + delete endmarkerDocId; + delete textLength; + delete textStartPos; + delete emptyTextRank; } +void CSA::makewavelet(uchar *bwt) +{ + ulong i, min = 0, + max; + for (i=0;i<256;i++) + C[i]=0; + for (i=0;i0) {min = i; break;} + for (i=255;i>=min;--i) + if (C[i]>0) {max = i; break;} + + // Print frequencies + /*for(i = 0; i < 256; i++) + if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); + fflush(stdout);*/ + + ulong prev=C[0], temp; + C[0]=0; + for (i=1;i<256;i++) { + temp = C[i]; + C[i]=C[i-1]+prev; + prev = temp; + } +// this->codetable = node::makecodetable(bwt,n); +// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); +// delete [] bwt; + //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + + uint *text = new uint[n]; + for (i = 0; i < n; ++i) // Silly + text[i] = bwt[i]; + delete [] bwt; + bwt = 0; + + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? + + //cout << "Building Huffman table..."; cout.flush(); + + wt_coder * wtc = new wt_coder_huff(text,n,am); + + //cout << "done" << endl; cout.flush(); + //cout << "Building static_sequence..."; cout.flush(); + + alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + delete [] text; + text = 0; + +/* for (i = 0; i < n; ++i) + { + uchar c = alphabetrank->charAtPos(i); + TextPosition j = C[c]+alphabetrank->rank(c, i)-1; + printf("LF[%lu] = %lu\n", i, j); + }*/ +} void CSA::maketables() { ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; ulong *sampledpositions = new ulong[n/W+1]; - suffixes = new ulong[sampleLength]; - positions = new ulong[sampleLength]; - + unsigned ceilLog2n = Tools::CeilLog2(n); + positions = new BlockArray(sampleLength, ceilLog2n); + BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); + + // Mapping from end-markers to doc ID's: + endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); + ulong i,j=0; for (i=0;iGetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0) { + if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { Tools::SetField(sampledpositions,1,p,1); - positions[x/samplerate] = p; + (*positions)[sampleCount] = p; + (*tmpSuffix)[sampleCount] = x; // FIXME remove + sampleCount ++; } - uchar c = alphabetrank->charAtPos(p); + uchar c = alphabetrank->access(p); if (c == '\0') { - // Record the order of end-markers in BWT: --textId; - endmarkers[p] = make_pair(textId, (TextPosition)x); + + // Record the order of end-markers in BWT: + ulong endmarkerRank = alphabetrank->rank(0, p) - 1; + (*endmarkerDocId)[endmarkerRank] = textId; + + // Store text length and text start position: + if (textId < (DocId)numberOfTexts - 1) + { + (*textLength)[textId + 1] = posOfSuccEndmarker - x; + (*textStartPos)[textId + 1] = x; // x is the position of end-marker. + posOfSuccEndmarker = x; + } + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). p = textId; // Correct LF-mapping to the last char of the previous text. } @@ -837,76 +1094,83 @@ void CSA::maketables() p = C[c]+alphabetrank->rank(c, p)-1; } assert(textId == 0); - /*for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it) + + /*i = 0; + for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i) { - printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second); + int docc = (*endmarkerDocId)[i]; + ulong poss = (*endmarkerPos)[i]; + printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss); }*/ +/* + for (i = 0; i < numberOfTexts; ++ i) + { + //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl; + assert(dynTextLength[i].first == (*textLength)[i]); + assert(dynTextLength[i].second == (*textStartPos)[i]); + } +*/ + sampled = new BSGAP(sampledpositions,n,true); + sampleLength = sampled->rank(n-1); + assert(sampleCount == sampleLength); +// std::cout << ";sampleLength;" << sampleLength << std::endl; + // Suffixes == offset from text start position + suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); + suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - sampled = new BitRank(sampledpositions,n,true); - //suffixes: for(i=0; irank(positions[i]); + assert((*positions)[i] < n); + j = sampled->rank((*positions)[i]); if (j==0) j=sampleLength; - suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate; - } -} + TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate; + (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]); -uchar * CSA::LoadFromFile(const char *filename) -{ - uchar *s; - std::ifstream file (filename, std::ios::in|std::ios::binary); - if (file.is_open()) - { - std::cerr << "Loading CSA from file: " << filename << std::endl; - file.read((char *)&bwtEndPos, sizeof(TextPosition)); - s = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - file.read((char *)(s + offset), sizeof(char)); - file.close(); + assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts); + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; } - else - { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); - } - return s; + // FIXME Temp, remove + delete tmpSuffix; + tmpSuffix=0; + delete positions; + positions =0; + delete textLength; + textLength = 0; + delete textStartPos; + textStartPos = 0; } -void CSA::SaveToFile(const char *filename, uchar *bwt) -{ - std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc); - if (file.is_open()) - { - std::cerr << "Writing CSA to file: " << filename << std::endl; - file.write((char *)&bwtEndPos, sizeof(TextPosition)); - std::cerr << "Writing BWT of " << n << " bytes." << std::endl; - for (ulong offset = 0; offset < n; offset ++) - file.write((char *)(bwt + offset), sizeof(char)); - file.close(); - } - else - { - std::cerr << "Unable to open file " << filename << std::endl; - exit(1); - } -} -/*uchar * CSA::BWT(uchar *text) +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const { - uchar *s; + assert(i < n); - DynFMI *wt = new DynFMI((uchar *) text, n); - s = wt->getBWT(); - for (ulong i=0;i i) + b = c - 1; + else if ((*textStartPos)[c+1] > i) + return c; + else + a = c + 1; + } - delete wt; - return s; - }*/ + assert(a < (DocId)numberOfTexts); + assert(i >= (*textStartPos)[a]); + assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); + return a; +} CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n) {