X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.cpp;h=bdf97ef03ce785140bb3a356ca77af059ee75a3c;hb=580705e1bb680eba3cc20075f39365d10cc20012;hp=5719154cf87c8ccdc7b30440647cc4d809cdcab2;hpb=8bda359694b7233cd6154df9ac691eccaabfbb92;p=SXSI%2FTextCollection.git diff --git a/CSA.cpp b/CSA.cpp index 5719154..bdf97ef 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -35,6 +35,8 @@ using std::map; using SXSI::TextCollection; +// Save file version info +const uchar CSA::versionFlag = 2; //////////////////////////////////////////////////////////////////////////// // Class CSA::THuffAlphabetRank @@ -120,6 +122,24 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { delete bitrank; } +/** + * Saving data fields: + BitRank *bitrank; + bool leaf; + uchar ch; + left child; + right child; +*/ +void CSA::THuffAlphabetRank::Save(FILE *file) +{ + +} + +CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file) +{ + + +} /////////////////////////////////////////////f/////////////////////////////// // Class CSA @@ -129,9 +149,9 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ CSA::CSA(unsigned samplerate) - : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), - codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0), - textLength(0), textStartPos(0) + : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0), + codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + textLength(0), textStartPos(0), emptyTextRank(0) { this->samplerate = samplerate; @@ -164,20 +184,38 @@ void CSA::InsertText(uchar const * text) if (m > maxTextLength) maxTextLength = m; // Store length of the longest text seen so far. - this->n += m; - this->numberOfTexts ++; - dynFMI->addText(text, m); + if (m > 1) + { + this->n += m; + this->numberOfTexts ++; + this->numberOfAllTexts ++; + dynFMI->addText(text, m); + } + else + { + emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here + this->numberOfAllTexts ++; // Replace with dynamic bitvector + } } void CSA::MakeStatic() { // Sanity check: if (dynFMI == 0) + throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); + + // Bitvector of empty texts { - std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; - std::exit(0); + //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) + Tools::SetField(tempB, 1, (*it), 1); + emptyTextId.clear(); + emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); } - + uchar *bwt = dynFMI->getBWT(); /*printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n; i ++) @@ -199,24 +237,27 @@ void CSA::MakeStatic() delete dynFMI; dynFMI = 0; - makewavelet(bwt); + makewavelet(bwt); // Deletes bwt! + bwt = 0; // Calculate BWT end-marker position (of last inserted text) // and the length of the first text (counter l): ulong i = 0; ulong l = 1; - while (bwt[i] != '\0') + + uchar c = alphabetrank->access(i); + while (c != '\0') { - uchar c = bwt[i]; i = C[c]+alphabetrank->rank(c, i)-1; ++l; + c = alphabetrank->access(i); } bwtEndPos = i; + assert(bwtEndPos < n); //printf("bwtEndPos = %lu\n", bwtEndPos); - delete [] bwt; - // Build up arrays for text length and starting positions + // FIXME Remove, temp data textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); (*textLength)[0] = l; @@ -228,43 +269,73 @@ void CSA::MakeStatic() bool CSA::EmptyText(DocId k) const { - assert(k < (DocId)numberOfTexts); - return (1 == (*textLength)[k]); + assert(k < (DocId)numberOfTexts); + if (emptyTextRank->IsBitSet(k)) + return true; + return false; } uchar* CSA::GetText(DocId k) const { assert(k < (DocId)numberOfTexts); - uchar* result = new uchar[(*textLength)[k]]; + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + // Map to non-empty text + k -= textRank; //emptyTextRank->rank(k); + TextPosition i = k; - TextPosition j = (*textLength)[k] - 1; - result[j] = '\0'; - uchar c = alphabetrank->charAtPos(i); + string result; + // Reserve average string length to avoid reallocs + result.reserve(n/numberOfTexts); + + uchar c = alphabetrank->access(i); while (c != '\0') { - --j; - result[j] = c; + result.push_back(c); i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->charAtPos(i); // "next" char. + c = alphabetrank->access(i); // "next" char. } - assert(j == 0); - return result; + + // Convert to uchar (FIXME return string?) + i = result.size(); + uchar* res = new uchar[i+1]; + res[i] = '\0'; + for (ulong j = 0; j < i; ++j) + res[i-j-1] = result[j]; + return res; } - +/* + * Not supported uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const { assert(k < (DocId)numberOfTexts); assert(j < (*textLength)[k]); assert(i <= j); + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + + // Map to non-empty text + k -= textRank; // emptyTextRank->rank(k); + // Start position of k'th text ulong start = (*textStartPos)[k]; return Substring(i + start, j-i+1); -} + }*/ /****************************************************************** * Existential queries @@ -286,7 +357,7 @@ bool CSA::IsPrefix(uchar const * pattern) const bool CSA::IsSuffix(uchar const *pattern) const { - // Here counting is as fast as isSuffix(): + // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern) > 0) return true; return false; @@ -295,8 +366,12 @@ bool CSA::IsSuffix(uchar const *pattern) const bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); - //if (m == 0) - // return false; // FIXME Check for empty texts! + if (m == 0) + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } TextPosition sp = 0, ep = 0; // Match including end-marker @@ -337,7 +412,7 @@ unsigned CSA::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return numberOfTexts; + return numberOfAllTexts; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -350,7 +425,7 @@ unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return numberOfTexts; + return numberOfAllTexts; TextPosition sp = 0, ep = 0; // Search with end-marker @@ -362,8 +437,8 @@ unsigned CSA::CountSuffix(uchar const * pattern) const unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); -// if (m == 0) -// return ; // FIXME Test for empty texts. + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. TextPosition sp = 0, ep = 0; // Match including end-marker @@ -375,6 +450,10 @@ unsigned CSA::CountEqual(uchar const *pattern) const unsigned CSA::CountContains(uchar const * pattern) const { + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + // Here counting is as slow as fetching the result set // because we would have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); @@ -395,7 +474,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -407,6 +486,9 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const if (resultSize == 0) return result; + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: unsigned i = 0; if (sp > 0) i = alphabetrank->rank(0, sp - 1); @@ -414,7 +496,10 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { // End-marker we found belongs to the "preceeding" doc in the collection DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); + -- resultSize; ++ i; } @@ -426,37 +511,46 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; // Search with end-marker Search(pattern, m+1, &sp, &ep); TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); } + // Assert: c == '\0' OR sampled->IsBitSet(i) + if (c == '\0') { // Rank among the end-markers in BWT unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; // End-marker that we found belongs to the "preceeding" doc in collection: - DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); } else // Sampled position { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; - result.push_back(DocIdAtTextPos(textPos)); + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); } } @@ -467,7 +561,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all empty texts TextPosition sp = 0, ep = 0; // Match including end-marker @@ -480,14 +574,19 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const if (resultSize == 0) return result; + result.reserve(resultSize); // Try to avoid reallocation. + unsigned i = 0; if (sp > 0) i = alphabetrank->rank(0, sp - 1); while (resultSize) { - // End-marker we found belongs to the "previous" doc in the collection + // End-marker we found belongs to the "previous" doc in the collection DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); + -- resultSize; ++ i; } @@ -505,19 +604,20 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); - + // We want unique document indentifiers, using std::set to collect them std::set resultSet; + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + c = alphabetrank->access(i); } if (c == '\0') { @@ -530,15 +630,17 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const } else { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; - resultSet.insert(DocIdAtTextPos(textPos)); + DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); } } - // Convert std::set to std::vector (TODO better way to construct result vector?) - TextCollection::document_result result; - for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) - result.push_back(*it); + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); return result; } @@ -556,24 +658,26 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return full_result(); + return full_result(); // FIXME Throw exception? TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); ++ dist; } if (c == '\0') @@ -583,14 +687,19 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const // End-marker that we found belongs to the "preceeding" doc in collection: DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(make_pair(docId, dist)); } else { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist; - // Read document identifier and its starting position in text order - DocId docId = DocIdAtTextPos(textPos); - result.push_back(make_pair(docId, textPos - (*textStartPos)[docId])); + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(make_pair(docId, textPos)); } } @@ -598,106 +707,59 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const } -/** - * Finds document identifier for given text position - * - * Starting text position of the document is stored into second parameter. - * Binary searching on text starting positions. - */ -TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const -{ - assert(i < n); - - DocId a = 0; - DocId b = numberOfTexts - 1; - while (a < b) - { - DocId c = a + (b - a)/2; - if ((*textStartPos)[c] > i) - b = c - 1; - else if ((*textStartPos)[c+1] > i) - return c; - else - a = c + 1; - } - - assert(a < (DocId)numberOfTexts); - assert(i > (*textStartPos)[a]); - assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); - return a; -} - -/** - * Count end-markers in given interval - */ -unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const -{ - if (sp > ep) - return 0; - - ulong ranksp = 0; - if (sp != 0) - ranksp = alphabetrank->rank(0, sp - 1); - - return alphabetrank->rank(0, ep) - ranksp; -} - /** * Save index to a file handle * * Throws a std::runtime_error exception on i/o error. * First byte that is saved represents the version number of the save file. - * In version 1 files, the data fields are: - * version info; - * samplerate; - * length of the BWT; - * end marker position in BWT; - * BWT string of length n; - * number of texts; - * length of longest text; - * - * array of pairs. - * - * TODO: Save the data structures instead of BWT sequence? - * TODO: Don't save textLength and textStartPos arrays. + * In version 2 files, the data fields are: + * uchar versionFlag; + TextPosition n; + unsigned samplerate; + unsigned C[256]; + TextPosition bwtEndPos; + static_sequence * alphabetrank; + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + unsigned numberOfTexts; + unsigned numberOfAllTexts; + ulong maxTextLength; + BlockArray *endmarkerDocId; + BSGAP *emptyTextRank; */ void CSA::Save(FILE *file) const { - // Saving version 1 data: - uchar versionFlag = 1; + // Saving version info: if (std::fwrite(&versionFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (version flag)."); + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (n)."); if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (samplerate)."); - if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (C table)."); if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); - - for (ulong offset = 0; offset < n; offset ++) - { - uchar c = alphabetrank->charAtPos(offset); - if (std::fwrite(&c, sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (bwt sequence)."); - } - unsigned r = numberOfTexts; - if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (r)."); + alphabetrank->save(file); + sampled->Save(file); + suffixes->Save(file); + suffixDocId->Save(file); + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfTexts)."); + if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts)."); if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); - - for (r = 0; r < numberOfTexts; ++ r) - { - if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text length)."); - if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text start)."); - } + + endmarkerDocId->Save(file); + emptyTextRank->Save(file); } @@ -713,70 +775,61 @@ void CSA::Load(FILE *file, unsigned samplerate) delete dynFMI; dynFMI = 0; delete alphabetrank; alphabetrank = 0; delete sampled; sampled = 0; - delete [] suffixes; suffixes = 0; - delete [] positions; positions = 0; + delete suffixes; suffixes = 0; + delete suffixDocId; suffixDocId = 0; + delete positions; positions = 0; delete [] codetable; codetable = 0; delete endmarkerDocId; endmarkerDocId = 0; + delete emptyTextRank; emptyTextRank = 0; + // FIXME Remove following: delete textLength; textLength = 0; delete textStartPos; textStartPos = 0; this->maxTextLength = 0; this->numberOfTexts = 0; + this->numberOfAllTexts = 0; this->samplerate = samplerate; this->n = 0; - uchar versionFlag = 0; - if (std::fread(&versionFlag, 1, 1, file) != 1) + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (version flag)."); - if (versionFlag != 1) - throw std::runtime_error("CSA::Load(): invalid start byte."); + if (verFlag != CSA::versionFlag) + throw std::runtime_error("CSA::Load(): invalid save file version."); + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (n)."); if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (samplerate)."); - if (this->samplerate == 0) - this->samplerate = samplerate; +// FIXME samplerate can not be changed during load. +// if (this->samplerate == 0) +// this->samplerate = samplerate; - if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (C table)."); if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); - uchar *bwt = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (bwt sequence)."); - - unsigned r = 0; - if (std::fread(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (r)."); + alphabetrank = static_sequence::load(file); + sampled = new BSGAP(file); + suffixes = new BlockArray(file); + suffixDocId = new BlockArray(file); + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfTexts)."); + if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts)."); if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); - - // Build up arrays for text length and starting positions - textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength)); - textStartPos = new BlockArray(r, Tools::CeilLog2(this->n)); - while (r > 0) - { - TextPosition length = 0, start = 0; - if (std::fread(&length, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text length)."); - if (std::fread(&start, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text start)."); - - (*textLength)[numberOfTexts] = length; - (*textStartPos)[numberOfTexts] = start; - ++numberOfTexts; - --r; - } + endmarkerDocId = new BlockArray(file); + emptyTextRank = new BSGAP(file); - // Construct data structures - makewavelet(bwt); - delete [] bwt; - maketables(); // FIXME: this will redo text length tables + // FIXME Construct data structures with new samplerate + //maketables(); // FIXME: this will redo text length tables } @@ -785,6 +838,8 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ +/* + * Not supported uchar * CSA::Substring(TextPosition i, TextPosition l) const { uchar *result = new uchar[l + 1]; @@ -818,7 +873,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const for (dist = 0; dist < skip + l; dist++) { - int c = alphabetrank->charAtPos(j); + int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp); if (c == '\0') { // Rank among the end-markers in BWT @@ -833,7 +888,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const result[l] = 0u; return result; } - +*/ /*TextPosition CSA::inverse(TextPosition i) { TextPosition skip = samplerate - i % samplerate; @@ -858,7 +913,8 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const return j; }*/ -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; TextPosition i=m-1; @@ -877,7 +933,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return ep - sp + 1; else return 0; - } +} /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) { @@ -890,12 +946,13 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; }*/ -CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +CSA::TextPosition CSA::Lookup(TextPosition i) const { TextPosition dist=0; while (!sampled->IsBitSet(i)) { - int c = alphabetrank->charAtPos(i); + + int c = alphabetrank->access(i); if (c == '\0') { // End-markers are sampled @@ -903,7 +960,7 @@ CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(sample DocId docId = (*endmarkerDocId)[endmarkerRank]; return (*textStartPos)[docId] + dist; } - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + i = C[c]+alphabetrank->rank(c,i)-1; ++dist; } @@ -915,12 +972,14 @@ CSA::~CSA() { delete alphabetrank; delete sampled; delete suffixes; + delete suffixDocId; delete positions; delete [] codetable; delete endmarkerDocId; delete textLength; delete textStartPos; + delete emptyTextRank; } void CSA::makewavelet(uchar *bwt) @@ -948,9 +1007,30 @@ void CSA::makewavelet(uchar *bwt) C[i]=C[i-1]+prev; prev = temp; } - this->codetable = node::makecodetable(bwt,n); - alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); - //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); +// this->codetable = node::makecodetable(bwt,n); +// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); +// delete [] bwt; + //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + + uint *text = new uint[n]; + for (i = 0; i < n; ++i) // Silly + text[i] = bwt[i]; + delete [] bwt; + bwt = 0; + + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate? + + //cout << "Building Huffman table..."; cout.flush(); + + wt_coder * wtc = new wt_coder_huff(text,n,am); + + //cout << "done" << endl; cout.flush(); + //cout << "Building static_sequence..."; cout.flush(); + + alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + delete [] text; + text = 0; /* for (i = 0; i < n; ++i) { @@ -966,9 +1046,9 @@ void CSA::maketables() ulong *sampledpositions = new ulong[n/W+1]; unsigned ceilLog2n = Tools::CeilLog2(n); - suffixes = new BlockArray(sampleLength, ceilLog2n); positions = new BlockArray(sampleLength, ceilLog2n); - + BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); + // Mapping from end-markers to doc ID's: endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); @@ -977,6 +1057,7 @@ void CSA::maketables() sampledpositions[i]=0lu; ulong x,p=bwtEndPos; + ulong sampleCount = 0; // Keeping track of text position of end-markers seen ulong posOfSuccEndmarker = n; DocId textId = numberOfTexts; @@ -988,12 +1069,14 @@ void CSA::maketables() // i substitutes SA->GetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0) { + if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { Tools::SetField(sampledpositions,1,p,1); - (*positions)[x/samplerate] = p; + (*positions)[sampleCount] = p; + (*tmpSuffix)[sampleCount] = x; // FIXME remove + sampleCount ++; } - uchar c = alphabetrank->charAtPos(p); + uchar c = alphabetrank->access(p); if (c == '\0') { --textId; @@ -1033,14 +1116,66 @@ void CSA::maketables() assert(dynTextLength[i].second == (*textStartPos)[i]); } */ - sampled = new BitRank(sampledpositions,n,true); - + sampled = new BSGAP(sampledpositions,n,true); + sampleLength = sampled->rank(n-1); + assert(sampleCount == sampleLength); +// std::cout << ";sampleLength;" << sampleLength << std::endl; + // Suffixes == offset from text start position + suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); + suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); + //suffixes: for(i=0; irank((*positions)[i]); if (j==0) j=sampleLength; - (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate; - } + TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate; + (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]); + + assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts); + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; + } + // FIXME Temp, remove + delete tmpSuffix; + tmpSuffix=0; + delete positions; + positions =0; + delete textLength; + textLength = 0; + delete textStartPos; + textStartPos = 0; +} + + +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +{ + assert(i < n); + + DocId a = 0; + DocId b = numberOfTexts - 1; + while (a < b) + { + DocId c = a + (b - a)/2; + if ((*textStartPos)[c] > i) + b = c - 1; + else if ((*textStartPos)[c+1] > i) + return c; + else + a = c + 1; + } + + assert(a < (DocId)numberOfTexts); + assert(i >= (*textStartPos)[a]); + assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); + return a; } CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)