From: nvalimak Date: Wed, 27 Oct 2010 13:36:59 +0000 (+0000) Subject: Added SWCSA X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;ds=sidebyside;h=89dc22aee980ba16f757cd9a7f77478c2da50051;hp=443151511a86083b21c1c06eb610f86b3aed35be;p=SXSI%2FTextCollection.git Added SWCSA git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@925 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/FMIndex.cpp b/FMIndex.cpp new file mode 100644 index 0000000..297604b --- /dev/null +++ b/FMIndex.cpp @@ -0,0 +1,1309 @@ +/****************************************************************************** + * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * + * * + * FMIndex implementation for the TextCollection interface * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + *****************************************************************************/ +#include "FMIndex.h" + +//#define DEBUG_MEMUSAGE +#ifdef DEBUG_MEMUSAGE +#include "HeapProfiler.h" // FIXME remove +#endif + +#include +#include +#include +#include +#include +#include +#include +#include // For strlen() +using std::vector; +using std::pair; +using std::make_pair; +using std::map; +using std::string; + +namespace SXSI +{ + +// Save file version info +const uchar FMIndex::versionFlag = 9; + +/** + * Constructor inits an empty dynamic FM-index. + * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. + */ +FMIndex::FMIndex(uchar * bwt, ulong length, unsigned samplerate_, + unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, + CSA::DeltaVector & notIndexed, const string & niText, char tsType) + : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), + suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) +{ + makewavelet(bwt); // Deletes bwt! + bwt = 0; + + // Make sampling tables + maketables(numberOfSamples_, tsType, notIndexed, niText); +} + +bool FMIndex::EmptyText(DocId k) const +{ + assert(k < (DocId)numberOfTexts); + return false; // Empty texts are not indexed +} + +uchar * FMIndex::GetText(DocId k) const +{ + assert(k < (DocId)numberOfTexts); + + return textStorage->GetText(k); +/* TextPosition i = k; + + string result; + // Reserve average string length to avoid reallocs + result.reserve(n/numberOfTexts); + + uchar c = alphabetrank->access(i); + while (c != '\0') + { + result.push_back(c); + i = C[c]+alphabetrank->rank(c,i)-1; + + c = alphabetrank->access(i); // "next" char. + } + + // Convert to uchar (FIXME return string?) 
+ i = result.size(); + uchar* res = new uchar[i+1]; + res[i] = '\0'; + for (ulong j = 0; j < i; ++j) + res[i-j-1] = result[j]; + return res;*/ +} + +/* + * Substring queries are supported via the pointer returned by TextStorage::GetText +uchar* FMIndex::GetText(DocId k, TextPosition i, TextPosition j) const +{ + assert(k < (DocId)numberOfTexts); + assert(j < (*textLength)[k]); + assert(i <= j); + + ulong textRank = 0; + + // Start position of k'th text + ulong start = (*textStartPos)[k]; + + return Substring(i + start, j-i+1); + }*/ + + + +/****************************************************************** + * Existential queries + */ +bool FMIndex::IsPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep)) + return true; + return false; +} + +bool FMIndex::IsPrefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep, begin, end)) + return true; + return false; +} + + +bool FMIndex::IsSuffix(uchar const *pattern) const +{ + // Here counting is as fast as IsSuffix(): + if (CountSuffix(pattern) > 0) + return true; + return false; +} + +bool FMIndex::IsSuffix(uchar const *pattern, DocId begin, DocId end) const +{ + // Here counting is as fast as IsSuffix(): + if (CountSuffix(pattern, begin, end) > 0) + return true; + return false; +} + +bool FMIndex::IsEqual(uchar const *pattern) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + return false; // No empty texts exists + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Check for end-marker(s) in result interval + if 
(CountEndmarkers(sp, ep)) + return true; + return false; +} + +bool FMIndex::IsEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + return false; // No empty texts exists + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Check for end-marker(s) in result interval + if (CountEndmarkers(sp, ep)) + return true; + return false; +} + +bool FMIndex::IsContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + // Just check if pattern exists somewhere + ulong count = Search(pattern, m, &sp, &ep); + + if (count > 0) + return true; + return false; +} + +bool FMIndex::IsContains(uchar const * pattern, DocId begin, DocId end) const +{ + // Here counting is as fast as existential querying + if (CountContains(pattern, begin, end) > 0) + return true; // FIXME No need to filter result set + return false; +} + +bool FMIndex::IsLessThan(uchar const * pattern) const +{ + if (CountLessThan(pattern) > 0) + return true; + return false; +} + +bool FMIndex::IsLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + if (CountLessThan(pattern, begin, end) > 0) + return true; + return false; +} + +/****************************************************************** + * Counting queries + */ +ulong FMIndex::Count(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return 0; + + TextPosition sp = 0, ep = 0; + unsigned count = (unsigned) Search(pattern, m, &sp, &ep); + return count; +} + +unsigned FMIndex::CountPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfTexts; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); +} + +unsigned FMIndex::CountPrefix(uchar const * 
pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfTexts; + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); +} + +unsigned FMIndex::CountSuffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfTexts; + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep); + + return count; +} + +unsigned FMIndex::CountSuffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return numberOfTexts; + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end); + + return count; +} + +unsigned FMIndex::CountEqual(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return 0; // No empty texts. + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); +} + +unsigned FMIndex::CountEqual(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return 0; // No empty texts. + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); // Already within [begin, end] +} + +unsigned FMIndex::CountContains(uchar const * pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfTexts; // Total number of texts. + + // Here counting is as slow as fetching the result set + // because we have to filter out occ's that fall within same document. 
+ TextCollection::document_result result = Contains(pattern); + return result.size(); +} + +unsigned FMIndex::CountContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfTexts; // Total number of texts. + + // Here counting is as slow as fetching the result set + // because we have to filter out occ's that fall within same document. + TextCollection::document_result result = Contains(pattern, begin, end); + return result.size(); +} + +// Less than or equal +unsigned FMIndex::CountLessThan(uchar const * pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return 0; // No empty texts. + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep); +} + +unsigned FMIndex::CountLessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return 0; // No empty texts. 
+ + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Count end-markers in result interval + return CountEndmarkers(sp, ep, begin, end); +} + +/** + * Document reporting queries + */ +TextCollection::document_result FMIndex::Prefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Iterate through end-markers in [sp,ep]: + return EnumerateEndmarkers(sp, ep); +} + +TextCollection::document_result FMIndex::Prefix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Return end-markers in [sp,ep] and [begin, end]: + return EnumerateEndmarkers(sp, ep, begin, end); +} + +TextCollection::document_result FMIndex::Suffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep); + + TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. 
+ + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + } + // Assert: c == '\0' OR sampled->IsBitSet(i) + + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + result.push_back(Doc->access(endmarkerRank)); + } + else // Sampled position + { + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + result.push_back(docId); + } + } + + return result; +} + +TextCollection::document_result FMIndex::Suffix(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all 1...k + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. 
+ + // Check each occurrence, already within [begin, end] + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); + } + // Assert: c == '\0' OR sampled->IsBitSet(i) + + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; + result.push_back(Doc->access(endmarkerRank)); + } + else // Sampled position + { + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + result.push_back(docId); + } + } + + return result; +} + + +TextCollection::document_result FMIndex::Equal(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all empty texts + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Report end-markers in result interval + return EnumerateEndmarkers(sp, ep); +} + +TextCollection::document_result FMIndex::Equal(uchar const *pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); // FIXME Should return all empty texts + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep, begin, end); + + // Report end-markers in result interval + return EnumerateEndmarkers(sp, ep, begin, end); +} + + +TextCollection::document_result FMIndex::Contains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + EnumerateDocuments(resultSet, sp, ep); + + // Convert std::set to std::vector + TextCollection::document_result 
result(resultSet.begin(), resultSet.end()); + return result; +} + +TextCollection::document_result FMIndex::Contains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + EnumerateDocuments(resultSet, sp, ep, begin, end); + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + return result; +} + + +/** + *** +* * FIXME Lessthan or equal + */ +TextCollection::document_result FMIndex::LessThan(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Report end-markers in result interval + return EnumerateEndmarkers(sp, ep); +} + +TextCollection::document_result FMIndex::LessThan(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + TextPosition sp = 0, ep = 0; + SearchLessThan(pattern, m, &sp, &ep); + + // Iterate through end-markers in [sp,ep] and [begin, end]: + return EnumerateEndmarkers(sp, ep, begin, end); +} + + +TextCollection::document_result FMIndex::KMismaches(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + suffix_range_vector ranges; + kmismatches(ranges, pattern, 0, n-1, m, k); + std::set resultSet; + + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumerateDocuments(resultSet, (*it).first, (*it).second); + 
+ // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + return result; +} + +TextCollection::document_result FMIndex::KErrors(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); // empty result set + + suffix_range_vector ranges; + ulong *dd = new ulong[m+1]; + for (ulong i=0;i resultSet; + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumerateDocuments(resultSet, (*it).first, (*it).second); + + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + return result; +} + + +/** + * Full result set queries + */ +TextCollection::full_result FMIndex::FullContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); // FIXME Throw exception? + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + EnumeratePositions(result, sp, ep); + + return result; +} + +TextCollection::full_result FMIndex::FullContains(uchar const * pattern, DocId begin, DocId end) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); // FIXME Throw exception? + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. 
+ EnumeratePositions(result, sp, ep, begin, end); + + return result; +} + +TextCollection::full_result FMIndex::FullKMismatches(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::full_result(); // empty result set + + suffix_range_vector ranges; + ulong count = kmismatches(ranges, pattern, 0, n-1, m, k); + + TextCollection::full_result result; + result.reserve(count); // avoid reallocation. + for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) + // Iterate through docs in [sp,ep]: + EnumeratePositions(result, (*it).first, (*it).second); + return result; +} + +TextCollection::full_result FMIndex::FullKErrors(uchar const * pattern, unsigned k) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::full_result(); // empty result set + + suffix_range_vector ranges; + ulong *dd = new ulong[m+1]; + for (unsigned i=0;in), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (n)."); + if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (samplerate)."); + + for(ulong i = 0; i < 256; ++i) + if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (C table)."); + + if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (bwt end position)."); + + alphabetrank->save(file); + sampled->save(file); + suffixes->Save(file); + suffixDocId->Save(file); + + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (numberOfTexts)."); + if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("FMIndex::Save(): file write error (maxTextLength)."); + + 
Doc->save(file); + textStorage->Save(file); + fflush(file); +} + + +/** + * Load index from a file handle + * + * Throws a std::runtime_error exception on i/o error. + * For more info, see FMIndex::Save(). + * + * index_mode_t is defined in TextCollection.h and + * defaults to both the index and "naive" text. + * + * Note: Samplerate can not be changed during load. + */ +FMIndex::FMIndex(FILE *file, index_mode_t im, unsigned samplerate_) + : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), + suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0) +{ + // NB: Type byte has already been read from input + + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (version flag)."); + if (verFlag != FMIndex::versionFlag) + throw std::runtime_error("FMIndex::Load(): invalid save file version."); + + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (n)."); + if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (samplerate)."); +// FIXME samplerate can not be changed during load. 
+// if (this->samplerate == 0) +// this->samplerate = samplerate; + + for(ulong i = 0; i < 256; ++i) + if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (C table)."); + + if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (bwt end position)."); + + alphabetrank = static_sequence::load(file); + if (im == index_mode_text_only) { delete alphabetrank; alphabetrank = 0; } + + sampled = static_bitsequence::load(file); + if (im == index_mode_text_only) { delete sampled; sampled = 0; } + suffixes = new BlockArray(file); + if (im == index_mode_text_only) { delete suffixes; suffixes = 0; } + suffixDocId = new BlockArray(file); + if (im == index_mode_text_only) { delete suffixDocId; suffixDocId = 0; } + + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts)."); + if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (maxTextLength)."); + + Doc = new ArrayDoc(file); //static_sequence::load(file); + if (im == index_mode_text_only) { delete Doc; Doc = 0; } + + textStorage = TextStorage::Load(file); + + // FIXME Construct data structures with new samplerate + //maketables(); +} + + + +/** + * Rest of the functions follow... 
+ */ +ulong FMIndex::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const +{ + int c; + while (*sp<=*ep && i>=1) + { + c = (int)pattern[--i]; + *sp = C[c]+alphabetrank->rank(c,*sp-1); + *ep = C[c]+alphabetrank->rank(c,*ep)-1; + } + if (*sp<=*ep) + return *ep - *sp + 1; + else + return 0; +} + + +ulong FMIndex::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const +{ + if (sp>ep) return 0; + if (j == 0) + { + result.push_back(std::make_pair(sp,ep)); + return ep-sp+1; + } + int c; + ulong spnew; + ulong epnew; + int knew; + ulong sum=0; + if (k==0) + { + sum = searchPrefix(pattern, j, &sp, &ep); + if (sp<=ep) + result.push_back(std::make_pair(sp, ep)); + return sum; + } + vector chars = alphabetrank->accessAll(sp, ep); + for (vector::iterator it = chars.begin(); it != chars.end(); ++it) + { + if (*it == 0) + continue; // skip '\0' + c = *it; + spnew = C[c]+alphabetrank->rank(c,sp-1); + epnew = C[c]+alphabetrank->rank(c,ep)-1; + if (c!=pattern[j-1]) knew = (int)k-1; else knew = k; + if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew); + } + return sum; +} + +//first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i +ulong FMIndex::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const +{ + ulong sum=0; + if (d[m]<=k) // range of suffixes with at most k-errors found + { + if (sp<=ep) + result.push_back(std::make_pair(sp, ep)); + sum += (sp<=ep)?ep-sp+1:0ul; + } + if (sp>ep || j==0) + return sum; + ulong *dnew = new ulong[m+1]; + int c; + ulong spnew; + ulong p,lowerbound; + ulong epnew; + vector chars = alphabetrank->accessAll(sp, ep); + for (vector::iterator it = chars.begin(); it != chars.end(); ++it) + { + if (*it == 0) + continue; // skip '\0' + c = *it; + spnew = C[c]+alphabetrank->rank(c,sp-1); + epnew = C[c]+alphabetrank->rank(c,ep)-1; + if (spnew>epnew) continue; + dnew[0]=m+k-j+1; + 
lowerbound=k+1; + for (p=1; p<=m; p++) { + dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1)); + if (dnew[p]rank(c,i) + int c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = C[c]; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + +ulong FMIndex::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + int c = (int)pattern[m-1]; + assert(c == 0); // Start from endmarkers + TextPosition i=m-1; + TextPosition sp = begin; + TextPosition ep = end; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + + +ulong FMIndex::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + uint c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = 1; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + uint result = alphabetrank->rankLessThan(c,ep); + if (result == ~0u) + ep = 0; + else + ep = C[c]+result-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; +} + + +FMIndex::~FMIndex() { + delete alphabetrank; + delete sampled; + delete suffixes; + delete 
suffixDocId; + delete Doc; + delete textStorage; +} + +void FMIndex::makewavelet(uchar *bwt) +{ + ulong i, min = 0, + max; + for (i=0;i<256;i++) + C[i]=0; + for (i=0;i0) {min = i; break;} + for (i=255;i>=min;--i) + if (C[i]>0) {max = i; break;} + + ulong prev=C[0], temp; + C[0]=0; + for (i=1;i<256;i++) { + temp = C[i]; + C[i]=C[i-1]+prev; + prev = temp; + } +// this->codetable = node::makecodetable(bwt,n); +// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); +// delete [] bwt; + //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! +// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + +#ifdef DEBUG_MEMUSAGE + std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate? 
+ wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape + alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); + delete bmb; + bwt = 0; // already deleted + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; +#endif +} + +void FMIndex::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText) +{ + // Calculate BWT end-marker position (of last inserted text) + { + ulong i = 0; + uint alphabetrank_i_tmp = 0; + uchar c = alphabetrank->access(i, alphabetrank_i_tmp); + while (c != '\0') + { + i = C[c]+alphabetrank_i_tmp-1; + c = alphabetrank->access(i, alphabetrank_i_tmp); + } + + this->bwtEndPos = i; + } + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + // Build up array for text starting positions +// BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); +// (*textStartPos)[0] = 0; + + // Mapping from end-markers to doc ID's: + unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts); +// uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; + BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts); + + BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n)); + uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; + for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) + sampledpositions[i] = 0; + + ulong x,p=bwtEndPos; + ulong sampleCount = 0; + // Keeping track of 
text position of prev. end-marker seen + ulong posOfSuccEndmarker = n-1; + DocId textId = numberOfTexts; + ulong ulongmax = 0; + ulongmax--; + uint alphabetrank_i_tmp =0; + + // Text length = n + number of bytes not indexed. + TextStorageBuilder tsbuilder(n + niText.length()); + ulong tsb_i = n + niText.length(); // Iterator from text length to 0. + string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts + + for (ulong i=n-1;iGetPos(i) + x=(i==n-1)?0:i+1; + + uchar c = alphabetrank->access(p, alphabetrank_i_tmp); + + tsbuilder[--tsb_i] = c; // Build TextStorage + + if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0') + { + set_field(sampledpositions,1,p,1); + (*positions)[sampleCount] = p; + sampleCount ++; + } + + if (c == '\0') + { + unsigned prevTextId = textId; // Cache textId value. + --textId; + /** + * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus, + * we have to search for the first text that is actually *indexed* + * to get correct prevTextId. + */ + if (prevTextId == numberOfTexts) + { + prevTextId = 0; + while (notIndexed.isSet(prevTextId)) + ++ prevTextId; + // Now prevTextId points to the first indexed Doc ID. + } + + /** + * Insert non-indexed texts + */ + while (notIndexed.isSet(textId)) + { + do { + tsbuilder[tsb_i] = *nit_i; + -- tsb_i; + ++ nit_i; + } while (nit_i != niText.rend() && *nit_i != '\0'); + + tsbuilder[tsb_i] = '\0'; + + if (textId == 0) + break; + --textId; + } + + // Record the order of end-markers in BWT: + ulong endmarkerRank = alphabetrank_i_tmp - 1; + //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); + (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts; + + // Store text length and text start position: + if (textId < (DocId)numberOfTexts - 1) + { +// (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker. 
+ + posOfSuccEndmarker = i; + } + + // LF-mapping from '\0' does not work with this (pseudo) BWT. + // Correct LF-mapping to the last char of the previous text: + p = textId - notIndexed.rank(textId); + } + else // Now c != '\0', do LF-mapping: + p = C[c]+alphabetrank_i_tmp-1; + } + while (textId > 0 && notIndexed.isSet(textId-1)) + { + do { + -- tsb_i; + tsbuilder[tsb_i] = *nit_i; + ++ nit_i; + } while (nit_i != niText.rend() && *nit_i != '\0'); + --textId; + } + assert(textId == 0); + assert(tsb_i == 0); + assert(nit_i == niText.rend()); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + textStorage = tsbuilder.InitTextStorage(tsType); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); + delete [] sampledpositions; + assert(sampleCount == sampleLength); + assert(sampled->rank1(n-1) == sampleLength); + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + // Suffixes store an offset from the text start position + suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); + 
suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); + + x = n + niText.length() - 2; + textId = numberOfTexts - 1; + posOfSuccEndmarker = x + 1; + for(ulong i = 0; i < sampleLength; i ++) { + // Find next sampled text position + while ((posOfSuccEndmarker - x) % samplerate != 0 + || notIndexed.isSet(textId)) // Loop over non-indexed + { + --x; + assert(x != ~0lu); + if (textStorage->IsEndmarker(x)) + { + posOfSuccEndmarker = x--; + -- textId; + } + } + assert((*positions)[i] < n); + ulong j = sampled->rank1((*positions)[i]); + + assert(j != 0); // if (j==0) j=sampleLength; + + TextPosition textPos = (x==n-1)?0:x+1; + (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos); + assert(textStorage->DocIdAtTextPos(textPos) == textId); + + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); + --x; + if (x != ~0lu && textStorage->IsEndmarker(x)) + { + posOfSuccEndmarker = x--; + -- textId; + } + } + + delete positions; + +#ifdef DEBUG_MEMUSAGE + std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + +#ifdef DEBUG_MEMUSAGE + std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; + HeapProfiler::ResetMaxHeapConsumption(); +#endif + + /*alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate? + Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true); + delete bmb;*/ + // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! 
+
+    Doc = new ArrayDoc(endmarkerDocId);
+
+#ifdef DEBUG_MEMUSAGE
+    std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+#endif
+}
+
+
+/**
+ * Finds the document identifier for a given text position.
+ *
+ * Binary search over the text starting positions in textStartPos.
+ *
+ * @param textStartPos  Array indexed by DocId; entry k is compared against i
+ *                      as the starting position of text k. It is only read.
+ *                      Presumably non-decreasing in k (the search relies on
+ *                      it) -- TODO confirm against the caller.
+ * @param i             Text position to look up; must be < n (asserted).
+ * @return DocId a with textStartPos[a] <= i, and i < textStartPos[a+1]
+ *         (i < n when a is the last document). These post-conditions are
+ *         checked by the asserts at the end of the function.
+ *
+ * Note: i is passed by value and nothing is written back through either
+ * parameter; the start position of the document is not returned.
+ */
+TextCollection::DocId FMIndex::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
+{
+    assert(i < n);
+
+    DocId a = 0;
+    DocId b = numberOfTexts - 1;
+    while (a < b)
+    {
+        // Midpoint; c < b whenever a < b, so index c+1 below never exceeds b.
+        DocId c = a + (b - a)/2;
+        if ((*textStartPos)[c] > i)
+            b = c - 1;      // text c starts after i
+        else if ((*textStartPos)[c+1] > i)
+            return c;       // text c starts at or before i, text c+1 after i
+        else
+            a = c + 1;      // text c+1 also starts at or before i
+    }
+
+    // Post-conditions on the answer.
+    assert(a < (DocId)numberOfTexts);
+    assert(i >= (*textStartPos)[a]);
+    assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
+    return a;
+}
+
+
+} // namespace SXSI
+
diff --git a/FMIndex.h b/FMIndex.h
new file mode 100644
index 0000000..e12be99
--- /dev/null
+++ b/FMIndex.h
@@ -0,0 +1,387 @@
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * FMIndex implementation for the TextCollection interface *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details.
* + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + *****************************************************************************/ + +#ifndef _FMIndex_H_ +#define _FMIndex_H_ + +#include "incbwt/bits/deltavector.h" + +#include "BitRank.h" +#include "TextCollection.h" +#include "BlockArray.h" + +// Include from XMLTree/libcds +#include // Defines W == 32 +#include +#include +#include +#include + +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif +#undef bitset +#undef bitget + + +#include "TextStorage.h" +#include "ArrayDoc.h" +#include +#include + +namespace SXSI +{ + + +/** + * Implementation of the TextCollection interface + * + */ +class FMIndex : public SXSI::TextCollection { +public: + FMIndex(uchar *, ulong, unsigned, unsigned, ulong, ulong, + CSA::DeltaVector &, const std::string &, char); + ~FMIndex(); + + bool EmptyText(DocId) const; + + /** + * Extracting one text. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId) const; + void DeleteText(uchar *text) const + { textStorage->DeleteText(text); } + + /** + * Returns a pointer to the beginning of texts i, i+1, ..., j. + * Texts are separated by a '\0' byte. + * + * Call DeleteText() for each pointer returned by GetText() + * to avoid possible memory leaks. + */ + uchar * GetText(DocId i, DocId j) const + { return textStorage->GetText(i, j); } + + /** + * Returns a substring of given text ID. + * + * FIXME This may be reimplemented via TextStorage. 
+ */ +// uchar* GetText(DocId, TextPosition, TextPosition) const; + + bool IsPrefix(uchar const *) const; + bool IsSuffix(uchar const *) const; + bool IsEqual(uchar const *) const; + bool IsContains(uchar const *) const; + bool IsLessThan(uchar const *) const; + + bool IsPrefix(uchar const *, DocId, DocId) const; + bool IsSuffix(uchar const *, DocId, DocId) const; + bool IsEqual(uchar const *, DocId, DocId) const; + bool IsContains(uchar const *, DocId, DocId) const; + bool IsLessThan(uchar const *, DocId, DocId) const; + + ulong Count(uchar const *) const; + unsigned CountPrefix(uchar const *) const; + unsigned CountSuffix(uchar const *) const; + unsigned CountEqual(uchar const *) const; + unsigned CountContains(uchar const *) const; + unsigned CountLessThan(const unsigned char*) const; + + unsigned CountPrefix(uchar const *, DocId, DocId) const; + unsigned CountSuffix(uchar const *, DocId, DocId) const; + unsigned CountEqual(uchar const *, DocId, DocId) const; + unsigned CountContains(uchar const *, DocId, DocId) const; + unsigned CountLessThan(uchar const *, DocId, DocId) const; + + // Definition of document_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *) const; + document_result Suffix(uchar const *) const; + document_result Equal(uchar const *) const; + document_result Contains(uchar const *) const; + document_result LessThan(uchar const *) const; + document_result KMismaches(uchar const *, unsigned) const; + document_result KErrors(uchar const *, unsigned) const; + + document_result Prefix(uchar const *, DocId, DocId) const; + document_result Suffix(uchar const *, DocId, DocId) const; + document_result Equal(uchar const *, DocId, DocId) const; + document_result Contains(uchar const *, DocId, DocId) const; + document_result LessThan(uchar const *, DocId, DocId) const; + + // Definition of full_result is inherited from SXSI::TextCollection. 
+ full_result FullContains(uchar const *) const; + full_result FullContains(uchar const *, DocId, DocId) const; + full_result FullKMismatches(uchar const *, unsigned) const; + full_result FullKErrors(uchar const *, unsigned) const; + + // Index from/to disk + FMIndex(FILE *, index_mode_t, unsigned); + void Save(FILE *, char const *) const; + +private: + typedef std::vector > suffix_range_vector; + + static const uchar versionFlag; + TextPosition n; + unsigned samplerate; + unsigned C[256]; + TextPosition bwtEndPos; + static_sequence * alphabetrank; + + // Sample structures for texts longer than samplerate + static_bitsequence * sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + + // Array of document id's in the order of end-markers in BWT +// static_sequence *Doc; + ArrayDoc *Doc; + + // Text storage for fast extraction + TextStorage * textStorage; + + // Following methods are not part of the public API + uchar * BWT(uchar *); + void makewavelet(uchar *); + void maketables(ulong, char, CSA::DeltaVector &, const std::string &); + DocId DocIdAtTextPos(BlockArray*, TextPosition) const; + ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; + ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; + ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; + ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const; + ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const; + ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; + /** + * Count end-markers in given interval + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + 
ranksp = alphabetrank->rank(0, sp - 1); + + return alphabetrank->rank(0, ep) - ranksp; + } + + /** + * Count end-markers in given interval and + * within docIds [min,max] + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return 0; + + return Doc->count(sp, ep-1, min, max); + } + + /** + * Enumerate all end-markers in given interval + */ + inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return document_result(); + + return Doc->accessAll(sp, ep-1); + } + + /** + * Enumerate end-markers in given interval and + * within docIds [min,max] + */ + inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const + { + if (sp != 0) + sp = alphabetrank->rank(0, sp - 1); + ep = alphabetrank->rank(0, ep); + if (ep == 0) + return document_result(); + + return Doc->access(sp, ep-1, min, max); + } + + /** + * Enumerate documents in given interval [sp, ep] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep) const + { + // We want unique document indentifiers, using std::set to collect them + // FIXME use unordered_set? + uint tmp_rank_c = 0; // Cache rank value of c. 
+ for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + resultSet.insert(Doc->access(endmarkerRank)); + } + else + { + DocId di = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); + } + } + } + + /** + * Enumerate documents in given interval [sp, ep] + * and within [begin, end] + */ + inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + // We want unique document indentifiers, using std::set to collect them + uint tmp_rank_c = 0; // Cache rank value of c. + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + else + { + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + assert((unsigned)docId < numberOfTexts); + if (docId >= begin && docId <= end) + resultSet.insert(docId); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const + { + uint tmp_rank_c = 0; // Cache rank value of c. 
+ for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + result.push_back(std::make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + result.push_back(std::make_pair(docId, textPos)); + } + } + } + + /** + * Enumerate document+position pairs (full_result) of + * each suffix in given interval and within [begin, end]. + */ + inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const + { + uint tmp_rank_c = 0; // Cache rank value of c. 
+ for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->access(i, tmp_rank_c); + while (c != '\0' && !sampled->access(i)) + { + i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i, tmp_rank_c); + ++ dist; + } + if (c == '\0') + { + // Rank among the end-markers in BWT + unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; + DocId docId = Doc->access(endmarkerRank); + if (docId >= begin && docId <= end) + result.push_back(std::make_pair(docId, dist)); + } + else + { + TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; + + if (docId >= begin && docId <= end) + result.push_back(std::make_pair(docId, textPos)); + } + } + } + +}; // class FMIndex + +} // namespace SXSI + +#endif diff --git a/FMIndexBuilder.cpp b/FMIndexBuilder.cpp new file mode 100644 index 0000000..ea52224 --- /dev/null +++ b/FMIndexBuilder.cpp @@ -0,0 +1,146 @@ +#include "incbwt/rlcsa_builder.h" +#include "incbwt/bits/deltavector.h" + +#include "FMIndexBuilder.h" +#include "FMIndex.h" + +using std::string; + +namespace SXSI +{ + +struct TCBuilderRep +{ + unsigned samplerate; + CSA::RLCSABuilder * sa; + + ulong n; + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + ulong numberOfSamples; + + CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. + string niText; // Texts that are not indexed. 
+
+#ifdef TCB_TEST_BWT
+    DynFMI *dynFMI;
+#endif
+};
+
+/**
+ * Init text collection
+ *
+ * Zeroes the counters held in the pimpl and allocates the two helper
+ * objects: the DeltaEncoder that records which document IDs are not indexed,
+ * and the RLCSABuilder that accumulates the indexed texts.
+ *
+ * @param samplerate            Stored as is; InsertText() divides by it when
+ *                              counting samples.
+ * @param estimatedInputLength  Raised to TEXTCOLLECTION_DEFAULT_INPUT_LENGTH
+ *                              when smaller; one tenth of the result is
+ *                              passed as the RLCSABuilder's third argument.
+ */
+FMIndexBuilder::FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength)
+    : p_(new struct TCBuilderRep())
+{
+    p_->n = 0;
+    p_->samplerate = samplerate;
+    p_->numberOfTexts = 0;
+    p_->numberOfSamples = 0;
+    p_->maxTextLength = 0;
+    p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32
+    p_->niText = "";
+
+    // Current params: 8 bytes, no samples, buffer size n/10 bytes.
+    // Buffer size is always at least 15MB:
+    if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
+        estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;
+    p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10);
+    assert(p_->sa->isOk());
+
+#ifdef TCB_TEST_BWT
+    uchar temp[256];
+    for (unsigned i = 0; i < 255; ++i)
+        temp[i] = i+1;
+    temp[255] = 0;
+    p_->dynFMI = new DynFMI(temp, 1, 255, false);
+#endif
+}
+
+/**
+ * Releases the pimpl and the helper objects it still owns.
+ *
+ * InitTextCollection() deletes and nulls p_->notIndexed (and p_->sa when at
+ * least one text was inserted), so the deletes below may see null pointers.
+ */
+FMIndexBuilder::~FMIndexBuilder()
+{
+#ifdef TCB_TEST_BWT
+    delete p_->dynFMI;
+#endif
+
+    delete p_->sa;
+    delete p_->notIndexed;
+    delete p_;
+}
+
+/**
+ * Adds one zero-terminated text to the collection under construction.
+ *
+ * Every call increments numberOfTexts, so the text inserted by this call
+ * has document ID (numberOfTexts - 1) after the increment.
+ *
+ * @param text   Zero-terminated text. An empty text prints an error to
+ *               std::cerr and terminates the process with exit(1).
+ * @param index  true: the text is handed to the RLCSA builder and counted in
+ *               n, numberOfSamples and maxTextLength.
+ *               false: the text is only appended to niText and its document
+ *               ID is marked in notIndexed.
+ */
+void FMIndexBuilder::InsertText(uchar const * text, bool index)
+{
+    // m counts the terminating 0-byte as well.
+    TextCollection::TextPosition m = std::strlen((char *)text) + 1;
+    if (m <= 1)
+    {
+        // FIXME indexing empty texts
+        std::cerr << "FMIndexBuilder::InsertText() error: can not index empty texts!" << std::endl;
+        exit(1);
+    }
+
+    p_->numberOfTexts ++;
+
+    if (index)
+    {
+        /**
+         * Insert text into the index
+         */
+        p_->n += m;
+        p_->numberOfSamples += (m-1)/p_->samplerate;
+
+        if (m > p_->maxTextLength)
+            p_->maxTextLength = m; // Store length of the longest text seen so far.
+
+        // Only the m-1 payload bytes are passed on, without the terminator.
+        p_->sa->insertSequence((char*)text, m-1, 0);
+        assert(p_->sa->isOk());
+    }
+    else
+    {
+        /**
+         * Insert text only to TextStorage
+         */
+        p_->notIndexed->setBit(p_->numberOfTexts - 1);
+        // m bytes: the terminating 0-byte is kept as the separator in niText.
+        p_->niText.append((const char *)text, m);
+    }
+}
+
+
+/**
+ * Builds the FMIndex from everything inserted so far.
+ *
+ * With no texts inserted, a BWT of length 1 holding a single 0-byte is
+ * fabricated and numberOfTexts becomes 1 (one empty text). Otherwise the BWT
+ * is taken from the RLCSA builder, which is then deleted and nulled.
+ *
+ * The notIndexed encoder is converted into a CSA::DeltaVector of
+ * numberOfTexts+1 bits and then deleted and nulled.
+ *
+ * NOTE(review): because p_->notIndexed is nulled here, a second call on the
+ * same builder would dereference a null pointer.
+ *
+ * @param type  Forwarded unchanged as the last FMIndex constructor argument.
+ * @return Newly allocated FMIndex (created with new; nothing in this file
+ *         deletes it).
+ */
+TextCollection * FMIndexBuilder::InitTextCollection(char type)
+{
+    uchar * bwt = 0;
+    CSA::usint length = 0;
+    if (p_->numberOfTexts == 0)
+    {
+        p_->numberOfTexts ++; // Add one empty text
+        bwt = new uchar[2];
+        bwt[0] = '\0';
+        bwt[1] = '\0';
+        length = 1;
+        p_->maxTextLength = 1;
+    }
+    else
+    {
+        bwt = (uchar *)p_->sa->getBWT(length);
+        delete p_->sa;
+        p_->sa = 0;
+
+        assert(length == p_->n);
+    }
+
+    // Extra bit one past the last document ID, so the vector is never empty.
+    p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's
+    CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1);
+    delete p_->notIndexed;
+    p_->notIndexed = 0;
+
+    // bwt is handed over to the FMIndex constructor (its makewavelet() call
+    // is commented "Deletes bwt!" in FMIndex.cpp).
+    TextCollection *result = new FMIndex(bwt, (ulong)length,
+                   p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples,
+                   deltav, p_->niText, type);
+
+    return result;
+}
+
+
+} // namespace SXSI
diff --git a/FMIndexBuilder.h b/FMIndexBuilder.h
new file mode 100644
index 0000000..a18540b
--- /dev/null
+++ b/FMIndexBuilder.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * Copyright (C) 2009 by Niko Valimaki *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details.
* + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_FMIndexBuilder_h_ +#define _SXSI_FMIndexBuilder_h_ + +#include "TextCollectionBuilder.h" +#include "TextStorage.h" +#include "Tools.h" // Defines ulong and uchar. + +#include +#include +#include // Defines std::pair. +#include // Defines std::strlen, added by Kim + +// Un-comment to compare BWT against a BWT generated from class dynFMI: +//#define TCB_TEST_BWT + + +namespace SXSI +{ + struct TCBuilderRep; // Pimpl + + /** + * Build an instance of the TextCollection class. + */ + class FMIndexBuilder : public TextCollectionBuilder + { + public: + FMIndexBuilder(unsigned samplerate, ulong estimatedInputLength); + + virtual ~FMIndexBuilder(); + + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. + * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + * + * Second parameter tells if the text will be added to the + * index also. If false, text is added only to the TextCollection + * and can not be searched for. + */ + virtual void InsertText(uchar const *, bool index = true); + /** + * Make static + * + * Convert to a static collection. + * New texts can not be inserted after this operation. + * + * TextStorage type defaults to TYPE_PLAIN_TEXT, another + * possible type is TYPE_LZ_INDEX. + */ + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT); + + private: + FMIndexBuilder(); + + // Using Pimpl idiom to hide RLCSA implementation. 
+ struct TCBuilderRep * p_; + + // No copy constructor or assignment + FMIndexBuilder(FMIndexBuilder const&); + FMIndexBuilder& operator = (FMIndexBuilder const&); + }; +} +#endif diff --git a/SWCSABuilder.h b/SWCSABuilder.h new file mode 100644 index 0000000..e60bf02 --- /dev/null +++ b/SWCSABuilder.h @@ -0,0 +1,118 @@ +/****************************************************************************** + * Copyright (C) 2009 by Niko Valimaki * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_SWCSABuilder_h_ +#define _SXSI_SWCSABuilder_h_ + +#include "TextCollectionBuilder.h" +#include "TextStorage.h" +#include "Tools.h" // Defines ulong and uchar. +#include "SWCSAWrapper.h" + +#include +#include // Defines std::pair. +#include // Defines std::strlen, added by Kim + +namespace SXSI +{ + /** + * Build an instance of the TextCollection class. + */ + class SWCSABuilder : public TextCollectionBuilder + { + public: + SWCSABuilder(unsigned sampler) + : text(""), samplerate(sampler), numberOfTexts(0) + { /* NOP */ } + + virtual ~SWCSABuilder() + { /* NOP */ } + + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. 
+ * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + * + * All texts must be inserted into the index! + * The default (FMIndex) text collection supports non-indexed texts. + */ + virtual void InsertText(uchar const *t, bool index = true) + { + if (strlen((char const *) t) == 0) + { + std::cerr << "SWCSABuilder::InsertText(): Can not index empty texts!" << std::endl; + std::exit(1); + } + assert(index); + if (!index) + { + std::cerr << "SWCSABuilder::InsertText(): The implementation of SWCSA does not support non-indexed texts" + << std::endl << "Use the default (FMIndex) text collection instead." << std::endl; + std::exit(1); + } + text.append((char const *) t, strlen((char const *) t) + 1); // +1 for 0-byte. + ++ numberOfTexts; + } + + /** + * Make static + * + * Convert to a static collection. + * New texts can not be inserted after this operation. + * + * + */ + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) + { + assert(type == TextStorage::TYPE_PLAIN_TEXT); + if (type != TextStorage::TYPE_PLAIN_TEXT) + { + std::cerr << "SWCSABuilder::InitTextCollection(): The implementation of SWCSA supports only TextStorage::TYPE_PLAIN_TEXT" + << std::endl << "Use the default (FMIndex) text collection instead." << std::endl; + std::exit(1); + } + + ulong n = text.size(); + uchar *t = new uchar[n]; // FIXME uses temporarily too much space + ulong l = text.copy((char *)t, n); + if (l != n) + { + std::cerr << "SWCSABuilder::InitTextCollection(): copy failed!" << std::endl; + std::exit(1); + } + text.clear(); + return new SWCSAWrapper(t, n, samplerate, numberOfTexts); // This will delete [] t. 
+ } + + + private: + SWCSABuilder(); + std::string text; + unsigned samplerate; + unsigned numberOfTexts; + + // No copy constructor or assignment + SWCSABuilder(SWCSABuilder const&); + SWCSABuilder& operator = (SWCSABuilder const&); + }; +} +#endif diff --git a/SWCSAWrapper.h b/SWCSAWrapper.h new file mode 100644 index 0000000..731fc83 --- /dev/null +++ b/SWCSAWrapper.h @@ -0,0 +1,352 @@ +/****************************************************************************** + * Copyright (C) 2010 by Niko Välimäki * + * * + * FMIndex implementation for the TextCollection interface * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* + *****************************************************************************/ + +#ifndef _SWCSAWrapper_H_ +#define _SWCSAWrapper_H_ + +#include "TextCollection.h" + +#include "TextStorage.h" + +#include "incbwt/bits/deltavector.h" +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + +#define SWCSAWRAPPER_VERSION_FLAG 101 + +#include "swcsa/utils/defValues.h" +#include "swcsa/utils/valstring.h" +#include "swcsa/interface.h" + +#include +#include + +namespace SXSI +{ + +/** + * Partial implementation of the TextCollection interface + * + * Supports index construction, save, load and simple search. + * Use FMIndex implementation for full support. + */ +class SWCSAWrapper : public SXSI::TextCollection { +public: + SWCSAWrapper(uchar * text, ulong length, unsigned samplerate, + unsigned numberOfTexts_) + : index(0), offsets(0), n(length), seSize(0), numberOfTexts(numberOfTexts_) + { + // Inicializes the arrays used to detect if a char is valid or not. + StartValid(); + + // Delta encoded bitvector of text offsets. + CSA::DeltaEncoder encoder(32); + encoder.setBit(0); // Start of the first text. + + // Construct offset map from words to text elements + seSize = 0; + { + uchar const *pbeg,*pend; + + pbeg = text; + pend = text+length; + + while (pbeg = pend) {size++;} // a unique BLANK at the end of the file. 
+ else { + if (_Valid [*pbeg] ) { + while ( (sizerank(seSize - 1) << std::endl;
+
+        char opt[100];
+        snprintf(opt, 99, "sA=%u;sAinv=%u;sPsi=%u", samplerate, samplerate, samplerate);
+
+        int r = build_index(text, n, opt, &index);
+        if (r)
+        {
+            std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
+            std::exit(r);
+        }
+    }
+
+    ~SWCSAWrapper()
+    {
+        int r = free_index (index);
+        if (r)
+        {
+            std::cout << "SWCSAWrapper destructor error: " << error_index(r) << std::endl;
+            std::exit(r);
+        }
+        index = 0;
+        delete offsets; offsets = 0;
+    }
+
+    bool EmptyText(DocId k) const
+    {
+        assert(k < (DocId)numberOfTexts);
+        return false; // Empty texts are not indexed
+    }
+
+    /**
+     * Extracting one text.
+     *
+     * Call DeleteText() for each pointer returned by GetText()
+     * to avoid possible memory leaks.
+     */
+    uchar * GetText(DocId i) const
+    {
+        return GetText(i, i);
+    }
+    void DeleteText(uchar *text) const
+    {
+        free(text);
+    }
+
+    /**
+     * Returns a pointer to the beginning of texts i, i+1, ..., j.
+     * Texts are separated by a '\0' byte.
+     *
+     * The range passed to extractWords() starts at the offset of text i
+     * and ends at the offset of the text following j, so all of i..j is
+     * covered. Requires i <= j < numberOfTexts (asserted).
+     *
+     * On an extractWords() failure the error is printed and the process
+     * exits with the returned code.
+     *
+     * Call DeleteText() for each pointer returned by GetText()
+     * to avoid possible memory leaks.
+     */
+    uchar * GetText(DocId i, DocId j) const
+    {
+        assert(i <= j);
+        assert(j < (DocId)numberOfTexts);
+
+        ulong from, to, l;
+        uchar *text;
+        from = offsets->select(i);
+        // End of the range is the start of text j+1. Using i+1 here ignored
+        // j and returned text i only.
+        // Original note: "ADD one 1-bit in to end!!!" -- presumably select()
+        // for the last text needs a sentinel 1-bit after the final text
+        // start; TODO confirm that the constructor sets it.
+        to = offsets->select(j+1);
+
+        int r = extractWords(index, from, to, &text, &l);
+        if (r)
+        {
+            std::cout << "SWCSAWrapper error: " << error_index(r) << std::endl;
+            std::exit(r);
+        }
+        text[l] = 0;
+        return text;
+    }
+
+    bool IsPrefix(uchar const *) const { unsupported(); return false; };
+    bool IsSuffix(uchar const *) const { unsupported(); return false; };
+    bool IsEqual(uchar const *) const { unsupported(); return false; };
+    bool IsContains(uchar const *) const { unsupported(); return false; };
+    bool IsLessThan(uchar const *) const { unsupported(); return false; };
+
+    bool IsPrefix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+    bool IsSuffix(uchar const *, DocId, DocId) const { unsupported(); return false; };
+    bool IsEqual(uchar const *, DocId, DocId) const { unsupported(); return false; };
+    bool IsContains(uchar const *, DocId, DocId) const { unsupported(); return false; };
+    bool IsLessThan(uchar const *, DocId, DocId) const { unsupported(); return false; };
+
+    ulong Count(uchar const *pattern) const
+    {
+        ulong occs = 0;
+        // FIXME Const correctness is broken!
+ int r = count (index, (uchar *)pattern, std::strlen((char const *)pattern), &occs); + if (r) + { + std::cout << "SWCSAWrapper::Count error " << error_index(r) << std::endl; + std::exit(r); + } + return occs; + } + unsigned CountPrefix(uchar const *) const { unsupported(); return 0; }; + unsigned CountSuffix(uchar const *) const { unsupported(); return 0; }; + unsigned CountEqual(uchar const *) const { unsupported(); return 0; }; + unsigned CountContains(uchar const *) const { unsupported(); return 0; }; + unsigned CountLessThan(const unsigned char*) const { unsupported(); return 0; }; + + unsigned CountPrefix(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountSuffix(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountEqual(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountContains(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + unsigned CountLessThan(uchar const *, DocId, DocId) const { unsupported(); return 0; }; + + // Definition of document_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *) const { unsupported(); return document_result(); }; + document_result Suffix(uchar const *) const { unsupported(); return document_result(); }; + document_result Equal(uchar const *) const { unsupported(); return document_result(); }; + document_result Contains(uchar const *pattern) const + { + ulong *occ = 0, numocc = 0; + // FIXME Const correctness is broken! 
+ int r = locateWord(index, (uchar *)pattern, std::strlen((char const *)pattern), &occ, &numocc, 0); + if (r) + { + std::cout << "SWCSAWrapper::Contains error: " << error_index(r) << std::endl; + std::exit(r); + } + + document_result dr; + dr.reserve(numocc+1); + for (ulong i = 0; i < numocc; ++i) + dr.push_back(offsets->rank(occ[i])-1); + + free(occ); + return dr; + } + document_result LessThan(uchar const *) const { unsupported(); return document_result(); }; + document_result KMismaches(uchar const *, unsigned) const { unsupported(); return document_result(); }; + document_result KErrors(uchar const *, unsigned) const { unsupported(); return document_result(); }; + + document_result Prefix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Suffix(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Equal(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result Contains(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + document_result LessThan(uchar const *, DocId, DocId) const { unsupported(); return document_result(); }; + + // Definition of full_result is inherited from SXSI::TextCollection. 
+ full_result FullContains(uchar const *) const { unsupported(); return full_result(); }; + full_result FullContains(uchar const *, DocId, DocId) const { unsupported(); return full_result(); }; + full_result FullKMismatches(uchar const *, unsigned) const { unsupported(); return full_result(); }; + full_result FullKErrors(uchar const *, unsigned) const { unsupported(); return full_result(); }; + + // Index from/to disk + SWCSAWrapper(FILE *file, char const *filename) + : index(0), offsets(0), n(0), seSize(0), numberOfTexts(0) + { + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Load(): file read error (version flag)."); + if (verFlag != SWCSAWRAPPER_VERSION_FLAG) + throw std::runtime_error("SWCSAWrapper::Load(): invalid save file version."); + + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Load(): file read error (n)."); + if (std::fread(&seSize, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Load(): file read error (seSize)."); + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("FMIndex::Load(): file read error (numberOfTexts)."); + + offsets = new CSA::DeltaVector(file); + + // FIXME Const correctness is broken! 
+ int r = load_index((char *)filename, &index); + if (r) + { + std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl; + std::exit(r); + } + } + + void Save(FILE *file, char const *filename) const + { + const char type = 'W'; + // Saving type info: + if (std::fwrite(&type, 1, 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Save(): file write error (type flag)."); + + const uchar ver = SWCSAWRAPPER_VERSION_FLAG; + // Saving version info: + if (std::fwrite(&ver, 1, 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Save(): file write error (version flag)."); + + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Save(): file write error (n)."); + if (std::fwrite(&(this->seSize), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Save(): file write error (seSize)."); + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("SWCSAWrapper::Save(): file write error (numberOfTexts)."); + + offsets->writeTo(file); + + // FIXME Const correctness is broken! + int r = save_index(index, (char *)filename); + if (r) + { + std::cout << "SWCSAWrapper::Save error: " << error_index(r) << std::endl; + std::exit(r); + } + } + +private: + void *index; + CSA::DeltaVector *offsets; + + TextPosition n; + ulong seSize; + + // Total number of texts in the collection + unsigned numberOfTexts; + + void unsupported() const + { + std::cerr << std::endl << "-------------------------------------------------------------\n" + << "SWCSAWrapper: unsupported method!\nSee SWCSAWrapper.h for more details.\n" + << "The default index (FMIndex) implements this method!" 
<< std::endl; + std::exit(5); + } +}; // class SWCSAWrapper + +} // namespace SXSI + +#endif diff --git a/TCImplementation.cpp b/TCImplementation.cpp deleted file mode 100644 index 1a29f25..0000000 --- a/TCImplementation.cpp +++ /dev/null @@ -1,1297 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * - * * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - *****************************************************************************/ -#include "TCImplementation.h" - -//#define DEBUG_MEMUSAGE -#ifdef DEBUG_MEMUSAGE -#include "HeapProfiler.h" // FIXME remove -#endif - -#include -#include -#include -#include -#include -#include -#include -#include // For strlen() -using std::vector; -using std::pair; -using std::make_pair; -using std::map; -using std::string; -namespace SXSI -{ - -// Save file version info -const uchar TCImplementation::versionFlag = 8; - -/** - * Constructor inits an empty dynamic FM-index. - * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. 
- */ -TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, - unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, - CSA::DeltaVector & notIndexed, const string & niText, char tsType) - : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), - suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0) -{ - makewavelet(bwt); // Deletes bwt! - bwt = 0; - - // Make sampling tables - maketables(numberOfSamples_, tsType, notIndexed, niText); -} - -bool TCImplementation::EmptyText(DocId k) const -{ - assert(k < (DocId)numberOfTexts); - return false; // Empty texts are not indexed -} - -uchar * TCImplementation::GetText(DocId k) const -{ - assert(k < (DocId)numberOfTexts); - - return textStorage->GetText(k); -/* TextPosition i = k; - - string result; - // Reserve average string length to avoid reallocs - result.reserve(n/numberOfTexts); - - uchar c = alphabetrank->access(i); - while (c != '\0') - { - result.push_back(c); - i = C[c]+alphabetrank->rank(c,i)-1; - - c = alphabetrank->access(i); // "next" char. - } - - // Convert to uchar (FIXME return string?) 
- i = result.size(); - uchar* res = new uchar[i+1]; - res[i] = '\0'; - for (ulong j = 0; j < i; ++j) - res[i-j-1] = result[j]; - return res;*/ -} - -/* - * Substring queries are supported via the pointer returned by TextStorage::GetText -uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const -{ - assert(k < (DocId)numberOfTexts); - assert(j < (*textLength)[k]); - assert(i <= j); - - ulong textRank = 0; - - // Start position of k'th text - ulong start = (*textStartPos)[k]; - - return Substring(i + start, j-i+1); - }*/ - - - -/****************************************************************** - * Existential queries - */ -bool TCImplementation::IsPrefix(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return true; - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Check for end-marker(s) in result interval - if (CountEndmarkers(sp, ep)) - return true; - return false; -} - -bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return true; - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Check for end-marker(s) in result interval - if (CountEndmarkers(sp, ep, begin, end)) - return true; - return false; -} - - -bool TCImplementation::IsSuffix(uchar const *pattern) const -{ - // Here counting is as fast as IsSuffix(): - if (CountSuffix(pattern) > 0) - return true; - return false; -} - -bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const -{ - // Here counting is as fast as IsSuffix(): - if (CountSuffix(pattern, begin, end) > 0) - return true; - return false; -} - -bool TCImplementation::IsEqual(uchar const *pattern) const -{ - TextPosition m = std::strlen((char *)pattern); - if (m == 0) - return false; // No empty texts exists - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep); - - // Check 
for end-marker(s) in result interval - if (CountEndmarkers(sp, ep)) - return true; - return false; -} - -bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const -{ - TextPosition m = std::strlen((char *)pattern); - if (m == 0) - return false; // No empty texts exists - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep, begin, end); - - // Check for end-marker(s) in result interval - if (CountEndmarkers(sp, ep)) - return true; - return false; -} - -bool TCImplementation::IsContains(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return true; - - TextPosition sp = 0, ep = 0; - // Just check if pattern exists somewhere - ulong count = Search(pattern, m, &sp, &ep); - - if (count > 0) - return true; - return false; -} - -bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const -{ - // Here counting is as fast as existential querying - if (CountContains(pattern, begin, end) > 0) - return true; // FIXME No need to filter result set - return false; -} - -bool TCImplementation::IsLessThan(uchar const * pattern) const -{ - if (CountLessThan(pattern) > 0) - return true; - return false; -} - -bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const -{ - if (CountLessThan(pattern, begin, end) > 0) - return true; - return false; -} - -/****************************************************************** - * Counting queries - */ -ulong TCImplementation::Count(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return 0; - - TextPosition sp = 0, ep = 0; - unsigned count = (unsigned) Search(pattern, m, &sp, &ep); - return count; -} - -unsigned TCImplementation::CountPrefix(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return numberOfTexts; - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Count 
end-markers in result interval - return CountEndmarkers(sp, ep); -} - -unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return numberOfTexts; - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Count end-markers in result interval - return CountEndmarkers(sp, ep, begin, end); -} - -unsigned TCImplementation::CountSuffix(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return numberOfTexts; - - TextPosition sp = 0, ep = 0; - // Search with end-marker - unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep); - - return count; -} - -unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return numberOfTexts; - - TextPosition sp = 0, ep = 0; - // Search with end-marker - unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end); - - return count; -} - -unsigned TCImplementation::CountEqual(uchar const *pattern) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return 0; // No empty texts. - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep); - - // Count end-markers in result interval - return CountEndmarkers(sp, ep); -} - -unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return 0; // No empty texts. - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep, begin, end); - - // Count end-markers in result interval - return CountEndmarkers(sp, ep); // Already within [begin, end] -} - -unsigned TCImplementation::CountContains(uchar const * pattern) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return numberOfTexts; // Total number of texts. 
- - // Here counting is as slow as fetching the result set - // because we have to filter out occ's that fall within same document. - TextCollection::document_result result = Contains(pattern); - return result.size(); -} - -unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return numberOfTexts; // Total number of texts. - - // Here counting is as slow as fetching the result set - // because we have to filter out occ's that fall within same document. - TextCollection::document_result result = Contains(pattern, begin, end); - return result.size(); -} - -// Less than or equal -unsigned TCImplementation::CountLessThan(uchar const * pattern) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return 0; // No empty texts. - - TextPosition sp = 0, ep = 0; - SearchLessThan(pattern, m, &sp, &ep); - - // Count end-markers in result interval - return CountEndmarkers(sp, ep); -} - -unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return 0; // No empty texts. 
- - TextPosition sp = 0, ep = 0; - SearchLessThan(pattern, m, &sp, &ep); - - // Count end-markers in result interval - return CountEndmarkers(sp, ep, begin, end); -} - -/** - * Document reporting queries - */ -TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all 1...k - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Iterate through end-markers in [sp,ep]: - return EnumerateEndmarkers(sp, ep); -} - -TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all 1...k - - TextPosition sp = 0, ep = 0; - Search(pattern, m, &sp, &ep); - - // Return end-markers in [sp,ep] and [begin, end]: - return EnumerateEndmarkers(sp, ep, begin, end); -} - -TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all 1...k - - TextPosition sp = 0, ep = 0; - // Search with end-marker - Search(pattern, m+1, &sp, &ep); - - TextCollection::document_result result; - result.reserve(ep-sp+1); // Try to avoid reallocation. 
- - // Check each occurrence - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i); - } - // Assert: c == '\0' OR sampled->IsBitSet(i) - - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - result.push_back(Doc->access(endmarkerRank)); - } - else // Sampled position - { - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - result.push_back(docId); - } - } - - return result; -} - -TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all 1...k - - TextPosition sp = 0, ep = 0; - // Search with end-marker - Search(pattern, m+1, &sp, &ep, begin, end); - - TextCollection::document_result result; - result.reserve(ep-sp+1); // Try to avoid reallocation. 
- - // Check each occurrence, already within [begin, end] - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - - uchar c = alphabetrank->access(i); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i); - } - // Assert: c == '\0' OR sampled->IsBitSet(i) - - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; - result.push_back(Doc->access(endmarkerRank)); - } - else // Sampled position - { - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - result.push_back(docId); - } - } - - return result; -} - - -TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all empty texts - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep); - - // Report end-markers in result interval - return EnumerateEndmarkers(sp, ep); -} - -TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char const *)pattern); - if (m == 0) - return TextCollection::document_result(); // FIXME Should return all empty texts - - TextPosition sp = 0, ep = 0; - // Match including end-marker - Search(pattern, m+1, &sp, &ep, begin, end); - - // Report end-markers in result interval - return EnumerateEndmarkers(sp, ep, begin, end); -} - - -TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); - - TextPosition sp = 0, ep = 0; - // Search all occurrences - Search(pattern, m, &sp, &ep); - - // We want unique document indentifiers, using std::set to collect them - std::set resultSet; - EnumerateDocuments(resultSet, sp, ep); - - // Convert std::set to std::vector - 
TextCollection::document_result result(resultSet.begin(), resultSet.end()); - return result; -} - -TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); - - TextPosition sp = 0, ep = 0; - // Search all occurrences - Search(pattern, m, &sp, &ep); - - // We want unique document indentifiers, using std::set to collect them - std::set resultSet; - EnumerateDocuments(resultSet, sp, ep, begin, end); - - // Convert std::set to std::vector - TextCollection::document_result result(resultSet.begin(), resultSet.end()); - return result; -} - - -/** - *** -* * FIXME Lessthan or equal - */ -TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // empty result set - - TextPosition sp = 0, ep = 0; - SearchLessThan(pattern, m, &sp, &ep); - - // Report end-markers in result interval - return EnumerateEndmarkers(sp, ep); -} - -TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // empty result set - - TextPosition sp = 0, ep = 0; - SearchLessThan(pattern, m, &sp, &ep); - - // Iterate through end-markers in [sp,ep] and [begin, end]: - return EnumerateEndmarkers(sp, ep, begin, end); -} - - -TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // empty result set - - suffix_range_vector ranges; - kmismatches(ranges, pattern, 0, n-1, m, k); - std::set resultSet; - - for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) - // Iterate through docs in 
[sp,ep]: - EnumerateDocuments(resultSet, (*it).first, (*it).second); - - // Convert std::set to std::vector - TextCollection::document_result result(resultSet.begin(), resultSet.end()); - return result; -} - -TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::document_result(); // empty result set - - suffix_range_vector ranges; - ulong *dd = new ulong[m+1]; - for (ulong i=0;i resultSet; - for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) - // Iterate through docs in [sp,ep]: - EnumerateDocuments(resultSet, (*it).first, (*it).second); - - // Convert std::set to std::vector - TextCollection::document_result result(resultSet.begin(), resultSet.end()); - return result; -} - - -/** - * Full result set queries - */ -TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return full_result(); // FIXME Throw exception? - - TextPosition sp = 0, ep = 0; - // Search all occurrences - Search(pattern, m, &sp, &ep); - - full_result result; - result.reserve(ep-sp+1); // Try to avoid reallocation. - EnumeratePositions(result, sp, ep); - - return result; -} - -TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return full_result(); // FIXME Throw exception? - - TextPosition sp = 0, ep = 0; - // Search all occurrences - Search(pattern, m, &sp, &ep); - - full_result result; - result.reserve(ep-sp+1); // Try to avoid reallocation. 
- EnumeratePositions(result, sp, ep, begin, end); - - return result; -} - -TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::full_result(); // empty result set - - suffix_range_vector ranges; - ulong count = kmismatches(ranges, pattern, 0, n-1, m, k); - - TextCollection::full_result result; - result.reserve(count); // avoid reallocation. - for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it) - // Iterate through docs in [sp,ep]: - EnumeratePositions(result, (*it).first, (*it).second); - return result; -} - -TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const -{ - TextPosition m = strlen((char *)pattern); - if (m == 0) - return TextCollection::full_result(); // empty result set - - suffix_range_vector ranges; - ulong *dd = new ulong[m+1]; - for (unsigned i=0;in), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TCImplementation::Save(): file write error (n)."); - if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Save(): file write error (samplerate)."); - - for(ulong i = 0; i < 256; ++i) - if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Save(): file write error (C table)."); - - if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position)."); - - alphabetrank->save(file); - sampled->save(file); - suffixes->Save(file); - suffixDocId->Save(file); - - if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts)."); - if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) - throw 
std::runtime_error("TCImplementation::Save(): file write error (maxTextLength)."); - - Doc->save(file); - textStorage->Save(file); - fflush(file); -} - - -/** - * Load index from a file handle - * - * Throws a std::runtime_error exception on i/o error. - * For more info, see TCImplementation::Save(). - * - * index_mode_t is defined in TextCollection.h and - * defaults to both the index and "naive" text. - * - * Note: Samplerate can not be changed during load. - */ -TCImplementation::TCImplementation(FILE *file, index_mode_t im, unsigned samplerate_) - : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), - suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0) -{ - uchar verFlag = 0; - if (std::fread(&verFlag, 1, 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (version flag)."); - if (verFlag != TCImplementation::versionFlag) - throw std::runtime_error("TCImplementation::Load(): invalid save file version."); - - if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (n)."); - if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (samplerate)."); -// FIXME samplerate can not be changed during load. 
-// if (this->samplerate == 0) -// this->samplerate = samplerate; - - for(ulong i = 0; i < 256; ++i) - if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (C table)."); - - if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position)."); - - alphabetrank = static_sequence::load(file); - if (im == index_mode_text_only) { delete alphabetrank; alphabetrank = 0; } - - sampled = static_bitsequence::load(file); - if (im == index_mode_text_only) { delete sampled; sampled = 0; } - suffixes = new BlockArray(file); - if (im == index_mode_text_only) { delete suffixes; suffixes = 0; } - suffixDocId = new BlockArray(file); - if (im == index_mode_text_only) { delete suffixDocId; suffixDocId = 0; } - - if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts)."); - if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); - - Doc = new ArrayDoc(file); //static_sequence::load(file); - if (im == index_mode_text_only) { delete Doc; Doc = 0; } - - textStorage = TextStorage::Load(file); - - // FIXME Construct data structures with new samplerate - //maketables(); -} - - - -/** - * Rest of the functions follow... 
- */ -ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const -{ - int c; - while (*sp<=*ep && i>=1) - { - c = (int)pattern[--i]; - *sp = C[c]+alphabetrank->rank(c,*sp-1); - *ep = C[c]+alphabetrank->rank(c,*ep)-1; - } - if (*sp<=*ep) - return *ep - *sp + 1; - else - return 0; -} - - -ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const -{ - if (sp>ep) return 0; - if (j == 0) - { - result.push_back(std::make_pair(sp,ep)); - return ep-sp+1; - } - int c; - ulong spnew; - ulong epnew; - int knew; - ulong sum=0; - if (k==0) - { - sum = searchPrefix(pattern, j, &sp, &ep); - if (sp<=ep) - result.push_back(std::make_pair(sp, ep)); - return sum; - } - vector chars = alphabetrank->accessAll(sp, ep); - for (vector::iterator it = chars.begin(); it != chars.end(); ++it) - { - if (*it == 0) - continue; // skip '\0' - c = *it; - spnew = C[c]+alphabetrank->rank(c,sp-1); - epnew = C[c]+alphabetrank->rank(c,ep)-1; - if (c!=pattern[j-1]) knew = (int)k-1; else knew = k; - if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew); - } - return sum; -} - -//first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i -ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const -{ - ulong sum=0; - if (d[m]<=k) // range of suffixes with at most k-errors found - { - if (sp<=ep) - result.push_back(std::make_pair(sp, ep)); - sum += (sp<=ep)?ep-sp+1:0ul; - } - if (sp>ep || j==0) - return sum; - ulong *dnew = new ulong[m+1]; - int c; - ulong spnew; - ulong p,lowerbound; - ulong epnew; - vector chars = alphabetrank->accessAll(sp, ep); - for (vector::iterator it = chars.begin(); it != chars.end(); ++it) - { - if (*it == 0) - continue; // skip '\0' - c = *it; - spnew = C[c]+alphabetrank->rank(c,sp-1); - epnew = C[c]+alphabetrank->rank(c,ep)-1; - if (spnew>epnew) continue; - 
dnew[0]=m+k-j+1; - lowerbound=k+1; - for (p=1; p<=m; p++) { - dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1)); - if (dnew[p]rank(c,i) - int c = (int)pattern[m-1]; - TextPosition i=m-1; - TextPosition sp = C[c]; - TextPosition ep = C[c+1]-1; - while (sp<=ep && i>=1) - { -// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); - c = (int)pattern[--i]; - sp = C[c]+alphabetrank->rank(c,sp-1); - ep = C[c]+alphabetrank->rank(c,ep)-1; - } - *spResult = sp; - *epResult = ep; - if (sp<=ep) - return ep - sp + 1; - else - return 0; -} - -ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const -{ - // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) - int c = (int)pattern[m-1]; - assert(c == 0); // Start from endmarkers - TextPosition i=m-1; - TextPosition sp = begin; - TextPosition ep = end; - while (sp<=ep && i>=1) - { -// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); - c = (int)pattern[--i]; - sp = C[c]+alphabetrank->rank(c,sp-1); - ep = C[c]+alphabetrank->rank(c,ep)-1; - } - *spResult = sp; - *epResult = ep; - if (sp<=ep) - return ep - sp + 1; - else - return 0; -} - - -ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const -{ - // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) - uint c = (int)pattern[m-1]; - TextPosition i=m-1; - TextPosition sp = 1; - TextPosition ep = C[c+1]-1; - while (sp<=ep && i>=1) - { -// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); - c = (int)pattern[--i]; - uint result = alphabetrank->rankLessThan(c,ep); - if (result == ~0u) - ep = 0; - else - ep = C[c]+result-1; - } - *spResult = sp; - *epResult = ep; - if (sp<=ep) - return ep - sp + 1; - else - return 0; -} - - -TCImplementation::~TCImplementation() { - delete 
alphabetrank; - delete sampled; - delete suffixes; - delete suffixDocId; - delete Doc; - delete textStorage; -} - -void TCImplementation::makewavelet(uchar *bwt) -{ - ulong i, min = 0, - max; - for (i=0;i<256;i++) - C[i]=0; - for (i=0;i0) {min = i; break;} - for (i=255;i>=min;--i) - if (C[i]>0) {max = i; break;} - - ulong prev=C[0], temp; - C[0]=0; - for (i=1;i<256;i++) { - temp = C[i]; - C[i]=C[i-1]+prev; - prev = temp; - } -// this->codetable = node::makecodetable(bwt,n); -// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); -// delete [] bwt; - //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! -// std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - -#ifdef DEBUG_MEMUSAGE - std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate? 
- wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape - alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am); - delete bmb; - bwt = 0; // already deleted - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; -#endif -} - -void TCImplementation::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText) -{ - // Calculate BWT end-marker position (of last inserted text) - { - ulong i = 0; - uint alphabetrank_i_tmp = 0; - uchar c = alphabetrank->access(i, alphabetrank_i_tmp); - while (c != '\0') - { - i = C[c]+alphabetrank_i_tmp-1; - c = alphabetrank->access(i, alphabetrank_i_tmp); - } - - this->bwtEndPos = i; - } - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - // Build up array for text starting positions -// BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); -// (*textStartPos)[0] = 0; - - // Mapping from end-markers to doc ID's: - unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts); -// uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1]; - BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts); - - BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n)); - uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1]; - for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++) - sampledpositions[i] = 0; - - ulong x,p=bwtEndPos; - ulong sampleCount = 0; - // Keeping 
track of text position of prev. end-marker seen - ulong posOfSuccEndmarker = n-1; - DocId textId = numberOfTexts; - ulong ulongmax = 0; - ulongmax--; - uint alphabetrank_i_tmp =0; - - // Text length = n + number of bytes not indexed. - TextStorageBuilder tsbuilder(n + niText.length()); - ulong tsb_i = n + niText.length(); // Iterator from text length to 0. - string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts - - for (ulong i=n-1;iGetPos(i) - x=(i==n-1)?0:i+1; - - uchar c = alphabetrank->access(p, alphabetrank_i_tmp); - - tsbuilder[--tsb_i] = c; // Build TextStorage - - if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0') - { - set_field(sampledpositions,1,p,1); - (*positions)[sampleCount] = p; - sampleCount ++; - } - - if (c == '\0') - { - unsigned prevTextId = textId; // Cache textId value. - --textId; - /** - * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus, - * we have to search for the first text that is actually *indexed* - * to get correct prevTextId. - */ - if (prevTextId == numberOfTexts) - { - prevTextId = 0; - while (notIndexed.isSet(prevTextId)) - ++ prevTextId; - // Now prevTextId points to the first indexed Doc ID. - } - - /** - * Insert non-indexed texts - */ - while (notIndexed.isSet(textId)) - { - do { - tsbuilder[tsb_i] = *nit_i; - -- tsb_i; - ++ nit_i; - } while (nit_i != niText.rend() && *nit_i != '\0'); - - tsbuilder[tsb_i] = '\0'; - - if (textId == 0) - break; - --textId; - } - - // Record the order of end-markers in BWT: - ulong endmarkerRank = alphabetrank_i_tmp - 1; - //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts); - (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts; - - // Store text length and text start position: - if (textId < (DocId)numberOfTexts - 1) - { -// (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker. 
- - posOfSuccEndmarker = i; - } - - // LF-mapping from '\0' does not work with this (pseudo) BWT. - // Correct LF-mapping to the last char of the previous text: - p = textId - notIndexed.rank(textId); - } - else // Now c != '\0', do LF-mapping: - p = C[c]+alphabetrank_i_tmp-1; - } - while (textId > 0 && notIndexed.isSet(textId-1)) - { - do { - -- tsb_i; - tsbuilder[tsb_i] = *nit_i; - ++ nit_i; - } while (nit_i != niText.rend() && *nit_i != '\0'); - --textId; - } - assert(textId == 0); - assert(tsb_i == 0); - assert(nit_i == niText.rend()); - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - textStorage = tsbuilder.InitTextStorage(tsType); - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - sampled = new static_bitsequence_rrr02(sampledpositions, n, 16); - delete [] sampledpositions; - assert(sampleCount == sampleLength); - assert(sampled->rank1(n-1) == sampleLength); - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - // Suffixes store an offset from the text start position - suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); - 
suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); - - x = n + niText.length() - 2; - textId = numberOfTexts - 1; - posOfSuccEndmarker = x + 1; - for(ulong i = 0; i < sampleLength; i ++) { - // Find next sampled text position - while ((posOfSuccEndmarker - x) % samplerate != 0 - || notIndexed.isSet(textId)) // Loop over non-indexed - { - --x; - assert(x != ~0lu); - if (textStorage->IsEndmarker(x)) - { - posOfSuccEndmarker = x--; - -- textId; - } - } - assert((*positions)[i] < n); - ulong j = sampled->rank1((*positions)[i]); - - assert(j != 0); // if (j==0) j=sampleLength; - - TextPosition textPos = (x==n-1)?0:x+1; - (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos); - assert(textStorage->DocIdAtTextPos(textPos) == textId); - - assert((*suffixDocId)[j-1] < numberOfTexts); - // calculate offset from text start: - (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]); - --x; - if (x != ~0lu && textStorage->IsEndmarker(x)) - { - posOfSuccEndmarker = x--; - -- textId; - } - } - - delete positions; - -#ifdef DEBUG_MEMUSAGE - std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - -#ifdef DEBUG_MEMUSAGE - std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - HeapProfiler::ResetMaxHeapConsumption(); -#endif - - /*alphabet_mapper * am = new alphabet_mapper_none(); - static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate? - Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true); - delete bmb;*/ - // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs! 
- - Doc = new ArrayDoc(endmarkerDocId); - -#ifdef DEBUG_MEMUSAGE - std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; -#endif -} - - -/** - * Finds document identifier for given text position - * - * Starting text position of the document is stored into second parameter. - * Binary searching on text starting positions. - */ -TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const -{ - assert(i < n); - - DocId a = 0; - DocId b = numberOfTexts - 1; - while (a < b) - { - DocId c = a + (b - a)/2; - if ((*textStartPos)[c] > i) - b = c - 1; - else if ((*textStartPos)[c+1] > i) - return c; - else - a = c + 1; - } - - assert(a < (DocId)numberOfTexts); - assert(i >= (*textStartPos)[a]); - assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); - return a; -} - - -} // namespace SXSI - diff --git a/TCImplementation.h b/TCImplementation.h deleted file mode 100644 index ff5dff3..0000000 --- a/TCImplementation.h +++ /dev/null @@ -1,386 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * - * * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. 
* - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - *****************************************************************************/ - -#ifndef _TCImplementation_H_ -#define _TCImplementation_H_ - -#include "incbwt/bits/deltavector.h" - -#include "BitRank.h" -#include "TextCollection.h" -#include "BlockArray.h" - -// Include from XMLTree/libcds -#include // Defines W == 32 -#include -#include -#include -#include - -// Re-define word size to ulong: -#undef W -#if __WORDSIZE == 64 -# define W 64 -#else -# define W 32 -#endif -#undef bitset -#undef bitget - - -#include "TextStorage.h" -#include "ArrayDoc.h" -#include -#include - -namespace SXSI -{ - - -/** - * Implementation of the TextCollection interface - * - */ -class TCImplementation : public SXSI::TextCollection { -public: - TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, - CSA::DeltaVector &, const std::string &, char); - ~TCImplementation(); - - bool EmptyText(DocId) const; - - /** - * Extracting one text. - * - * Call DeleteText() for each pointer returned by GetText() - * to avoid possible memory leaks. - */ - uchar * GetText(DocId) const; - void DeleteText(uchar *text) const - { textStorage->DeleteText(text); } - - /** - * Returns a pointer to the beginning of texts i, i+1, ..., j. - * Texts are separated by a '\0' byte. - * - * Call DeleteText() for each pointer returned by GetText() - * to avoid possible memory leaks. - */ - uchar * GetText(DocId i, DocId j) const - { return textStorage->GetText(i, j); } - - /** - * Returns a substring of given text ID. - * - * FIXME This may be reimplemented via TextStorage. 
- */ -// uchar* GetText(DocId, TextPosition, TextPosition) const; - - bool IsPrefix(uchar const *) const; - bool IsSuffix(uchar const *) const; - bool IsEqual(uchar const *) const; - bool IsContains(uchar const *) const; - bool IsLessThan(uchar const *) const; - - bool IsPrefix(uchar const *, DocId, DocId) const; - bool IsSuffix(uchar const *, DocId, DocId) const; - bool IsEqual(uchar const *, DocId, DocId) const; - bool IsContains(uchar const *, DocId, DocId) const; - bool IsLessThan(uchar const *, DocId, DocId) const; - - ulong Count(uchar const *) const; - unsigned CountPrefix(uchar const *) const; - unsigned CountSuffix(uchar const *) const; - unsigned CountEqual(uchar const *) const; - unsigned CountContains(uchar const *) const; - unsigned CountLessThan(const unsigned char*) const; - - unsigned CountPrefix(uchar const *, DocId, DocId) const; - unsigned CountSuffix(uchar const *, DocId, DocId) const; - unsigned CountEqual(uchar const *, DocId, DocId) const; - unsigned CountContains(uchar const *, DocId, DocId) const; - unsigned CountLessThan(uchar const *, DocId, DocId) const; - - // Definition of document_result is inherited from SXSI::TextCollection. - document_result Prefix(uchar const *) const; - document_result Suffix(uchar const *) const; - document_result Equal(uchar const *) const; - document_result Contains(uchar const *) const; - document_result LessThan(uchar const *) const; - document_result KMismaches(uchar const *, unsigned) const; - document_result KErrors(uchar const *, unsigned) const; - - document_result Prefix(uchar const *, DocId, DocId) const; - document_result Suffix(uchar const *, DocId, DocId) const; - document_result Equal(uchar const *, DocId, DocId) const; - document_result Contains(uchar const *, DocId, DocId) const; - document_result LessThan(uchar const *, DocId, DocId) const; - - // Definition of full_result is inherited from SXSI::TextCollection. 
- full_result FullContains(uchar const *) const; - full_result FullContains(uchar const *, DocId, DocId) const; - full_result FullKMismatches(uchar const *, unsigned) const; - full_result FullKErrors(uchar const *, unsigned) const; - - // Index from/to disk - TCImplementation(FILE *, index_mode_t, unsigned); - void Save(FILE *) const; - -private: - typedef std::vector > suffix_range_vector; - - static const uchar versionFlag; - TextPosition n; - unsigned samplerate; - unsigned C[256]; - TextPosition bwtEndPos; - static_sequence * alphabetrank; - - // Sample structures for texts longer than samplerate - static_bitsequence * sampled; - BlockArray *suffixes; - BlockArray *suffixDocId; - - // Total number of texts in the collection - unsigned numberOfTexts; - // Length of the longest text - ulong maxTextLength; - - // Array of document id's in the order of end-markers in BWT -// static_sequence *Doc; - ArrayDoc *Doc; - - // Text storage for fast extraction - TextStorage * textStorage; - - // Following methods are not part of the public API - uchar * BWT(uchar *); - void makewavelet(uchar *); - void maketables(ulong, char, CSA::DeltaVector &, const std::string &); - DocId DocIdAtTextPos(BlockArray*, TextPosition) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; - ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const; - ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const; - ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; - /** - * Count end-markers in given interval - */ - inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const - { - if (sp > ep) - return 0; - - ulong ranksp = 0; - if (sp != 0) - ranksp 
= alphabetrank->rank(0, sp - 1); - - return alphabetrank->rank(0, ep) - ranksp; - } - - /** - * Count end-markers in given interval and - * within docIds [min,max] - */ - inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return 0; - - return Doc->count(sp, ep-1, min, max); - } - - /** - * Enumerate all end-markers in given interval - */ - inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return document_result(); - - return Doc->accessAll(sp, ep-1); - } - - /** - * Enumerate end-markers in given interval and - * within docIds [min,max] - */ - inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return document_result(); - - return Doc->access(sp, ep-1, min, max); - } - - /** - * Enumerate documents in given interval [sp, ep] - */ - inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep) const - { - // We want unique document indentifiers, using std::set to collect them - // FIXME use unordered_set? - uint tmp_rank_c = 0; // Cache rank value of c. 
- for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - resultSet.insert(Doc->access(endmarkerRank)); - } - else - { - DocId di = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)di < numberOfTexts); - resultSet.insert(di); - } - } - } - - /** - * Enumerate documents in given interval [sp, ep] - * and within [begin, end] - */ - inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const - { - // We want unique document indentifiers, using std::set to collect them - uint tmp_rank_c = 0; // Cache rank value of c. - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - else - { - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)docId < numberOfTexts); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - } - } - - /** - * Enumerate document+position pairs (full_result) of - * each suffix in given interval. - */ - inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const - { - uint tmp_rank_c = 0; // Cache rank value of c. 
- for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - result.push_back(std::make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - - result.push_back(std::make_pair(docId, textPos)); - } - } - } - - /** - * Enumerate document+position pairs (full_result) of - * each suffix in given interval and within [begin, end]. - */ - inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const - { - uint tmp_rank_c = 0; // Cache rank value of c. 
- for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - result.push_back(std::make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - - if (docId >= begin && docId <= end) - result.push_back(std::make_pair(docId, textPos)); - } - } - } - -}; // class TCImplementation - -} // namespace SXSI - -#endif diff --git a/TextCollection.cpp b/TextCollection.cpp index 2aef067..1755b6b 100644 --- a/TextCollection.cpp +++ b/TextCollection.cpp @@ -1,5 +1,6 @@ #include "TextCollection.h" -#include "TCImplementation.h" +#include "FMIndex.h" +#include "SWCSAWrapper.h" namespace SXSI { @@ -9,9 +10,22 @@ namespace SXSI * * See TCImplementation.h for more details. */ - TextCollection * TextCollection::Load(FILE *fp, index_mode_t im, unsigned samplerate) + TextCollection * TextCollection::Load(FILE *fp, char const *filename, index_mode_t im, unsigned samplerate) { - TextCollection *result = new TCImplementation(fp, im, samplerate); - return result; + char type = 0; + if (std::fread(&type, 1, 1, fp) != 1) + throw std::runtime_error("TextCollection::Load(): file read error (type flag)."); + switch (type) + { + case 'F': + return new FMIndex(fp, im, samplerate); + break; + case 'W': + return new SWCSAWrapper(fp, filename); + break; + } + + std::cerr << "TextCollection::Load(): invalid save file version or corrupted input file." 
<< std::endl; + std::exit(1); } } diff --git a/TextCollection.h b/TextCollection.h index 2ea24ca..9c6591f 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -48,19 +48,25 @@ namespace SXSI /** * Load from a file * + * The second parameter is a prefix to be used for multiple + * files. (SWCSAWrapper uses multiple save files!) + * * New samplerate can be given, otherwise will use the one specified in the save file! * * Throws an exception if std::fread() fails. * */ - static TextCollection* Load(FILE *, index_mode_t = index_mode_default, unsigned samplerate = 0); + static TextCollection* Load(FILE *, char const *, index_mode_t = index_mode_default, unsigned samplerate = 0); /** * Save data structure into a file - * + * + * The second parameter is a prefix to be used for multiple + * files. (SWCSAWrapper uses multiple save files!) + * * Throws an exception if std::fwrite() fails. */ - virtual void Save(FILE *) const = 0; + virtual void Save(FILE *, char const *) const = 0; /** * Virtual destructor diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp index 552abb1..6137406 100644 --- a/TextCollectionBuilder.cpp +++ b/TextCollectionBuilder.cpp @@ -1,146 +1,24 @@ -#include "incbwt/rlcsa_builder.h" -#include "incbwt/bits/deltavector.h" - #include "TextCollectionBuilder.h" -#include "TCImplementation.h" - -using std::string; +#include "FMIndexBuilder.h" +#include "SWCSABuilder.h" namespace SXSI { - -struct TCBuilderRep -{ - unsigned samplerate; - CSA::RLCSABuilder * sa; - - ulong n; - // Total number of texts in the collection - unsigned numberOfTexts; - // Length of the longest text - ulong maxTextLength; - ulong numberOfSamples; - - CSA::DeltaEncoder *notIndexed; // Doc IDs of those texts that are excluded from index. - string niText; // Texts that are not indexed. 
- -#ifdef TCB_TEST_BWT - DynFMI *dynFMI; -#endif -}; - -/** - * Init text collection - * - */ -TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength) - : p_(new struct TCBuilderRep()) -{ - p_->n = 0; - p_->samplerate = samplerate; - p_->numberOfTexts = 0; - p_->numberOfSamples = 0; - p_->maxTextLength = 0; - p_->notIndexed = new CSA::DeltaEncoder(32); // Block size of 32 - p_->niText = ""; - - // Current params: 8 bytes, no samples, buffer size n/10 bytes. - // Buffer size is always at least 15MB: - if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH) - estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH; - p_->sa = new CSA::RLCSABuilder(8, 0, estimatedInputLength/10); - assert(p_->sa->isOk()); - -#ifdef TCB_TEST_BWT - uchar temp[256]; - for (unsigned i = 0; i < 255; ++i) - temp[i] = i+1; - temp[255] = 0; - p_->dynFMI = new DynFMI(temp, 1, 255, false); -#endif -} - -TextCollectionBuilder::~TextCollectionBuilder() -{ -#ifdef TCB_TEST_BWT - delete p_->dynFMI; -#endif - - delete p_->sa; - delete p_->notIndexed; - delete p_; -} - -void TextCollectionBuilder::InsertText(uchar const * text, bool index) -{ - TextCollection::TextPosition m = std::strlen((char *)text) + 1; - if (m <= 1) - { - // FIXME indexing empty texts - std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; - exit(1); - } - - p_->numberOfTexts ++; - - if (index) - { - /** - * Insert text into the index - */ - p_->n += m; - p_->numberOfSamples += (m-1)/p_->samplerate; - - if (m > p_->maxTextLength) - p_->maxTextLength = m; // Store length of the longest text seen so far. 
- - p_->sa->insertSequence((char*)text, m-1, 0); - assert(p_->sa->isOk()); - } - else - { - /** - * Insert text only to TextStorage - */ - p_->notIndexed->setBit(p_->numberOfTexts - 1); - p_->niText.append((const char *)text, m); - } -} - - -TextCollection * TextCollectionBuilder::InitTextCollection(char type) +TextCollectionBuilder* TextCollectionBuilder::create(unsigned samplerate, + index_type_t type, + ulong estimatedInputLength) { - uchar * bwt = 0; - CSA::usint length = 0; - if (p_->numberOfTexts == 0) - { - p_->numberOfTexts ++; // Add one empty text - bwt = new uchar[2]; - bwt[0] = '\0'; - bwt[1] = '\0'; - length = 1; - p_->maxTextLength = 1; - } - else + switch (type) { - bwt = (uchar *)p_->sa->getBWT(length); - delete p_->sa; - p_->sa = 0; - - assert(length == p_->n); + case index_type_default: + return new FMIndexBuilder(samplerate, estimatedInputLength); + break; + case index_type_swcsa: + return new SWCSABuilder(samplerate); + break; } - - p_->notIndexed->setBit(p_->numberOfTexts); // FIXME CSA::DeltaVector can not be all 0's - CSA::DeltaVector deltav = CSA::DeltaVector(*p_->notIndexed, p_->numberOfTexts+1); - delete p_->notIndexed; - p_->notIndexed = 0; - - TextCollection *result = new TCImplementation(bwt, (ulong)length, - p_->samplerate, p_->numberOfTexts, p_->maxTextLength, p_->numberOfSamples, - deltav, p_->niText, type); - - return result; + std::cerr << "TextCollectionBuilder::create(): unknown type given: expecting enum value, type = " << type << std::endl; + std::exit(2); } - -} // namespace SXSI +} // Namespace SXSI diff --git a/TextCollectionBuilder.h b/TextCollectionBuilder.h index 6b3819a..43d4204 100644 --- a/TextCollectionBuilder.h +++ b/TextCollectionBuilder.h @@ -23,18 +23,10 @@ #include "TextCollection.h" #include "TextStorage.h" -#include "Tools.h" // Defines ulong and uchar. - -#include -#include -#include // Defines std::pair. 
-#include // Defines std::strlen, added by Kim - -// Un-comment to compare BWT against a BWT generated from class dynFMI: -//#define TCB_TEST_BWT +#include "Tools.h" // Default samplerate for suffix array samples -#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 +#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 32 // Default input length, used to calculate the buffer size. #define TEXTCOLLECTION_DEFAULT_INPUT_LENGTH (150 * 1024 * 1024) @@ -42,17 +34,23 @@ namespace SXSI { - struct TCBuilderRep; // Pimpl - /** - * Build an instance of the TextCollection class. + * Builder for an instance of the TextCollection class. */ class TextCollectionBuilder { public: - explicit TextCollectionBuilder(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, - ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); - ~TextCollectionBuilder(); + // Index type defaults to FM-index. + // SWCSA can be used for natural language inputs. + // NB: Current SWCSA uses a lot of memory during construction! + enum index_type_t { index_type_default, index_type_swcsa }; + + static TextCollectionBuilder* create(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE, + index_type_t type = index_type_default, + ulong estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH); + + + virtual ~TextCollectionBuilder() { }; /** * Insert text @@ -66,7 +64,7 @@ namespace SXSI * index also. If false, text is added only to the TextCollection * and can not be searched for. */ - void InsertText(uchar const *, bool index = true); + virtual void InsertText(uchar const *, bool index = true) = 0; /** * Make static * @@ -76,15 +74,18 @@ namespace SXSI * TextStorage type defaults to TYPE_PLAIN_TEXT, another * possible type is TYPE_LZ_INDEX. */ - TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT); + virtual TextCollection * InitTextCollection(char type = TextStorage::TYPE_PLAIN_TEXT) = 0; - private: - // Using Pimpl idiom to hide RLCSA implementation. 
- struct TCBuilderRep * p_; + protected: + // Protected constructor; use the static method TextCollectionBuilder::create() + TextCollectionBuilder() { }; + private: // No copy constructor or assignment TextCollectionBuilder(TextCollectionBuilder const&); TextCollectionBuilder& operator = (TextCollectionBuilder const&); }; + } + #endif diff --git a/TextStorage.h b/TextStorage.h index 24633fd..6294c6b 100644 --- a/TextStorage.h +++ b/TextStorage.h @@ -23,7 +23,16 @@ #include "TextCollection.h" #include "Tools.h" + #include "incbwt/bits/deltavector.h" +// Re-define word size to ulong: +#undef W +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + #include #include diff --git a/dependencies.mk b/dependencies.mk index c688689..24c4db6 100644 --- a/dependencies.mk +++ b/dependencies.mk @@ -1,39 +1,48 @@ BitRank.o: BitRank.cpp BitRank.h BlockArray.h Tools.h +FMIndex.o: FMIndex.cpp FMIndex.h incbwt/bits/deltavector.h \ + incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ + incbwt/bits/bitbuffer.h BitRank.h BlockArray.h Tools.h TextCollection.h \ + TextStorage.h ArrayDoc.h +FMIndexBuilder.o: FMIndexBuilder.cpp incbwt/rlcsa_builder.h \ + incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \ + incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ + incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \ + incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \ + incbwt/bits/deltavector.h incbwt/misc/parameters.h \ + incbwt/misc/definitions.h incbwt/bits/deltavector.h FMIndexBuilder.h \ + TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h FMIndex.h \ + BitRank.h BlockArray.h ArrayDoc.h HeapProfiler.o: HeapProfiler.cpp HeapProfiler.h -TCImplementation.o: TCImplementation.cpp TCImplementation.h BitRank.h \ - BlockArray.h Tools.h TextCollection.h TextStorage.h \ - incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ - incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h -TextCollection.o: TextCollection.cpp 
TextCollection.h Tools.h \ - TCImplementation.h BitRank.h BlockArray.h TextStorage.h \ - incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ - incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h -TextCollectionBuilder.o: TextCollectionBuilder.cpp incbwt/rlcsa_builder.h \ - incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \ - incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ - incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \ - incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \ - incbwt/bits/deltavector.h incbwt/misc/parameters.h \ - incbwt/misc/definitions.h TextCollectionBuilder.h TextCollection.h \ - Tools.h TCImplementation.h BitRank.h BlockArray.h TextStorage.h \ - incbwt/bits/deltavector.h +TextCollection.o: TextCollection.cpp TextCollection.h Tools.h FMIndex.h \ + incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h BitRank.h \ + BlockArray.h TextStorage.h ArrayDoc.h SWCSAWrapper.h \ + swcsa/utils/defValues.h swcsa/utils/valstring.h swcsa/interface.h +TextCollectionBuilder.o: TextCollectionBuilder.cpp \ + TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h \ + incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \ + FMIndexBuilder.h SWCSABuilder.h SWCSAWrapper.h swcsa/utils/defValues.h \ + swcsa/utils/valstring.h swcsa/interface.h +TextStorage.o: TextStorage.cpp TextStorage.h TextCollection.h Tools.h \ + incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \ + lzindex/lztrie.h lzindex/basics.h lzindex/trie.h lzindex/heap.h \ + lzindex/parentheses.h lzindex/bitmap.h lzindex/hash.h lzindex/nodemap.h \ + lzindex/position.h Tools.o: Tools.cpp Tools.h bittree.o: bittree.cpp bittree.h rbtree.h Tools.h dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h Tools.h rbtree.o: rbtree.cpp rbtree.h -test2dRange.o: test2dRange.cpp 
testTextCollection.o: testTextCollection.cpp TextCollectionBuilder.h \ - TextCollection.h Tools.h HeapProfiler.h -testTextCollection2.o: testTextCollection2.cpp TextCollection.h Tools.h \ - HeapProfiler.h -testTextCollection3.o: testTextCollection3.cpp TextCollectionBuilder.h \ - TextCollection.h Tools.h -testTextCollection4.o: testTextCollection4.cpp TCImplementation.h \ - BitRank.h BlockArray.h Tools.h TextCollection.h TextStorage.h \ - incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ - incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h \ - HeapProfiler.h + TextCollection.h Tools.h TextStorage.h incbwt/bits/deltavector.h \ + incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ + incbwt/bits/bitbuffer.h HeapProfiler.h testTextCollection5.o: testTextCollection5.cpp HeapProfiler.h \ - TextCollectionBuilder.h TextCollection.h Tools.h + TextCollectionBuilder.h TextCollection.h Tools.h TextStorage.h \ + incbwt/bits/deltavector.h incbwt/bits/bitvector.h \ + incbwt/bits/../misc/definitions.h incbwt/bits/bitbuffer.h timeTextCollection.o: timeTextCollection.cpp TextCollectionBuilder.h \ - TextCollection.h Tools.h + TextCollection.h Tools.h TextStorage.h incbwt/bits/deltavector.h \ + incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ + incbwt/bits/bitbuffer.h diff --git a/incbwt/dependencies.mk b/incbwt/dependencies.mk index 49ec47b..415788b 100644 --- a/incbwt/dependencies.mk +++ b/incbwt/dependencies.mk @@ -1,27 +1,26 @@ rlcsa.o: rlcsa.cpp rlcsa.h bits/vectors.h bits/deltavector.h \ - bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ - bits/rlevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ - bits/deltavector.h misc/parameters.h misc/definitions.h misc/utils.h \ - qsufsort/qsufsort.h qsufsort/../misc/definitions.h + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ + bits/deltavector.h misc/parameters.h misc/definitions.h misc/utils.h \ + 
qsufsort/qsufsort.h qsufsort/../misc/definitions.h rlcsa_builder.o: rlcsa_builder.cpp rlcsa_builder.h rlcsa.h bits/vectors.h \ - bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \ - bits/bitbuffer.h bits/rlevector.h sasamples.h misc/definitions.h \ - bits/bitbuffer.h bits/deltavector.h misc/parameters.h \ - misc/definitions.h + bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \ + bits/bitbuffer.h bits/rlevector.h sasamples.h misc/definitions.h \ + bits/bitbuffer.h bits/deltavector.h misc/parameters.h misc/definitions.h sasamples.o: sasamples.cpp sasamples.h misc/definitions.h \ - bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \ - bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h + bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \ + bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h bitvector.o: bits/bitvector.cpp bits/bitvector.h \ - bits/../misc/definitions.h bits/bitbuffer.h + bits/../misc/definitions.h bits/bitbuffer.h deltavector.o: bits/deltavector.cpp bits/deltavector.h bits/bitvector.h \ - bits/../misc/definitions.h bits/bitbuffer.h + bits/../misc/definitions.h bits/bitbuffer.h rlevector.o: bits/rlevector.cpp bits/rlevector.h bits/bitvector.h \ - bits/../misc/definitions.h bits/bitbuffer.h bits/../misc/utils.h \ - bits/../misc/definitions.h + bits/../misc/definitions.h bits/bitbuffer.h bits/../misc/utils.h \ + bits/../misc/definitions.h vectors.o: bits/vectors.cpp bits/vectors.h bits/deltavector.h \ - bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ - bits/rlevector.h bits/../misc/utils.h bits/../misc/definitions.h + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h bits/../misc/utils.h bits/../misc/definitions.h parameters.o: misc/parameters.cpp misc/parameters.h misc/definitions.h utils.o: misc/utils.cpp misc/utils.h misc/definitions.h qsufsort.o: qsufsort/qsufsort.c qsufsort/qsufsort.h \ - qsufsort/../misc/definitions.h + 
qsufsort/../misc/definitions.h diff --git a/makefile b/makefile index f12c965..6a8e159 100644 --- a/makefile +++ b/makefile @@ -8,7 +8,7 @@ LIBSWCSA = swcsa/swcsa.a dcover_obs = dcover/difference_cover.o -TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \ +TextCollection_obs = TextCollection.o TextCollectionBuilder.o FMIndexBuilder.o FMIndex.o Tools.o \ TextStorage.o ${LIBRLCSA} ${LIBCDSA} ${LIBLZTRIE} ${LIBSWCSA} TCDebug_obs = bittree.o rbtree.o dynFMI.o diff --git a/testTextCollection.cpp b/testTextCollection.cpp index 82bf79e..a19d2b2 100644 --- a/testTextCollection.cpp +++ b/testTextCollection.cpp @@ -40,7 +40,7 @@ int main() int i = 0 ,j = 0; int heap_base = HeapProfiler::GetHeapConsumption(); std::cerr << "Initial heap usage : " << heap_base << "\n"; - TextCollectionBuilder *tcb = new TextCollectionBuilder(5); + TextCollectionBuilder *tcb = TextCollectionBuilder::create(5); heap_base = HeapProfiler::GetHeapConsumption (); std::cerr << "Heap usage after InitTextCollection : " << heap_base << "\n"; Tools::StartTimer();