From 40ddf9aca842bdc081b6350a4ebfe36b066c94c9 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Mon, 23 Mar 2009 15:20:38 +0000 Subject: [PATCH] Jouni's Incremental BWT integrated into TextCollection git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@271 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- HeapProfiler.cpp | 3 + HeapProfiler.h | 1 + CSA.cpp => TCImplementation.cpp | 506 +++---------------- CSA.h => TCImplementation.h | 205 +------- TextCollection.cpp | 10 +- TextCollection.h | 36 +- TextCollectionBuilder.cpp | 153 ++++++ TextCollectionBuilder.h | 68 +++ dependencies.mk | 28 +- incbwt/Makefile | 25 + incbwt/bits/bitbuffer.h | 486 ++++++++++++++++++ incbwt/bits/bitvector.cpp | 270 ++++++++++ incbwt/bits/bitvector.h | 170 +++++++ incbwt/bits/deltavector.cpp | 174 +++++++ incbwt/bits/deltavector.h | 62 +++ incbwt/bits/rlevector.cpp | 247 ++++++++++ incbwt/bits/rlevector.h | 100 ++++ incbwt/bits/vectors.cpp | 214 ++++++++ incbwt/bits/vectors.h | 25 + incbwt/dependencies.mk | 27 + incbwt/misc/definitions.h | 67 +++ incbwt/misc/parameters.cpp | 108 ++++ incbwt/misc/parameters.h | 43 ++ incbwt/misc/utils.cpp | 55 +++ incbwt/misc/utils.h | 27 + incbwt/qsufsort/qsufsort.c | 315 ++++++++++++ incbwt/qsufsort/qsufsort.h | 17 + incbwt/rlcsa.cpp | 839 ++++++++++++++++++++++++++++++++ incbwt/rlcsa.h | 138 ++++++ incbwt/rlcsa_builder.cpp | 199 ++++++++ incbwt/rlcsa_builder.h | 58 +++ incbwt/sasamples.cpp | 188 +++++++ incbwt/sasamples.h | 69 +++ makefile | 22 +- testTextCollection.cpp | 49 +- timeTextCollection.cpp | 17 +- 36 files changed, 4332 insertions(+), 689 deletions(-) rename CSA.cpp => TCImplementation.cpp (70%) rename CSA.h => TCImplementation.h (51%) create mode 100644 TextCollectionBuilder.cpp create mode 100644 TextCollectionBuilder.h create mode 100644 incbwt/Makefile create mode 100644 incbwt/bits/bitbuffer.h create mode 100644 incbwt/bits/bitvector.cpp create mode 100644 incbwt/bits/bitvector.h create mode 100644 incbwt/bits/deltavector.cpp create mode 100644 incbwt/bits/deltavector.h create mode 100644 incbwt/bits/rlevector.cpp create mode 100644 incbwt/bits/rlevector.h create mode 100644 incbwt/bits/vectors.cpp create mode 100644 incbwt/bits/vectors.h create mode 100644 incbwt/dependencies.mk create mode 100644 incbwt/misc/definitions.h create mode 100644 incbwt/misc/parameters.cpp create mode 100644 incbwt/misc/parameters.h create mode 100644 incbwt/misc/utils.cpp create mode 100644 incbwt/misc/utils.h create mode 100644 incbwt/qsufsort/qsufsort.c create mode 100644 incbwt/qsufsort/qsufsort.h create mode 100644 incbwt/rlcsa.cpp create mode 100644 incbwt/rlcsa.h create mode 100644 incbwt/rlcsa_builder.cpp create mode 100644 incbwt/rlcsa_builder.h create mode 100644 incbwt/sasamples.cpp create mode 100644 incbwt/sasamples.h diff --git a/HeapProfiler.cpp b/HeapProfiler.cpp index d3c46f8..348fee6 100644 --- a/HeapProfiler.cpp +++ b/HeapProfiler.cpp @@ -37,6 +37,9 @@ void *(*HeapProfiler::old_realloc_hook)(void *, size_t, const void *); unsigned long HeapProfiler::GetMaxHeapConsumption() { return maxConsumption; } +void HeapProfiler::ResetMaxHeapConsumption() { + maxConsumption = 0; +} unsigned long HeapProfiler::GetHeapConsumption() { return consumption; diff --git a/HeapProfiler.h b/HeapProfiler.h index e1c0378..ff34ee8 100644 --- a/HeapProfiler.h +++ b/HeapProfiler.h @@ -35,6 +35,7 @@ class HeapProfiler { public: static unsigned long GetMaxHeapConsumption(); + static void ResetMaxHeapConsumption(); static unsigned long GetHeapConsumption(); // Prototypes for hooks diff --git a/CSA.cpp b/TCImplementation.cpp similarity index 70% rename from CSA.cpp rename to TCImplementation.cpp index 3f75433..290ca9c 100644 --- a/CSA.cpp +++ b/TCImplementation.cpp @@ -17,18 +17,12 @@ * Free Software Foundation, Inc., * * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * *****************************************************************************/ -#include "CSA.h" -#include "TextCollection.h" - -// Conflicts with std::min and std::max -#undef min -#undef max -#include "dcover/bwt.hpp" +#include "TCImplementation.h" //#include "HeapProfiler.h" // FIXME remove #include -#include +#include #include #include #include @@ -44,274 +38,28 @@ namespace SXSI { // Save file version info -const uchar CSA::versionFlag = 2; - -//////////////////////////////////////////////////////////////////////////// -// Class CSA::THuffAlphabetRank -// FIXME Unused code -CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) { - left = NULL; - right = NULL; - bitrank = NULL; - ch = s[0]; - leaf = false; - this->codetable = codetable; - - bool *B = new bool[n]; - TextPosition sum=0,i; - /* - for (i=0; i< n; i++) { - printf("%c:", (char)((int)s[i]-128)); - for (r=0;r 100) return;*/ - for (i=0;i 0) - sfirst = new uchar[n-sum]; - //if (sum > 0) - ssecond = new uchar[sum]; - unsigned j=0,k=0; - for (i=0;i 0) { - left = new THuffAlphabetRank(sfirst,j,codetable,level+1); - delete [] sfirst; - //} - //if (k>0) { - right = new THuffAlphabetRank(ssecond,k,codetable,level+1); - delete [] ssecond; - //} -} - - -bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) { - // testing that the code works correctly - int C[256]; - unsigned i,j; - bool correct=true; - for (j=0;j<256;j++) - C[j] = 0; - for (i=0;i%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i)); - } - } - return correct; -} - -CSA::THuffAlphabetRank::~THuffAlphabetRank() { - if (left!=NULL) delete left; - if (right!=NULL) delete right; - if (bitrank!=NULL) - delete bitrank; -} - - -//////////////////////////////////////////////////////////////////////////// -// Class CSA +const uchar TCImplementation::versionFlag = 3; /** * Constructor inits an empty dynamic FM-index. * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ -CSA::CSA(unsigned samplerate) - : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0), - suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), +TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_) + : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), + suffixDocId(0), numberOfTexts(numberOfTexts_), numberOfAllTexts(numberOfTexts_), maxTextLength(maxTextLength_), endmarkerDocId(0), emptyTextRank(0) { - this->samplerate = samplerate; -#ifdef CSA_TEST_BWT - // FIXME TODO : DynFMI needs distribution of characters before hand - // This will create fully balanced wavelet tree for all chars [0, 255]. - uchar temp[256]; - for (unsigned i = 0; i < 255; ++i) - temp[i] = i+1; - temp[255] = 0; - dynFMI = new DynFMI(temp, 1, 255, false); - - /* Debug code: take char distribution from data.txt. - uchar *temp = Tools::GetFileContents("data.txt", 0); - dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false); - delete [] temp;*/ -#endif -} - -/** - * Insert new text - * - * Given text must include end-marker. - * Text identifiers are numbered starting from 0. - */ -void CSA::InsertText(uchar const * text) -{ - // Sanity check: -#ifdef CSA_TEST_BWT - assert(dynFMI != 0); -#endif - - TextPosition m = std::strlen((char *)text) + 1; - if (m > maxTextLength) - maxTextLength = m; // Store length of the longest text seen so far. - - if (m > 1) - { - this->n += m; - this->numberOfTexts ++; - this->numberOfAllTexts ++; -#ifdef CSA_TEST_BWT - dynFMI->addText(text, m); -#endif - - texts.append((char *) text); - texts.append(1, '\0'); - } - else - { - emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here - this->numberOfAllTexts ++; - } -} - -void CSA::MakeStatic() -{ - // Sanity check: -#ifdef CSA_TEST_BWT - if (dynFMI == 0) - throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); -#endif - - if (texts.size() == 0) // Empty collection - { - ++n; - ++numberOfTexts; - texts.append(1, '\0'); - } - // Bitvector of empty texts, FIXME Remove? { //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) - Tools::SetField(tempB, 1, (*it), 1); - emptyTextId.clear(); - emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); - } - - texts.reserve(0); // Release extra capacity - -/* FILE *fp = fopen("texts.txt", "wb"); - std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl; - fclose(fp);*/ - - uchar *bwt = new uchar[n]; - { - // More succinct solution via StringIterator, see below. -/* unsigned* maptexts = new unsigned[n+1]; - // Map text chars to [0..255+numberOfTexts] - unsigned count = 0; - for (ulong i = 0; i < n; ++i) - if (texts[i] == 0) - maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts] - else { - uchar c = (uchar)texts[i]; - maptexts[i] = (unsigned)c + numberOfTexts; - } - maptexts[n] = '\0'; - assert(count == numberOfTexts); - - std::cout << "maptext: "; - for (ulong i = 0; i <= n; ++i) - std::cout << maptexts[i] << ", "; - std::cout << std::endl;*/ - - // Mark endmarker positions into bitvector - ulong * endmarkers = new ulong[n/W+1]; - for (ulong i = 0; i < n/W+1; ++i) - endmarkers[i] = 0; - for (ulong i = 0; i < n; ++i) - if (texts[i] == 0) - Tools::SetField(endmarkers, 1, i, 1); - BitRank* br = new BitRank(endmarkers, n, true); - assert(numberOfTexts == br->rank(n-1)); - - // Build iterators, FIXME clean up iterator construction. - StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br); - StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br); - - bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n], - &bwt[0], numberOfTexts); - - bwt[--bwtEndPos] = '\0'; - texts.erase(); - texts.reserve(0); // Release capacity - delete br; // bitvector of endmarkers - } // End of bw transform -// std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl; - -/* fp = fopen("bwt.txt", "wb"); - std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl; - fclose(fp);*/ - - -#ifdef CSA_TEST_BWT - { - uchar *bwtTest = dynFMI->getBWT(); - printf("123456789012345678901234567890123456789\n"); - for (TextPosition i = 0; i < n && i < 100; i ++) - if (bwt[i] < 50) - printf("%d", (int)bwt[i]); - else - printf("%c", bwt[i]); - printf("\n"); - for (TextPosition i = 0; i < n && i < 100; i ++) - if (bwtTest[i] < 50) - printf("%d", (int)bwtTest[i]); - else - printf("%c", bwtTest[i]); - printf("\n"); - - // Sanity check - assert(numberOfTexts == dynFMI->getCollectionSize()); - - delete dynFMI; - dynFMI = 0; - for (ulong i = 0; i < n; ++i) - if (bwt[i] != bwtTest[i]) - { - std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; - assert(0); - } - delete [] bwtTest; - } -#endif // CSA_TEST_BWT - +// for (std::set::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) +// Tools::SetField(tempB, 1, (*it), 1); + emptyTextRank = new BSGAP(tempB, numberOfTexts, true); + } makewavelet(bwt); // Deletes bwt! bwt = 0; @@ -319,7 +67,7 @@ void CSA::MakeStatic() maketables(); } -bool CSA::EmptyText(DocId k) const +bool TCImplementation::EmptyText(DocId k) const { assert(k < (DocId)numberOfTexts); if (emptyTextRank->IsBitSet(k)) @@ -327,7 +75,7 @@ bool CSA::EmptyText(DocId k) const return false; } -uchar* CSA::GetText(DocId k) const +uchar* TCImplementation::GetText(DocId k) const { assert(k < (DocId)numberOfTexts); @@ -366,7 +114,7 @@ uchar* CSA::GetText(DocId k) const } /* * Not supported -uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const +uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const { assert(k < (DocId)numberOfTexts); assert(j < (*textLength)[k]); @@ -392,7 +140,7 @@ uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const /****************************************************************** * Existential queries */ -bool CSA::IsPrefix(uchar const * pattern) const +bool TCImplementation::IsPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -407,7 +155,7 @@ bool CSA::IsPrefix(uchar const * pattern) const return false; } -bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const +bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -423,7 +171,7 @@ bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const } -bool CSA::IsSuffix(uchar const *pattern) const +bool TCImplementation::IsSuffix(uchar const *pattern) const { // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern) > 0) @@ -431,7 +179,7 @@ bool CSA::IsSuffix(uchar const *pattern) const return false; } -bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const +bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const { // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern, begin, end) > 0) @@ -439,7 +187,7 @@ bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const return false; } -bool CSA::IsEqual(uchar const *pattern) const +bool TCImplementation::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); if (m == 0) @@ -459,7 +207,7 @@ bool CSA::IsEqual(uchar const *pattern) const return false; } -bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const +bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const { TextPosition m = std::strlen((char *)pattern); if (m == 0) @@ -479,7 +227,7 @@ bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const return false; } -bool CSA::IsContains(uchar const * pattern) const +bool TCImplementation::IsContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -494,7 +242,7 @@ bool CSA::IsContains(uchar const * pattern) const return false; } -bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const +bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const { // Here counting is as fast as existential querying if (CountContains(pattern, begin, end) > 0) @@ -502,14 +250,14 @@ bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const return false; } -bool CSA::IsLessThan(uchar const * pattern) const +bool TCImplementation::IsLessThan(uchar const * pattern) const { if (CountLessThan(pattern) > 0) return true; return false; } -bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const +bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const { if (CountLessThan(pattern, begin, end) > 0) return true; @@ -519,7 +267,7 @@ bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const /****************************************************************** * Counting queries */ -ulong CSA::Count(uchar const * pattern) const +ulong TCImplementation::Count(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -530,7 +278,7 @@ ulong CSA::Count(uchar const * pattern) const return count; } -unsigned CSA::CountPrefix(uchar const * pattern) const +unsigned TCImplementation::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -543,7 +291,7 @@ unsigned CSA::CountPrefix(uchar const * pattern) const return CountEndmarkers(sp, ep); } -unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const +unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -556,7 +304,7 @@ unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const return CountEndmarkers(sp, ep, begin, end); } -unsigned CSA::CountSuffix(uchar const * pattern) const +unsigned TCImplementation::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -569,7 +317,7 @@ unsigned CSA::CountSuffix(uchar const * pattern) const return count; } -unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const +unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -582,7 +330,7 @@ unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const return count; } -unsigned CSA::CountEqual(uchar const *pattern) const +unsigned TCImplementation::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -596,7 +344,7 @@ unsigned CSA::CountEqual(uchar const *pattern) const return CountEndmarkers(sp, ep); } -unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const +unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -610,7 +358,7 @@ unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const return CountEndmarkers(sp, ep); // Already within [begin, end] } -unsigned CSA::CountContains(uchar const * pattern) const +unsigned TCImplementation::CountContains(uchar const * pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -622,7 +370,7 @@ unsigned CSA::CountContains(uchar const * pattern) const return result.size(); } -unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const +unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -635,7 +383,7 @@ unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const } // Less than or equal -unsigned CSA::CountLessThan(uchar const * pattern) const +unsigned TCImplementation::CountLessThan(uchar const * pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -648,7 +396,7 @@ unsigned CSA::CountLessThan(uchar const * pattern) const return CountEndmarkers(sp, ep); } -unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const +unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -664,7 +412,7 @@ unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const /** * Document reporting queries */ -TextCollection::document_result CSA::Prefix(uchar const * pattern) const +TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -701,7 +449,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const return result; } -TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const +TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -739,7 +487,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, return result; } -TextCollection::document_result CSA::Suffix(uchar const * pattern) const +TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -789,7 +537,7 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const return result; } -TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const +TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -840,7 +588,7 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, } -TextCollection::document_result CSA::Equal(uchar const *pattern) const +TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -877,7 +625,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const return result; } -TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const +TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const { TextPosition m = strlen((char const *)pattern); if (m == 0) @@ -915,7 +663,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, Do } -TextCollection::document_result CSA::Contains(uchar const * pattern) const +TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -964,7 +712,7 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const return result; } -TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const +TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -1015,7 +763,7 @@ TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin return result; } -TextCollection::document_result CSA::LessThan(uchar const * pattern) const +TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -1052,7 +800,7 @@ TextCollection::document_result CSA::LessThan(uchar const * pattern) const return result; } -TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const +TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -1093,7 +841,7 @@ TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin /** * Full result set queries */ -TextCollection::full_result CSA::FullContains(uchar const * pattern) const +TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -1145,7 +893,7 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const return result; } -TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const +TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const { TextPosition m = strlen((char *)pattern); if (m == 0) @@ -1221,23 +969,23 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin BlockArray *endmarkerDocId; BSGAP *emptyTextRank; */ -void CSA::Save(FILE *file) const +void TCImplementation::Save(FILE *file) const { // Saving version info: if (std::fwrite(&versionFlag, 1, 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (version flag)."); + throw std::runtime_error("TCImplementation::Save(): file write error (version flag)."); if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (n)."); + throw std::runtime_error("TCImplementation::Save(): file write error (n)."); if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (samplerate)."); + throw std::runtime_error("TCImplementation::Save(): file write error (samplerate)."); for(ulong i = 0; i < 256; ++i) if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (C table)."); + throw std::runtime_error("TCImplementation::Save(): file write error (C table)."); if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); + throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position)."); alphabetrank->save(file); sampled->Save(file); @@ -1245,11 +993,11 @@ void CSA::Save(FILE *file) const suffixDocId->Save(file); if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (numberOfTexts)."); + throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts)."); if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts)."); + throw std::runtime_error("TCImplementation::Save(): file write error (numberOfAllTexts)."); if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); + throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength)."); endmarkerDocId->Save(file); emptyTextRank->Save(file); @@ -1261,49 +1009,33 @@ void CSA::Save(FILE *file) const * Load index from a file handle * * Throws a std::runtime_error exception on i/o error. - * For more info, see CSA::Save(). + * For more info, see TCImplementation::Save(). */ -void CSA::Load(FILE *file, unsigned samplerate) +TCImplementation::TCImplementation(FILE *file, unsigned samplerate_) + : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0), + suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + emptyTextRank(0) { - // Delete everything -#ifdef CSA_TEST_BWT - delete dynFMI; dynFMI = 0; -#endif - delete alphabetrank; alphabetrank = 0; - delete sampled; sampled = 0; - delete suffixes; suffixes = 0; - delete suffixDocId; suffixDocId = 0; - delete [] codetable; codetable = 0; - - delete endmarkerDocId; endmarkerDocId = 0; - delete emptyTextRank; emptyTextRank = 0; - - this->maxTextLength = 0; - this->numberOfTexts = 0; - this->numberOfAllTexts = 0; - this->samplerate = samplerate; - this->n = 0; - uchar verFlag = 0; if (std::fread(&verFlag, 1, 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (version flag)."); - if (verFlag != CSA::versionFlag) - throw std::runtime_error("CSA::Load(): invalid save file version."); + throw std::runtime_error("TCImplementation::Load(): file read error (version flag)."); + if (verFlag != TCImplementation::versionFlag) + throw std::runtime_error("TCImplementation::Load(): invalid save file version."); if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (n)."); + throw std::runtime_error("TCImplementation::Load(): file read error (n)."); if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (samplerate)."); + throw std::runtime_error("TCImplementation::Load(): file read error (samplerate)."); // FIXME samplerate can not be changed during load. // if (this->samplerate == 0) // this->samplerate = samplerate; for(ulong i = 0; i < 256; ++i) if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (C table)."); + throw std::runtime_error("TCImplementation::Load(): file read error (C table)."); if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); + throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position)."); alphabetrank = static_sequence::load(file); sampled = new BSGAP(file); @@ -1311,11 +1043,11 @@ void CSA::Load(FILE *file, unsigned samplerate) suffixDocId = new BlockArray(file); if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (numberOfTexts)."); + throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts)."); if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts)."); + throw std::runtime_error("TCImplementation::Load(): file read error (numberOfAllTexts)."); if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); + throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength)."); endmarkerDocId = new BlockArray(file); emptyTextRank = new BSGAP(file); @@ -1332,7 +1064,7 @@ void CSA::Load(FILE *file, unsigned samplerate) // FIXME Use 2D-search structure -unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const +unsigned TCImplementation::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const { if (sp > ep) return 0; @@ -1363,7 +1095,7 @@ unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, Doc return count; } -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; @@ -1385,7 +1117,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; } -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const +ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const { // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; @@ -1409,7 +1141,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, } -ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) uint c = (int)pattern[m-1]; @@ -1435,7 +1167,7 @@ ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *s } -CSA::~CSA() { +TCImplementation::~TCImplementation() { #ifdef CSA_TEST_BWT delete dynFMI; #endif @@ -1443,13 +1175,12 @@ CSA::~CSA() { delete sampled; delete suffixes; delete suffixDocId; - delete [] codetable; // FIXME remove delete endmarkerDocId; delete emptyTextRank; } -void CSA::makewavelet(uchar *bwt) +void TCImplementation::makewavelet(uchar *bwt) { ulong i, min = 0, max; @@ -1490,31 +1221,21 @@ void CSA::makewavelet(uchar *bwt) // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; } -void CSA::maketables() +void TCImplementation::maketables() { // Calculate BWT end-marker position (of last inserted text) - // Note: bwtEndPos already set! - // and the length of the first text (counter l): -#ifdef CSA_TEST_BWT { ulong i = 0; - ulong l = 1; uint alphabetrank_i_tmp = 0; uchar c = alphabetrank->access(i, alphabetrank_i_tmp); while (c != '\0') { i = C[c]+alphabetrank_i_tmp-1; - ++l; c = alphabetrank->access(i, alphabetrank_i_tmp); } - if (i != bwtEndPos) // compare result - { - std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl; - exit(0); - } + this->bwtEndPos = i; } -#endif // Build up arrays for text length and starting positions // FIXME Temp, remove @@ -1616,7 +1337,7 @@ void CSA::maketables() * Starting text position of the document is stored into second parameter. * Binary searching on text starting positions. */ -TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const +TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const { assert(i < n); @@ -1639,71 +1360,6 @@ TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition return a; } -CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n) -{ - TCodeEntry *result = new TCodeEntry[ 256 ]; - - count_chars( text, n, result ); - std::priority_queue< node, std::vector< node >, std::greater > q; -// -// First I push all the leaf nodes into the queue -// - for ( unsigned int i = 0 ; i < 256 ; i++ ) - if ( result[ i ].count ) - q.push(node( i, result[ i ].count ) ); -// -// This loop removes the two smallest nodes from the -// queue. It creates a new internal node that has -// those two nodes as children. The new internal node -// is then inserted into the priority queue. When there -// is only one node in the priority queue, the tree -// is complete. -// - - while ( q.size() > 1 ) { - node *child0 = new node( q.top() ); - q.pop(); - node *child1 = new node( q.top() ); - q.pop(); - q.push( node( child0, child1 ) ); - } -// -// Now I compute and return the codetable -// - q.top().maketable(0u,0u, result); - q.pop(); - return result; -} - - -void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const -{ - if ( child0 ) - { - child0->maketable( SetBit(code,bits,0), bits+1, codetable ); - child1->maketable( SetBit(code,bits,1), bits+1, codetable ); - delete child0; - delete child1; - } - else - { - codetable[value].code = code; - codetable[value].bits = bits; - } -} - -void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts ) -{ - ulong i; - for (i = 0 ; i < 256 ; i++ ) - counts[ i ].count = 0; - for (i=0; i -#include +#include "BSGAP.h" // Include from XMLTree/libcds #include // Defines W == 32 @@ -54,19 +50,11 @@ namespace SXSI * Implementation of the TextCollection interface * */ -class CSA : public SXSI::TextCollection { +class TCImplementation : public SXSI::TextCollection { public: - /** - * Constructor with (default) samplerate - */ - explicit CSA(unsigned); - ~CSA(); - /** - * Following functions implement the interface described in TextCollection.h. - * For details/documentation, see TextCollection.h. - */ - void InsertText(uchar const *); - void MakeStatic(); + TCImplementation(uchar *, ulong, unsigned, unsigned, ulong); + ~TCImplementation(); + bool EmptyText(DocId) const; uchar* GetText(DocId) const; /** @@ -119,181 +107,19 @@ public: full_result FullContains(uchar const *, DocId, DocId) const; // Index from/to disk - void Load(FILE *, unsigned); + TCImplementation(FILE *, unsigned); void Save(FILE *) const; - - // FIXME Remove - void deleteWT() - { - delete alphabetrank; - alphabetrank = 0; - delete [] codetable; - codetable = 0; - } - void deleteSamples() - { - delete sampled; - sampled =0; - delete suffixes; - suffixes = 0; - delete suffixDocId; - suffixDocId = 0; - } - void deleteEndmarker() - { - delete endmarkerDocId; - endmarkerDocId = 0; - } - private: - // FIXME Unused code - class TCodeEntry { - public: - unsigned count; - unsigned bits; - unsigned code; - TCodeEntry() {count=0;bits=0;code=0u;}; - }; - - - // FIXME Unused code - class THuffAlphabetRank { - // using fixed 0...255 alphabet - private: - BitRank *bitrank; - THuffAlphabetRank *left; - THuffAlphabetRank *right; - TCodeEntry *codetable; - uchar ch; - bool leaf; - public: - THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); - THuffAlphabetRank(FILE *); - ~THuffAlphabetRank(); - - void Save(FILE *); - bool Test(uchar *, TextPosition); - - inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i - THuffAlphabetRank const * temp=this; - if (codetable[c].count == 0) return 0; - unsigned level = 0; - unsigned code = codetable[c].code; - while (!temp->leaf) { - if ((code & (1u<bitrank->rank(i); - temp = temp->left; - } - else { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - ++level; - } - return i+1; - }; - inline bool IsCharAtPos(int c, TextPosition i) const { - THuffAlphabetRank const * temp=this; - if (codetable[c].count == 0) return false; - unsigned level = 0; - unsigned code = codetable[c].code; - while (!temp->leaf) { - if ((code & (1u<bitrank->IsBitSet(i)) return false; - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - else { - if (!temp->bitrank->IsBitSet(i)) return false; - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - ++level; - } - return true; - } - inline uchar access(TextPosition i) const { - THuffAlphabetRank const * temp=this; - while (!temp->leaf) { - if (temp->bitrank->IsBitSet(i)) { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - else { - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - } - return temp->ch; - } - - inline uchar charAtPos(ulong i, TextPosition *rank) const { - THuffAlphabetRank const * temp=this; - while (!temp->leaf) { - if (temp->bitrank->IsBitSet(i)) { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } else { - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - } - (*rank)=i; - return temp->ch; - } - }; - - // FIXME Unused code - class node { - private: - unsigned weight; - uchar value; - node *child0; - node *child1; - - void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const; - static void count_chars(uchar *, TextPosition , TCodeEntry *); - static unsigned SetBit(unsigned , unsigned , unsigned ); - public: - node( unsigned char c = 0, unsigned i = 0 ) { - value = c; - weight = i; - child0 = 0; - child1 = 0; - } - - node( node* c0, node *c1 ) { - value = 0; - weight = c0->weight + c1->weight; - child0 = c0; - child1 = c1; - } - - - bool operator>( const node &a ) const { - return weight > a.weight; - } - - static TCodeEntry *makecodetable(uchar *, TextPosition); - }; - static const uchar versionFlag; TextPosition n; unsigned samplerate; unsigned C[256]; TextPosition bwtEndPos; -// THuffAlphabetRank *alphabetrank; - // RLWaveletTree *alphabetrank; static_sequence * alphabetrank; -#ifdef CSA_TEST_BWT - DynFMI *dynFMI; -#endif - TCodeEntry *codetable; - // Sample structures for texts longer than samplerate - BSGAP *sampled; + BSGAP *sampled; // FIXME Replace with RRR02 BlockArray *suffixes; BlockArray *suffixDocId; @@ -310,11 +136,7 @@ private: // FIXME Replace with a more succinct data structure // Note: Empty texts are already handled inside XMLTree class. - std::set emptyTextId; - BSGAP *emptyTextRank; - - // FIXME A better solution? - std::string texts; + BSGAP *emptyTextRank; // FIXME Remove // Following are not part of the public API uchar * BWT(uchar *); @@ -324,11 +146,6 @@ private: ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; -// TextPosition Inverse(TextPosition) const; -// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; -// TextPosition Psi(TextPosition) const; -// uchar * Substring(TextPosition, TextPosition) const; -// TextPosition Lookup(TextPosition) const; /** * Count end-markers in given interval @@ -345,7 +162,7 @@ private: return alphabetrank->rank(0, ep) - ranksp; } unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const; -}; // class CSA +}; // class TCImplementation } // namespace SXSI diff --git a/TextCollection.cpp b/TextCollection.cpp index 0300899..9c94724 100644 --- a/TextCollection.cpp +++ b/TextCollection.cpp @@ -1,16 +1,16 @@ #include "TextCollection.h" -#include "CSA.h" +#include "TCImplementation.h" namespace SXSI { /** - * Init text collection + * Init text collection from a file * - * See CSA.h for more details. + * See TCImplementation.h for more details. */ - TextCollection * TextCollection::InitTextCollection(unsigned samplerate) + TextCollection * TextCollection::Load(FILE *fp, unsigned samplerate) { - TextCollection *result = new CSA(samplerate); + TextCollection *result = new TCImplementation(fp, samplerate); return result; } } diff --git a/TextCollection.h b/TextCollection.h index f41084d..dad5c88 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -30,6 +30,7 @@ namespace SXSI { + /** * General interface for a text collection * @@ -44,12 +45,6 @@ namespace SXSI // Type for text position (FIXME ulong or long?) typedef ulong TextPosition; - /** - * Init an instance of a text collection object - * - * Returns a pointer to an object implementing this interface. - */ - static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE); /** * Load from a file * @@ -59,38 +54,23 @@ namespace SXSI * Throws an exception if std::fread() fails. * */ - virtual void Load(FILE *, unsigned samplerate = 0) = 0; + static TextCollection* Load(FILE *, unsigned samplerate = 0); + /** * Save data structure into a file * * Throws an exception if std::fwrite() fails. */ virtual void Save(FILE *) const = 0; + /** * Virtual destructor */ virtual ~TextCollection() { }; - /** - * Insert text - * - * Must be a zero-terminated string from alphabet [1,255]. - * Can not be called after makeStatic(). - * The i'th text insertion gets an identifier value i-1. - * In other words, document identifiers start from 0. - */ - virtual void InsertText(uchar const *) = 0; - /** - * Make static - * - * Convert to a static collection; reduces space and time complexities. - * New texts can not be inserted after this operation. - */ - virtual void MakeStatic() = 0; - + /** - tests if the string pointed to by DocId is empty - */ - + * Tests if the string pointed to by DocId is empty + */ virtual bool EmptyText(DocId) const = 0; /** @@ -198,7 +178,7 @@ namespace SXSI virtual full_result FullContains(uchar const *, DocId, DocId) const = 0; protected: - // Protected constructor; call the static function InitTextCollection(). + // Protected constructor; use TextCollectionBuilder TextCollection() { }; // No copy constructor or assignment diff --git a/TextCollectionBuilder.cpp b/TextCollectionBuilder.cpp new file mode 100644 index 0000000..67657bf --- /dev/null +++ b/TextCollectionBuilder.cpp @@ -0,0 +1,153 @@ +#include "incbwt/rlcsa_builder.h" +#include "TextCollectionBuilder.h" + +// Un-comment next line to run a comparison of resulting BWT +//#define TCB_TEST_BWT + +#ifdef TCB_TEST_BWT +#include "dynFMI.h" +#endif + +#include "TCImplementation.h" + +namespace SXSI +{ + +struct TCBuilderRep +{ + unsigned samplerate; + CSA::RLCSABuilder * sa; + + ulong n; + // Total number of texts in the collection + unsigned numberOfTexts; + // Length of the longest text + ulong maxTextLength; + +#ifdef TCB_TEST_BWT + DynFMI *dynFMI; +#endif +}; + +/** + * Init text collection + * + * See CSA.h for more details. + */ +TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate) + : p_(new struct TCBuilderRep()) +{ + p_->n = 0; + p_->samplerate = samplerate; + p_->numberOfTexts = 0; + + // Current params: 8 bytes, 15 MB, no samples + p_->sa = new CSA::RLCSABuilder(8, 0, 15 * 1024 * 1024); + assert(p_->sa->isOk()); + +#ifdef TCB_TEST_BWT + uchar temp[256]; + for (unsigned i = 0; i < 255; ++i) + temp[i] = i+1; + temp[255] = 0; + p_->dynFMI = new DynFMI(temp, 1, 255, false); +#endif +} + +TextCollectionBuilder::~TextCollectionBuilder() +{ +#ifdef TCB_TEST_BWT + delete p_->dynFMI; +#endif + + delete p_->sa; + delete p_; +} + +void TextCollectionBuilder::InsertText(uchar const * text) +{ + TextCollection::TextPosition m = std::strlen((char *)text) + 1; + if (m > p_->maxTextLength) + p_->maxTextLength = m; // Store length of the longest text seen so far. + + if (m > 1) + { + p_->n += m; + p_->numberOfTexts ++; + +#ifdef TCB_TEST_BWT + p_->dynFMI->addText(text, m); +#endif + + p_->sa->insertSequence((char*)text, m-1, 0); + assert(p_->sa->isOk()); + } + else + { + // FIXME indexing empty texts + std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl; + exit(1); + } +} + + +TextCollection * TextCollectionBuilder::InitTextCollection() +{ + uchar * bwt = 0; + CSA::usint length = 0; + if (p_->numberOfTexts == 0) + { + p_->numberOfTexts ++; // Add one empty text + bwt = new uchar[2]; + bwt[0] = '\0'; + bwt[1] = '\0'; + length = 1; + p_->maxTextLength = 1; + } + else + { + bwt = (uchar *)p_->sa->getBWT(length); + delete p_->sa; + p_->sa = 0; + + assert(length == p_->n); + +#ifdef TCB_TEST_BWT + { + uchar *bwtTest = p_->dynFMI->getBWT(); + printf("123456789012345678901234567890123456789\n"); + for (ulong i = 0; i < p_->n && i < 100; i ++) + if (bwt[i] < 50) + printf("%d", (int)bwt[i]); + else + printf("%c", bwt[i]); + printf("\n"); + for (ulong i = 0; i < p_->n && i < 100; i ++) + if (bwtTest[i] < 50) + printf("%d", (int)bwtTest[i]); + else + printf("%c", bwtTest[i]); + printf("\n"); + + // Sanity check + assert(p_->numberOfTexts == p_->dynFMI->getCollectionSize()); + + delete p_->dynFMI; + p_->dynFMI = 0; + for (ulong i = 0; i < p_->n; ++i) + if (bwt[i] != bwtTest[i]) + { + std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl; + assert(0); + } + delete [] bwtTest; + } +#endif // TCB_TEST_BWT + } + + TextCollection *result = new TCImplementation(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength); + return result; +} + + +} // namespace SXSI diff --git a/TextCollectionBuilder.h b/TextCollectionBuilder.h new file mode 100644 index 0000000..3c72512 --- /dev/null +++ b/TextCollectionBuilder.h @@ -0,0 +1,68 @@ +/****************************************************************************** + * Copyright (C) 2009 by Niko Valimaki * + * Text collection interface for an in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_TextCollectionBuilder_h_ +#define _SXSI_TextCollectionBuilder_h_ + +#include "TextCollection.h" +#include "Tools.h" // Defines ulong and uchar. +#include +#include // Defines std::pair. + +namespace SXSI +{ + struct TCBuilderRep; // Pimpl + + /** + * Build an instance of the TextCollection class. + */ + class TextCollectionBuilder + { + public: + explicit TextCollectionBuilder(unsigned); + ~TextCollectionBuilder(); + + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. + * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + */ + void InsertText(uchar const *); + /** + * Make static + * + * Convert to a static collection; reduces space and time complexities. + * New texts can not be inserted after this operation. + */ + TextCollection * InitTextCollection(); + + private: + struct TCBuilderRep * p_; + + // No copy constructor or assignment + TextCollectionBuilder(); + TextCollectionBuilder(TextCollectionBuilder const&); + TextCollectionBuilder& operator = (TextCollectionBuilder const&); + }; +} +#endif diff --git a/dependencies.mk b/dependencies.mk index 1cb05db..f7747de 100644 --- a/dependencies.mk +++ b/dependencies.mk @@ -1,19 +1,33 @@ BSGAP.o: BSGAP.cpp BSGAP.h Tools.h BlockArray.h BitRank.o: BitRank.cpp BitRank.h BlockArray.h Tools.h -CSA.o: CSA.cpp CSA.h dynFMI.h bittree.h rbtree.h Tools.h BitRank.h \ - BlockArray.h TextCollection.h RLWaveletTree.h GapEncode.h BSGAP.h GapEncode.o: GapEncode.cpp GapEncode.h Tools.h BlockArray.h BSGAP.h HeapProfiler.o: HeapProfiler.cpp HeapProfiler.h RLWaveletTree.o: RLWaveletTree.cpp RLWaveletTree.h GapEncode.h Tools.h \ BlockArray.h BSGAP.h BitRank.h -TextCollection.o: TextCollection.cpp TextCollection.h Tools.h CSA.h \ - dynFMI.h bittree.h rbtree.h BitRank.h BlockArray.h RLWaveletTree.h \ - GapEncode.h BSGAP.h +TCImplementation.o: TCImplementation.cpp TCImplementation.h BitRank.h \ + BlockArray.h Tools.h TextCollection.h BSGAP.h +TextCollection.o: TextCollection.cpp TextCollection.h Tools.h \ + TCImplementation.h BitRank.h BlockArray.h BSGAP.h +TextCollectionBuilder.o: TextCollectionBuilder.cpp \ + TextCollectionBuilder.h TextCollection.h Tools.h incbwt/rlcsa_builder.h \ + incbwt/rlcsa.h incbwt/bits/vectors.h incbwt/bits/deltavector.h \ + incbwt/bits/bitvector.h incbwt/bits/../misc/definitions.h \ + incbwt/bits/bitbuffer.h incbwt/bits/rlevector.h incbwt/sasamples.h \ + incbwt/misc/definitions.h incbwt/bits/bitbuffer.h \ + incbwt/bits/deltavector.h incbwt/misc/parameters.h \ + incbwt/misc/definitions.h TCImplementation.h BitRank.h BlockArray.h \ + BSGAP.h dynFMI.h bittree.h rbtree.h Tools.o: Tools.cpp Tools.h bittree.o: bittree.cpp bittree.h rbtree.h Tools.h dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h Tools.h rbtree.o: rbtree.cpp rbtree.h -testTextCollection.o: testTextCollection.cpp HeapProfiler.h \ - TextCollection.h Tools.h +testTextCollection.o: testTextCollection.cpp TextCollectionBuilder.h \ + TextCollection.h Tools.h HeapProfiler.h +testTextCollection2.o: testTextCollection2.cpp TextCollection.h Tools.h \ + HeapProfiler.h +testTextCollection3.o: testTextCollection3.cpp TextCollection.h Tools.h +testTextCollection4.o: testTextCollection4.cpp TCImplementation.h \ + BitRank.h BlockArray.h Tools.h TextCollection.h BSGAP.h HeapProfiler.h testTextCollection5.o: testTextCollection5.cpp HeapProfiler.h \ TextCollection.h Tools.h +timeTextCollection.o: timeTextCollection.cpp TextCollection.h Tools.h diff --git a/incbwt/Makefile b/incbwt/Makefile new file mode 100644 index 0000000..6b004e6 --- /dev/null +++ b/incbwt/Makefile @@ -0,0 +1,25 @@ +CC = g++ +CPPFLAGS = -Wall -O3 +OBJS = rlcsa.o rlcsa_builder.o sasamples.o bits/bitvector.o bits/deltavector.o bits/rlevector.o bits/vectors.o misc/parameters.o misc/utils.o qsufsort/qsufsort.o + + +all: library + +library: $(OBJS) + ar rc rlcsa.a $(OBJS) + +depend: + g++ -MM *.cpp bits/*.cpp misc/*.cpp qsufsort/*.c > dependencies.mk + +update: + cp ../rlcsa.* ../rlcsa_builder.* ../sasamples.* . + cp ../bits/* bits/ + cp ../misc/* misc/ + cp ../qsufsort/* qsufsort/ + +clean: + rm -f rlcsa.a + rm -f $(OBJS) + + +include dependencies.mk diff --git a/incbwt/bits/bitbuffer.h b/incbwt/bits/bitbuffer.h new file mode 100644 index 0000000..572fa8a --- /dev/null +++ b/incbwt/bits/bitbuffer.h @@ -0,0 +1,486 @@ +#ifndef BITBUFFER_H +#define BITBUFFER_H + +#include +#include +#include +#include + +#include "../misc/definitions.h" + + +namespace CSA +{ + + +template +class GenericBitBuffer +{ + public: + GenericBitBuffer(usint words); + GenericBitBuffer(usint _items, usint item_size); + GenericBitBuffer(std::ifstream& file, usint words); + GenericBitBuffer(std::ifstream& file, usint _items, usint item_size); + GenericBitBuffer(Data* buffer, usint words); + GenericBitBuffer(Data* buffer, usint _items, usint item_size); + ~GenericBitBuffer(); + + void writeBuffer(std::ofstream& file, bool erase = true); + void readBuffer(std::ifstream& file, usint words, bool erase = true); + void setBuffer(Data* buffer, usint words); + + // We assume we are already using a buffer not owned by this. + inline void moveBuffer(Data* buffer) + { + this->data = buffer; + this->pos = 0; + this->bits = DATA_BITS; + this->current = 0; + } + +//-------------------------------------------------------------------------- + + inline void reset(bool erase) + { + this->pos = 0; + this->bits = DATA_BITS; + this->current = 0; + if(erase) + { + memset(this->data, 0, this->size * sizeof(Data)); + } + } + + inline void skipBits(usint count) + { + if(count < this->bits) + { + this->bits -= count; + return; + } + + count -= this->bits; + this->pos += 1 + count / DATA_BITS; + this->bits = DATA_BITS - count % DATA_BITS; + } + + inline void rewind(usint count) + { + this->bits += count; + if(this->bits > DATA_BITS) + { + usint back = (this->bits - 1) / DATA_BITS; + this->pos -= back; + this->bits -= back * DATA_BITS; + } + } + +//-------------------------------------------------------------------------- + + inline usint bitsLeft() + { + return this->bits + DATA_BITS * (this->size - this->pos - 1); + } + + inline void writeBits(usint value, usint count) + { + while(count >= this->bits) + { + count -= this->bits; + this->data[this->pos] |= GET(LOWER(value, count), this->bits); + this->pos++; this->bits = DATA_BITS; + } + if(count > 0) + { + this->bits -= count; + this->data[this->pos] |= HIGHER(GET(value, count), this->bits); + } + } + + // Returns nonzero if bit is 1 + inline usint readBit() + { + this->bits--; + usint bit = this->data[this->pos] & ((usint)1 << this->bits); + + if(this->bits == 0) { this->pos++; this->bits = DATA_BITS; } + + return bit; + } + + inline usint readBits(usint count) + { + usint value = 0; + + while(count >= this->bits) + { + count -= this->bits; + value |= HIGHER(GET(this->data[this->pos], this->bits), count); + this->pos++; this->bits = DATA_BITS; + } + if(count > 0) + { + this->bits -= count; + value |= GET(LOWER(this->data[this->pos], this->bits), count); + } + + return value; + } + +//-------------------------------------------------------------------------- + + /* + These operations work on fixed-size items. + */ + + inline usint getItemSize() + { + return this->item_bits; + } + +/* + inline void setItemSize(usint new_size) + { + if(new_size == 0) { return; } + this->item_bits = new_size; + } +*/ + + inline void goToItem(usint item) + { + usint b = item * this->item_bits; + this->pos = b / DATA_BITS; + this->bits = DATA_BITS - b % DATA_BITS; + this->current = item; + } + + inline usint readItem() + { + this->current++; + return this->readBits(this->item_bits); + } + + inline usint readItem(usint item) + { + this->goToItem(item); + return this->readItem(); + } + + inline usint readFirstItem() + { + return this->readItem(0); + } + + inline bool hasNextItem() + { + return (this->current < this->items); + } + + inline void writeItem(usint item) + { + this->writeBits(item, this->item_bits); + this->current++; + } + + inline void skipItem() + { + this->skipBits(this->item_bits); + this->current++; + } + +//-------------------------------------------------------------------------- + + /* + Delta coding for positive integers + */ + + inline bool canDeltaCode(usint value) + { + return this->deltaCodeLength(value) <= this->bitsLeft(); + } + + inline usint deltaCodeLength(usint value) + { + usint len = length(value); + usint llen = length(len); + return (len + llen + llen - 2); + } + + // This version returns false if there is no space left for the encoding. + inline bool writeDeltaCode(usint value) + { + usint len = length(value); + usint llen = length(len); + + if(len + llen + llen - 2 > this->bitsLeft()) { return false; } + + // this->writeBits(0, llen - 1); // Now included in the next writeBits() + this->writeBits(len, llen + llen - 1); + this->writeBits(value, len - 1); + return true; + } + + // This version assumes the code fits into the buffer. + inline void writeDeltaCodeDirect(usint value) + { + usint len = length(value); + usint llen = length(len); + + // this->writeBits(0, llen - 1); // Now included in the next writeBits() + this->writeBits(len, llen + llen - 1); + this->writeBits(value, len - 1); + } + + // We assume the code fits into usint: + // 32-bit: value < 2^24 + // 64-bit: value < 2^54 + inline void writeDeltaCodeFast(usint value) + { + usint len = length(value); + + value ^= ((usint)1 << (len - 1)); + this->writeBits((len << (len - 1)) | value, len + 2 * length(len) - 2); + } + + inline usint readDeltaCode() + { + usint len = 0; + while(this->readBit() == 0) { len++; } + + usint temp = (((usint)1 << len) | this->readBits(len)) - 1; + temp = ((usint)1 << temp) | this->readBits(temp); + return temp; + } + +//-------------------------------------------------------------------------- + + /* + Gamma coding for positive integers + */ + + inline bool canGammaCode(usint value) + { + return this->gammaCodeLength(value) <= this->bitsLeft(); + } + + inline usint gammaCodeLength(usint value) + { + return 2 * length(value) - 1; + } + + // This version returns false if there is no space left for the encoding. + inline bool writeGammaCode(usint value) + { + usint len = length(value); + + if(len > this->bitsLeft()) { return false; } + + this->writeBits(0, len - 1); + this->writeBits(value, len); + return true; + } + + // This version assumes the code fits into the buffer. + inline void writeGammaCodeDirect(usint value) + { + usint len = length(value); + + this->writeBits(0, len - 1); + this->writeBits(value, len); + } + + // We assume the code fits into usint: + // 32-bit: value < 2^16 + // 64-bit: value < 2^32 + inline void writeGammaCodeFast(usint value) + { + this->writeBits(value, this->gammaCodeLength(value)); + } + + inline usint readGammaCode() + { + usint len = 1; + while(this->readBit() == 0) { len++; } + return ((usint)1 << len) | this->readBits(len); + } + +//-------------------------------------------------------------------------- + + usint reportSize(); + +//-------------------------------------------------------------------------- + + private: + Data* data; + usint size, pos, bits; + usint item_bits, items, current; + bool free_buffer; + + const static usint DATA_BITS = sizeof(Data) * CHAR_BIT; + + inline static usint bitsToData(usint _bits) { return (_bits + DATA_BITS - 1) / DATA_BITS; } +}; + + +typedef GenericBitBuffer BitBuffer; +typedef GenericBitBuffer FastBitBuffer; + + +//-------------------------------------------------------------------------- + + +template +GenericBitBuffer::GenericBitBuffer(usint words) : + size(words), + pos(0), + bits(DATA_BITS), + item_bits(1), + items(0), + current(0), + free_buffer(true) +{ + this->data = new Data[words]; + memset(this->data, 0, words * sizeof(Data)); +} + +template +GenericBitBuffer::GenericBitBuffer(usint _items, usint item_size) : + pos(0), + bits(DATA_BITS), + item_bits(item_size), + items(_items), + current(0), + free_buffer(true) +{ + this->size = bitsToData(items * item_size); + this->data = new Data[this->size]; + memset(this->data, 0, this->size * sizeof(Data)); +} + +template +GenericBitBuffer::GenericBitBuffer(std::ifstream& file, usint words) : + size(words), + pos(0), + bits(DATA_BITS), + item_bits(1), + items(0), + current(0), + free_buffer(true) +{ + this->data = new Data[words]; + memset(this->data, 0, words * sizeof(Data)); + file.read((char*)this->data, words * sizeof(Data)); +} + +template +GenericBitBuffer::GenericBitBuffer(std::ifstream& file, usint _items, usint item_size) : + size(BITS_TO_WORDS(_items * item_size)), + pos(0), + bits(DATA_BITS), + item_bits(item_size), + items(_items), + current(0), + free_buffer(true) +{ + this->data = new Data[this->size]; + memset(this->data, 0, this->size * sizeof(Data)); + file.read((char*)this->data, this->size * sizeof(Data)); +} + +template +GenericBitBuffer::GenericBitBuffer(Data* buffer, usint words) : + size(words), + pos(0), + bits(DATA_BITS), + item_bits(1), + items(0), + current(0), + free_buffer(false) +{ + this->data = buffer; +} + +template +GenericBitBuffer::GenericBitBuffer(Data* buffer, usint _items, usint item_size) : + size(BITS_TO_WORDS(_items * item_size)), + pos(0), + bits(DATA_BITS), + item_bits(item_size), + items(_items), + current(0), + free_buffer(false) +{ + this->data = buffer; +} + +template +GenericBitBuffer::~GenericBitBuffer() +{ + if(this->free_buffer) + { + delete[] this->data; + } +} + +//-------------------------------------------------------------------------- + +template +void +GenericBitBuffer::writeBuffer(std::ofstream& file, bool erase) +{ + file.write((char*)this->data, this->size * sizeof(Data)); + this->reset(erase); +} + +template +void +GenericBitBuffer::readBuffer(std::ifstream& file, usint words, bool erase) +{ + if(words > this->size || !(this->free_buffer)) + { + if(this->free_buffer) + { + delete[] this->data; + } + this->size = words; + this->data = new Data[words]; + this->free_buffer = true; + } + + this->reset(erase); + file.read((char*)this->data, words * sizeof(Data)); +} + +template +void +GenericBitBuffer::setBuffer(Data* buffer, usint words) +{ + if(this->free_buffer) + { + delete[] this->data; + this->free_buffer = false; + } + + this->data = buffer; + this->size = words; + this->reset(false); +} + +//-------------------------------------------------------------------------- + +template +usint +GenericBitBuffer::reportSize() +{ + usint bytes = sizeof(*this); + if(this->free_buffer) { bytes += this->size * sizeof(Data); } + return bytes; +} + +//-------------------------------------------------------------------------- + + +} // namespace CSA + + +#endif // BITBUFFER_H diff --git a/incbwt/bits/bitvector.cpp b/incbwt/bits/bitvector.cpp new file mode 100644 index 0000000..b76fa9d --- /dev/null +++ b/incbwt/bits/bitvector.cpp @@ -0,0 +1,270 @@ +#include + +#include "bitvector.h" + + +namespace CSA +{ + + +BitVector::BitVector(std::ifstream& file) : + rank_index(0), select_index(0) +{ + file.read((char*)&(this->size), sizeof(this->size)); + file.read((char*)&(this->items), sizeof(this->items)); + file.read((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks)); + file.read((char*)&(this->block_size), sizeof(this->block_size)); + + this->array = new usint[this->block_size * this->number_of_blocks]; + file.read((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint)); + this->buffer = new FastBitBuffer(this->array, this->block_size); + + this->integer_bits = length(this->size); + this->samples = new FastBitBuffer(file, 2 * (this->number_of_blocks + 1), this->integer_bits); + + this->indexForRank(); + this->indexForSelect(); +} + +BitVector::BitVector(VectorEncoder& encoder, usint universe_size) : + size(universe_size), items(encoder.items), + block_size(encoder.block_size), + number_of_blocks(encoder.blocks), + rank_index(0), select_index(0) +{ + this->array = new usint[this->block_size * this->number_of_blocks]; + this->buffer = new FastBitBuffer(this->array, this->block_size); + + this->integer_bits = length(this->size); + this->samples = new FastBitBuffer(2 * (this->number_of_blocks + 1), this->integer_bits); + + // Copy & linearize the array. + usint pos = 0; + for(std::list::iterator iter = encoder.array_blocks.begin(); iter != encoder.array_blocks.end(); iter++) + { + memcpy(this->array + pos, *iter, encoder.superblock_bytes); + pos += encoder.block_size * encoder.blocks_in_superblock; + } + memcpy(this->array + pos, encoder.array, encoder.current_blocks * encoder.block_size * sizeof(usint)); + + // Compress the samples. + for(std::list::iterator iter = encoder.sample_blocks.begin(); iter != encoder.sample_blocks.end(); iter++) + { + usint* buf = *iter; + for(usint i = 0; i < 2 * encoder.samples_in_superblock; i++) + { + this->samples->writeItem(buf[i]); + } + } + for(usint i = 0; i < 2 * encoder.current_samples; i++) + { + this->samples->writeItem(encoder.samples[i]); + } + this->samples->writeItem(this->items); + this->samples->writeItem(this->size); + + this->indexForRank(); + this->indexForSelect(); +} + +BitVector::~BitVector() +{ + delete[] this->array; + delete this->buffer; + delete this->samples; + delete this->rank_index; + delete this->select_index; +} + +void +BitVector::writeTo(std::ofstream& file) +{ + file.write((char*)&(this->size), sizeof(this->size)); + file.write((char*)&(this->items), sizeof(this->items)); + file.write((char*)&(this->number_of_blocks), sizeof(this->number_of_blocks)); + file.write((char*)&(this->block_size), sizeof(this->block_size)); + file.write((char*)(this->array), this->block_size * this->number_of_blocks * sizeof(usint)); + this->samples->writeBuffer(file, false); +} + +//-------------------------------------------------------------------------- + +usint +BitVector::reportSize() +{ + usint bytes = this->buffer->reportSize(); + bytes += this->block_size * this->number_of_blocks * sizeof(usint); + bytes += this->samples->reportSize(); + bytes += this->rank_index->reportSize(); + bytes += this->select_index->reportSize(); + return bytes; +} + +//-------------------------------------------------------------------------- + +usint +BitVector::sampleForIndex(usint index) +{ + usint low = this->select_index->readItem(index / this->select_rate); + usint high = this->select_index->readItem(index / this->select_rate + 1); + + this->samples->goToItem(2 * low + 2); + for(; low < high; low++) + { + if(this->samples->readItem() > index) { return low; } + this->samples->skipItem(); + } + + return low; +} + +usint +BitVector::sampleForValue(usint value) +{ + usint low = this->rank_index->readItem(value / this->rank_rate); + usint high = this->rank_index->readItem(value / this->rank_rate + 1); + + this->samples->goToItem(2 * low + 3); + for(; low < high; low++) + { + if(this->samples->readItem() > value) { return low; } + this->samples->skipItem(); + } + + return low; +} + +//-------------------------------------------------------------------------- + +void +BitVector::indexForRank() +{ + delete this->rank_index; + + usint value_samples = (this->number_of_blocks + BitVector::INDEX_RATE - 1) / BitVector::INDEX_RATE; + this->rank_rate = (this->size + value_samples - 1) / value_samples; + value_samples = (this->size + this->rank_rate - 1) / this->rank_rate + 1; + this->rank_index = new FastBitBuffer(value_samples, length(this->number_of_blocks - 1)); + + usint current = 0, pointer = 0; + this->samples->goToItem(2); + while(this->samples->hasNextItem()) + { + this->samples->skipItem(); + usint limit = this->samples->readItem(); + while(current < limit) + { + this->rank_index->writeItem(pointer); + current += this->rank_rate; + } + pointer++; + } + this->rank_index->writeItem(this->number_of_blocks - 1); +} + +void +BitVector::indexForSelect() +{ + delete this->select_index; + + usint index_samples = (this->number_of_blocks + BitVector::INDEX_RATE - 1) / BitVector::INDEX_RATE; + this->select_rate = (this->items + index_samples - 1) / index_samples; + index_samples = (this->items + this->select_rate - 1) / this->select_rate + 1; + this->select_index = new FastBitBuffer(index_samples, length(this->number_of_blocks - 1)); + + usint current = 0, pointer = 0; + this->samples->goToItem(2); + while(this->samples->hasNextItem()) + { + usint limit = this->samples->readItem(); + this->samples->skipItem(); + while(current < limit) + { + this->select_index->writeItem(pointer); + current += this->select_rate; + } + pointer++; + } + this->select_index->writeItem(this->number_of_blocks - 1); +} + +//-------------------------------------------------------------------------- + +VectorEncoder::VectorEncoder(usint block_bytes, usint superblock_size) : + size(0), items(0), blocks(0), + block_size(BYTES_TO_WORDS(block_bytes)), + superblock_bytes(superblock_size) +{ + this->array = new usint[this->superblock_bytes / sizeof(usint)]; + memset(this->array, 0, this->superblock_bytes); + this->blocks_in_superblock = this->superblock_bytes / (sizeof(usint) * this->block_size); + this->current_blocks = 0; + + this->samples = new usint[this->superblock_bytes / sizeof(usint)]; + this->samples_in_superblock = this->superblock_bytes / (2 * sizeof(usint)); + this->current_samples = 0; + + this->buffer = new FastBitBuffer(this->array, this->block_size); +} + +VectorEncoder::~VectorEncoder() +{ + delete[] this->array; + + delete this->buffer; + for(std::list::iterator iter = this->array_blocks.begin(); iter != this->array_blocks.end(); iter++) + { + delete[] *iter; + } + + delete[] this->samples; + for(std::list::iterator iter = this->sample_blocks.begin(); iter != this->sample_blocks.end(); iter++) + { + delete[] *iter; + } +} + +void +VectorEncoder::addNewBlock() +{ + this->blocks++; + this->current_blocks++; + this->current_samples++; + + // Do we need a new superblock for the block? + if(this->current_blocks > this->blocks_in_superblock) + { + this->array_blocks.push_back(this->array); + this->array = new usint[this->superblock_bytes / sizeof(usint)]; + memset(this->array, 0, this->superblock_bytes); + this->current_blocks = 1; + } + this->buffer->moveBuffer(this->array + (this->block_size * (this->current_blocks - 1))); + + // Do we need a new superblock for the sample? + if(this->current_samples > this->samples_in_superblock) + { + this->sample_blocks.push_back(this->samples); + this->samples = new usint[this->superblock_bytes / sizeof(usint)]; + this->current_samples = 1; + } + this->samples[2 * this->current_samples - 2] = this->items - 1; + this->samples[2 * this->current_samples - 1] = this->size - 1; +} + +void +VectorEncoder::setFirstBit(usint value) +{ + this->samples[0] = 0; + this->samples[1] = value; + + this->size = value + 1; + this->items = 1; + this->blocks = 1; + + this->current_blocks = 1; + this->current_samples = 1; +} + + +} // namespace CSA diff --git a/incbwt/bits/bitvector.h b/incbwt/bits/bitvector.h new file mode 100644 index 0000000..fb3db90 --- /dev/null +++ b/incbwt/bits/bitvector.h @@ -0,0 +1,170 @@ +#ifndef BITVECTOR_H +#define BITVECTOR_H + +#include +#include + +#include "../misc/definitions.h" +#include "bitbuffer.h" + + +namespace CSA +{ + + +/* + This class provides the core functionality for encoding a bit vector. +*/ + +class VectorEncoder +{ + public: + const static usint SUPERBLOCK_SIZE = MEGABYTE; + + // We assume superblock size is divisible by block and sample size. + VectorEncoder(usint block_bytes, usint superblock_size = SUPERBLOCK_SIZE); + ~VectorEncoder(); + +/* + This should be implemented in any inherited class. + + void setBit(usint value); // Values must be in increasing order. +*/ + + void addNewBlock(); + void setFirstBit(usint value); + + usint size, items, blocks; + usint block_size, superblock_bytes; + + FastBitBuffer* buffer; + + std::list array_blocks; + usint* array; + usint blocks_in_superblock, current_blocks; + + std::list sample_blocks; + usint* samples; + usint samples_in_superblock, current_samples; +}; + + +/* + This class provides the core functionality for a bit vector. +*/ + +class BitVector +{ + public: + const static usint INDEX_RATE = 5; + + BitVector(std::ifstream& file); + BitVector(VectorEncoder& encoder, usint universe_size); + ~BitVector(); + + void writeTo(std::ofstream& file); + +//-------------------------------------------------------------------------- + +/* + These should be implemented in any actual bit vector. + + // rank is not iterator-safe + // regular: \sum_{i = 0}^{value} V[i] + // at_least: \sum_{i = 0}^{value - 1} V[i] + 1 + usint rank(usint value, bool at_least = false); + + usint select(usint index); // \min value: \sum_{i = 0}^{value} V[i] = index + 1 + usint selectNext(); + + pair_type valueAfter(usint value); // (\min i >= value: V[i] = 1, rank(i) - 1) + pair_type nextValue(); + + // These versions of select return (value, length_of_run). + // max_length is an upper bound for the length of the run returned. + // V[value] is not included in the length of the run + // These functions are not greedy: the actual length of the run can be more than reported. + // This can happen even if max_length was not reached. + // length_of_run is actually the number of extra items returned past value + + pair_type selectRun(usint index, usint max_length); + pair_type selectNextRun(usint max_length); + + // isSet is not iterator-safe + bool isSet(usint value); // V[value] +*/ + + inline bool hasNext() + { + return (this->sample.first + cur < this->items - 1); + } + +//-------------------------------------------------------------------------- + + inline usint getSize() { return this->size; } + inline usint getNumberOfItems() { return this->items; } + inline usint getBlockSize() { return this->block_size; } + + // This returns only the sizes of dynamically allocated structures. + usint reportSize(); + + protected: + usint size, items; + + FastBitBuffer* buffer; + usint* array; + usint block_size; + usint number_of_blocks; + + /* + Each sample is (rank(i) - 1, i) where V[i] = 1. + Number of samples is number_of_blocks + 1. + */ + FastBitBuffer* samples; + usint integer_bits; + + FastBitBuffer* rank_index; + usint rank_rate; + + FastBitBuffer* select_index; + usint select_rate; + + // Iterator data. These are enough for a gap-encoded vector. + usint block; + pair_type sample; + usint cur, val; // cur == 0 is the sample + usint block_items; + + /* + These functions return the sample corresponding to the block the given + index/value might be found in. Parameters are assumed to be valid. + */ + usint sampleForIndex(usint index); + usint sampleForValue(usint value); + + inline void getSample(usint sample_number) + { + this->block = sample_number; + this->samples->goToItem(2 * sample_number); + this->sample.first = this->samples->readItem(); + this->sample.second = this->samples->readItem(); + this->cur = 0; + this->val = this->sample.second; + this->block_items = this->samples->readItem() - this->sample.first - 1; + this->buffer->moveBuffer(this->array + (this->block * this->block_size)); + } + + /* + These functions build a higher level index for faster rank/select queries. + The index consists of about (number of samples) / INDEX_RATE pointers. + The bit vector cannot be used without the index. + */ + void indexForRank(); + void indexForSelect(); +}; + + +} // namespace CSA + + +#endif // BITVECTOR_H diff --git a/incbwt/bits/deltavector.cpp b/incbwt/bits/deltavector.cpp new file mode 100644 index 0000000..4aa02a1 --- /dev/null +++ b/incbwt/bits/deltavector.cpp @@ -0,0 +1,174 @@ +#include + +#include "deltavector.h" + + +namespace CSA +{ + + +DeltaVector::DeltaVector(std::ifstream& file) : + BitVector(file) +{ +} + +DeltaVector::DeltaVector(DeltaEncoder& encoder, usint universe_size) : + BitVector(encoder, universe_size) +{ +} + +DeltaVector::~DeltaVector() +{ +} + +//-------------------------------------------------------------------------- + +usint +DeltaVector::rank(usint value, bool at_least) +{ + if(value >= this->size) { return this->items; } + this->getSample(this->sampleForValue(value)); + + while(this->cur < this->block_items && this->val < value) + { + this->val += this->buffer->readDeltaCode(); + this->cur++; + } + + usint idx = this->sample.first + this->cur + 1; + if(!at_least && this->val > value) { idx--; } + if(at_least && this->val < value) { this->getSample(this->block + 1); } + return idx; +} + +usint +DeltaVector::select(usint index) +{ + if(index >= this->items) { return this->size; } + this->getSample(this->sampleForIndex(index)); + + usint lim = index - this->sample.first; + for(; this->cur < lim; this->cur++) + { + this->val += this->buffer->readDeltaCode(); + } + + return this->val; +} + +usint +DeltaVector::selectNext() +{ + if(this->cur >= this->block_items) + { + this->getSample(this->block + 1); + return this->val; + } + + this->cur++; + this->val += this->buffer->readDeltaCode(); + return this->val; +} + +pair_type +DeltaVector::valueAfter(usint value) +{ + if(value >= this->size) { return pair_type(this->size, this->items); } + this->getSample(this->sampleForValue(value)); + + while(this->cur < this->block_items && this->val < value) + { + this->val += this->buffer->readDeltaCode(); + this->cur++; + } + if(this->val < value) + { + this->getSample(this->block + 1); + } + + return pair_type(this->val, this->sample.first + this->cur); +} + +pair_type +DeltaVector::nextValue() +{ + if(this->cur >= this->block_items) + { + this->getSample(this->block + 1); + return pair_type(this->val, this->sample.first); + } + + this->cur++; + this->val += this->buffer->readDeltaCode(); + return pair_type(this->val, this->sample.first + this->cur); +} + +pair_type +DeltaVector::selectRun(usint index, usint max_length) +{ + return pair_type(this->select(index), 0); +} + +pair_type +DeltaVector::selectNextRun(usint max_length) +{ + return pair_type(this->selectNext(), 0); +} + +bool +DeltaVector::isSet(usint value) +{ + if(value >= this->size) { return false; } + this->getSample(this->sampleForValue(value)); + + while(this->cur < this->block_items && this->val < value) + { + this->val += this->buffer->readDeltaCode(); + this->cur++; + } + + return (this->val == value); +} + +//-------------------------------------------------------------------------- + +usint +DeltaVector::reportSize() +{ + usint bytes = sizeof(*this); + bytes += BitVector::reportSize(); + return bytes; +} + +//-------------------------------------------------------------------------- + +DeltaEncoder::DeltaEncoder(usint block_bytes, usint superblock_size) : + VectorEncoder(block_bytes, superblock_size) +{ +} + +DeltaEncoder::~DeltaEncoder() +{ +} + +void +DeltaEncoder::setBit(usint value) +{ + if(this->items == 0) + { + this->setFirstBit(value); + return; + } + if(value < this->size) { return; } + + usint diff = value + 1 - this->size; + this->size = value + 1; + this->items++; + if(this->buffer->writeDeltaCode(diff)) { return; } + + // Didn't fit into the block. A new sample & block required. + this->addNewBlock(); +} + + +} // namespace CSA diff --git a/incbwt/bits/deltavector.h b/incbwt/bits/deltavector.h new file mode 100644 index 0000000..e0692d6 --- /dev/null +++ b/incbwt/bits/deltavector.h @@ -0,0 +1,62 @@ +#ifndef DELTAVECTOR_H +#define DELTAVECTOR_H + +#include + +#include "bitvector.h" + + +namespace CSA +{ + + +/* + This class is used to construct a DeltaVector. +*/ + +class DeltaEncoder : public VectorEncoder +{ + public: + DeltaEncoder(usint block_bytes, usint superblock_size = VectorEncoder::SUPERBLOCK_SIZE); + ~DeltaEncoder(); + + void setBit(usint value); +}; + + +/* + This is a gap-encoded bit vector using delta coding. +*/ + +class DeltaVector : public BitVector +{ + public: + DeltaVector(std::ifstream& file); + DeltaVector(DeltaEncoder& encoder, usint universe_size); + ~DeltaVector(); + +//-------------------------------------------------------------------------- + + usint rank(usint value, bool at_least = false); + + usint select(usint index); + usint selectNext(); + + pair_type valueAfter(usint value); + pair_type nextValue(); + + pair_type selectRun(usint index, usint max_length); + pair_type selectNextRun(usint max_length); + + bool isSet(usint value); + +//-------------------------------------------------------------------------- + + usint reportSize(); +}; + + +} // namespace CSA + + +#endif // DELTAVECTOR_H diff --git a/incbwt/bits/rlevector.cpp b/incbwt/bits/rlevector.cpp new file mode 100644 index 0000000..e07c2b4 --- /dev/null +++ b/incbwt/bits/rlevector.cpp @@ -0,0 +1,247 @@ +#include + +#include "rlevector.h" +#include "../misc/utils.h" + + +namespace CSA +{ + + +RLEVector::RLEVector(std::ifstream& file) : + BitVector(file) +{ +} + +RLEVector::RLEVector(RLEEncoder& encoder, usint universe_size) : + BitVector(encoder, universe_size) +{ +} + +RLEVector::~RLEVector() +{ +} + +//-------------------------------------------------------------------------- + +usint +RLEVector::rank(usint value, bool at_least) +{ + if(value >= this->size) { return this->items; } + + this->valueLoop(value); + + usint idx = this->sample.first + this->cur + 1; + if(!at_least && this->val > value) + { + idx--; + } + if(at_least && this->val < value) + { + this->getSample(this->block + 1); + this->run = 0; + idx = this->sample.first + this->cur + 1; + } + return idx; +} + +usint +RLEVector::select(usint index) +{ + if(index >= this->items) { return this->size; } + this->getSample(this->sampleForIndex(index)); + this->run = 0; + + usint lim = index - this->sample.first; + while(this->cur < lim) + { + this->val += this->buffer->readDeltaCode(); + usint temp = this->buffer->readDeltaCode(); + this->val += temp - 1; + this->cur += temp; + } + if(this->cur > lim) + { + this->run = this->cur - lim; + this->cur -= this->run; + this->val -= this->run; + } + + return this->val; +} + +usint +RLEVector::selectNext() +{ + if(this->cur >= this->block_items) + { + this->getSample(this->block + 1); + this->run = 0; + return this->val; + } + + this->cur++; + if(this->run > 0) + { + this->val++; + this->run--; + } + else + { + this->val += this->buffer->readDeltaCode(); + this->run = this->buffer->readDeltaCode() - 1; + } + + return this->val; +} + +pair_type +RLEVector::valueAfter(usint value) +{ + if(value >= this->size) { return pair_type(this->size, this->items); } + + this->valueLoop(value); + + if(this->val < value) + { + this->getSample(this->block + 1); + this->run = 0; + } + + return pair_type(this->val, this->sample.first + this->cur); +} + +pair_type +RLEVector::nextValue() +{ + if(this->cur >= this->block_items) + { + this->getSample(this->block + 1); + this->run = 0; + return pair_type(this->val, this->sample.first); + } + + this->cur++; + if(this->run > 0) + { + this->val++; + this->run--; + } + else + { + this->val += this->buffer->readDeltaCode(); + this->run = this->buffer->readDeltaCode() - 1; + } + + return pair_type(this->val, this->sample.first + this->cur); +} + +pair_type +RLEVector::selectRun(usint index, usint max_length) +{ + usint value = this->select(index); + + usint len = std::min(max_length, this->run); + this->run -= len; this->cur += len; this->val += len; + + return pair_type(value, len); +} + +pair_type +RLEVector::selectNextRun(usint max_length) +{ + usint value = this->selectNext(); + + usint len = std::min(max_length, this->run); + this->run -= len; this->cur += len; this->val += len; + + return pair_type(value, len); +} + +bool +RLEVector::isSet(usint value) +{ + if(value >= this->size) { return false; } + + this->valueLoop(value); + + return (this->val == value); +} + +//-------------------------------------------------------------------------- + +usint +RLEVector::reportSize() +{ + usint bytes = sizeof(*this); + bytes += BitVector::reportSize(); + return bytes; +} + +//-------------------------------------------------------------------------- + +RLEEncoder::RLEEncoder(usint block_bytes, usint superblock_size) : + VectorEncoder(block_bytes, superblock_size) +{ +} + +RLEEncoder::~RLEEncoder() +{ +} + +void +RLEEncoder::setRun(usint start, usint len) +{ + if(this->items == 0) + { + this->setFirstBit(start); + if(len > 1) + { + this->RLEncode(1, len - 1); + } + return; + } + if(start < this->size || len == 0) { return; } + + // Write as much into the buffer as possible. + usint diff = start + 1 - this->size; + usint free_bits = this->buffer->bitsLeft(); + usint code_bits = this->buffer->deltaCodeLength(diff); + if(free_bits > code_bits) // At least a part of the run fits into the block. + { + free_bits -= code_bits; + usint run_bits = this->buffer->deltaCodeLength(len); + if(run_bits <= free_bits) + { + this->RLEncode(diff, len); + return; + } + + // Encode as much as possible and let the rest spill. + usint llen = 1; + while(llen + 2 * length(llen + 1) - 1 <= free_bits) { llen++; } + llen = ((usint)1 << llen) - 1; + + this->RLEncode(diff, llen); + len -= llen; + + // A new sample will be added. + this->size++; + this->items++; + } + else + { + this->size = start + 1; + this->items++; + } + + // Didn't fit into the block. A new sample & block required. + this->addNewBlock(); + if(len > 1) + { + this->RLEncode(1, len - 1); + } +} + + +} // namespace CSA diff --git a/incbwt/bits/rlevector.h b/incbwt/bits/rlevector.h new file mode 100644 index 0000000..847ce58 --- /dev/null +++ b/incbwt/bits/rlevector.h @@ -0,0 +1,100 @@ +#ifndef RLEVECTOR_H +#define RLEVECTOR_H + +#include + +#include "bitvector.h" + + +namespace CSA +{ + + +/* + This class is used to construct a RLEVector. +*/ + +class RLEEncoder : public VectorEncoder +{ + public: + RLEEncoder(usint block_bytes, usint superblock_size = VectorEncoder::SUPERBLOCK_SIZE); + ~RLEEncoder(); + +// void setBit(usint value); + void setRun(usint start, usint len); + + inline void RLEncode(usint diff, usint len) + { + this->size += diff + len - 1; + this->items += len; + this->buffer->writeDeltaCode(diff); + this->buffer->writeDeltaCode(len); + } +}; + + +/* + This is a run-length encoded bit vector using delta coding. +*/ + +class RLEVector : public BitVector +{ + public: + RLEVector(std::ifstream& file); + RLEVector(RLEEncoder& encoder, usint universe_size); + ~RLEVector(); + +//-------------------------------------------------------------------------- + + usint rank(usint value, bool at_least = false); + + usint select(usint index); + usint selectNext(); + + pair_type valueAfter(usint value); + pair_type nextValue(); + + pair_type selectRun(usint index, usint max_length); + pair_type selectNextRun(usint max_length); + + bool isSet(usint value); + +//-------------------------------------------------------------------------- + + usint reportSize(); + + protected: + usint run; + + inline void valueLoop(usint value) + { + this->getSample(this->sampleForValue(value)); + this->run = 0; + + if(this->val >= value) { return; } + while(this->cur < this->block_items) + { + this->val += this->buffer->readDeltaCode(); + this->cur++; + this->run = this->buffer->readDeltaCode() - 1; + if(this->val >= value) { break; } + + this->cur += this->run; + this->val += this->run; + if(this->val >= value) + { + this->run = this->val - value; + this->val = value; + this->cur -= this->run; + break; + } + this->run = 0; + } + } +}; + + +} // namespace CSA + + +#endif // RLEVECTOR_H diff --git a/incbwt/bits/vectors.cpp b/incbwt/bits/vectors.cpp new file mode 100644 index 0000000..417688f --- /dev/null +++ b/incbwt/bits/vectors.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include + +#include "vectors.h" +#include "../misc/utils.h" + + +namespace CSA +{ + + +inline void +handleOne(RLEEncoder& encoder, pair_type& run, usint position) +{ + if(run.second == 0) + { + run.first = position; + run.second = 1; + return; + } + if(position == run.first + run.second) + { + run.second++; + return; + } + encoder.setRun(run.first, run.second); + run.first = position; + run.second = 1; +} + +inline void +handleRun(RLEEncoder& encoder, pair_type& run, pair_type& next, usint limit) +{ + if(run.second == 0) + { + run.first = next.first; + run.second = std::min(limit - run.first, next.second); + next.first += run.second; + next.second -= run.second; + return; + } + + if(next.first == run.first + run.second) + { + usint cont = std::min(limit - next.first, next.second); + run.second += cont; + next.first += cont; + next.second -= cont; + return; + } + + encoder.setRun(run.first, run.second); + run.first = next.first; + run.second = std::min(limit - run.first, next.second);; + next.first += run.second; + next.second -= run.second; +} + +RLEVector* +mergeVectors(RLEVector* first, RLEVector* second, usint* positions, usint n, usint size, usint block_size) +{ + if((first == 0 && second == 0) || positions == 0) { return 0; } + + pair_type first_run; + bool first_finished; + if(first == 0) + { + first_run = pair_type(size, 0); + first_finished = true; + } + else + { + first_run = first->selectRun(0, size); + first_run.second++; + first_finished = false; + } + + usint second_bit; + if(second == 0) + { + second_bit = n; + } + else + { + second_bit = second->select(0); + } + + RLEEncoder encoder(block_size); + pair_type run = pair_type(size, 0); + for(usint i = 0; i < n; i++, first_run.first++) + { + while(!first_finished && first_run.first < positions[i]) + { + handleRun(encoder, run, first_run, positions[i]); + if(first_run.second == 0) + { + if(first->hasNext()) + { + first_run = first->selectNextRun(size); + first_run.first += i; + first_run.second++; + } + else + { + first_finished = true; + } + } + } + + if(i == second_bit) // positions[i] is one + { + handleOne(encoder, run, positions[i]); + second_bit = second->selectNext(); + } + else // positions[i] is zero + { + if(run.second != 0) + { + encoder.setRun(run.first, run.second); + run.second = 0; + } + } + } + + while(!first_finished) + { + handleRun(encoder, run, first_run, size); + if(first->hasNext()) + { + first_run = first->selectNextRun(size); + first_run.first += n; + first_run.second++; + } + else { break; } + } + + if(run.second != 0) + { + encoder.setRun(run.first, run.second); + } + + delete first; delete second; + return new RLEVector(encoder, size); +} + +//-------------------------------------------------------------------------- + +DeltaVector* +mergeVectors(DeltaVector* first, DeltaVector* second, usint* positions, usint n, usint size, usint block_size) +{ + if((first == 0 && second == 0) || positions == 0) { return 0; } + + usint first_bit; + bool first_finished; + if(first == 0) + { + first_bit = 0; + first_finished = true; + } + else + { + first_bit = first->select(0); + first_finished = false; + } + + usint second_bit; + if(second == 0) + { + second_bit = n; + } + else + { + second_bit = second->select(0); + } + + DeltaEncoder encoder(block_size); + for(usint i = 0; i < n; i++, first_bit++) + { + while(!first_finished && first_bit < positions[i]) + { + encoder.setBit(first_bit); + if(first->hasNext()) + { + first_bit = first->selectNext() + i; + } + else + { + first_finished = true; + } + } + + if(i == second_bit) // positions[i] is one + { + encoder.setBit(positions[i]); + second_bit = second->selectNext(); + } + } + + while(!first_finished) + { + encoder.setBit(first_bit); + if(!first->hasNext()) { break; } + first_bit = first->selectNext() + n; + } + + delete first; delete second; + return new DeltaVector(encoder, size); +} + + +} // namespace CSA diff --git a/incbwt/bits/vectors.h b/incbwt/bits/vectors.h new file mode 100644 index 0000000..dfd4a21 --- /dev/null +++ b/incbwt/bits/vectors.h @@ -0,0 +1,25 @@ +#ifndef VECTORS_H +#define VECTORS_H + +#include "deltavector.h" +#include "rlevector.h" + + +namespace CSA +{ + + +/* + These functions merge two vectors using marked positions. + The original vectors are deleted. +*/ + +RLEVector* mergeVectors(RLEVector* first, RLEVector* second, usint* positions, usint n, usint size, usint block_size); + +DeltaVector* mergeVectors(DeltaVector* first, DeltaVector* second, usint* positions, usint n, usint size, usint block_size); + + +} // namespace CSA + + +#endif // VECTORS_H diff --git a/incbwt/dependencies.mk b/incbwt/dependencies.mk new file mode 100644 index 0000000..49ec47b --- /dev/null +++ b/incbwt/dependencies.mk @@ -0,0 +1,27 @@ +rlcsa.o: rlcsa.cpp rlcsa.h bits/vectors.h bits/deltavector.h \ + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h sasamples.h misc/definitions.h bits/bitbuffer.h \ + bits/deltavector.h misc/parameters.h misc/definitions.h misc/utils.h \ + qsufsort/qsufsort.h qsufsort/../misc/definitions.h +rlcsa_builder.o: rlcsa_builder.cpp rlcsa_builder.h rlcsa.h bits/vectors.h \ + bits/deltavector.h bits/bitvector.h bits/../misc/definitions.h \ + bits/bitbuffer.h bits/rlevector.h sasamples.h misc/definitions.h \ + bits/bitbuffer.h bits/deltavector.h misc/parameters.h \ + misc/definitions.h +sasamples.o: sasamples.cpp sasamples.h misc/definitions.h \ + bits/bitbuffer.h bits/../misc/definitions.h bits/deltavector.h \ + bits/bitvector.h bits/bitbuffer.h misc/utils.h misc/definitions.h +bitvector.o: bits/bitvector.cpp bits/bitvector.h \ + bits/../misc/definitions.h bits/bitbuffer.h +deltavector.o: bits/deltavector.cpp bits/deltavector.h bits/bitvector.h \ + bits/../misc/definitions.h bits/bitbuffer.h +rlevector.o: bits/rlevector.cpp bits/rlevector.h bits/bitvector.h \ + bits/../misc/definitions.h bits/bitbuffer.h bits/../misc/utils.h \ + bits/../misc/definitions.h +vectors.o: bits/vectors.cpp bits/vectors.h bits/deltavector.h \ + bits/bitvector.h bits/../misc/definitions.h bits/bitbuffer.h \ + bits/rlevector.h bits/../misc/utils.h bits/../misc/definitions.h +parameters.o: misc/parameters.cpp misc/parameters.h misc/definitions.h +utils.o: misc/utils.cpp misc/utils.h misc/definitions.h +qsufsort.o: qsufsort/qsufsort.c qsufsort/qsufsort.h \ + qsufsort/../misc/definitions.h diff --git a/incbwt/misc/definitions.h b/incbwt/misc/definitions.h new file mode 100644 index 0000000..4903af2 --- /dev/null +++ b/incbwt/misc/definitions.h @@ -0,0 +1,67 @@ +#ifndef DEFINITIONS_H +#define DEFINITIONS_H + +#include +#include + + +namespace CSA +{ + + +#ifdef MASSIVE_DATA_RLCSA +typedef unsigned long usint; +typedef signed long sint; +#else +typedef unsigned int usint; +typedef signed int sint; +#endif + + +typedef unsigned char uchar; +typedef std::pair pair_type; + + +inline usint length(usint n) +{ + usint b = 0; + while(n > 0) { b++; n >>= 1; } + return b; +} + +inline bool isEmpty(const pair_type& data) +{ + return (data.first > data.second); +} + +inline usint length(const pair_type& data) +{ + return data.second + 1 - data.first; +} + + +const usint CHARS = ((usint)1 << CHAR_BIT); +const usint MEGABYTE = 1048576; +const usint MILLION = 1000000; +const usint WORD_BITS = CHAR_BIT * sizeof(usint); +const usint WORD_MAX = ~((usint)0); + +const pair_type EMPTY_PAIR = pair_type(1, 0); + + +// Previous GET was broken when BITS == WORD_BITS +// Current version works for usints and less +//#define GET(FIELD, BITS) ((FIELD) & ((1 << (BITS)) - 1)) +#define GET(FIELD, BITS) ((FIELD) & (WORD_MAX >> (WORD_BITS - (BITS)))) +#define LOWER(FIELD, N) ((FIELD) >> (N)) +#define HIGHER(FIELD, N) ((FIELD) << (N)) + +#define BITS_TO_BYTES(BITS) (((BITS) + CHAR_BIT - 1) / CHAR_BIT) +#define BYTES_TO_WORDS(BYTES) (((BYTES) + sizeof(usint) - 1) / sizeof(usint)) +#define BITS_TO_WORDS(BITS) (((BITS) + WORD_BITS - 1) / WORD_BITS) + + +} // namespace CSA + + +#endif diff --git a/incbwt/misc/parameters.cpp b/incbwt/misc/parameters.cpp new file mode 100644 index 0000000..7f7d0dd --- /dev/null +++ b/incbwt/misc/parameters.cpp @@ -0,0 +1,108 @@ +#include +#include +#include + +#include "parameters.h" + + +namespace CSA +{ + + +Parameters::Parameters() +{ +} + +Parameters::~Parameters() +{ +} + +bool +Parameters::contains(const std::string& key) +{ + return (this->parameters.find(key) != this->parameters.end()); +} + +usint +Parameters::get(const std::string& key) +{ + std::map::iterator iter = this->parameters.find(key); + if(iter == this->parameters.end()) { return 0; } + return iter->second; +} + +usint +Parameters::get(const parameter_type& param) +{ + return this->get(param.first); +} + +void +Parameters::set(const std::string& key, usint value) +{ + this->parameters[key] = value; +} + +void +Parameters::set(const parameter_type& param) +{ + this->set(param.first, param.second); +} + +void +Parameters::read(std::ifstream& file) +{ + while(file) + { + std::string key; + std::string c; + usint value; + + file >> key >> c >> value; + if(c == "=") { this->parameters[key] = value; } + } +} + +void Parameters::read(const std::string& file_name) +{ + std::ifstream file(file_name.c_str(), std::ios_base::binary); + if(!file) + { + std::cerr << "Cannot open parameter file " << file_name << " for reading!" << std::endl; + return; + } + this->read(file); + file.close(); +} + +void +Parameters::print() +{ + this->write(std::cout); + std::cout << std::endl; +} + +void +Parameters::write(std::ostream& stream) +{ + for(std::map::iterator iter = this->parameters.begin(); iter != this->parameters.end(); iter++) + { + stream << iter->first << " = " << iter->second << std::endl; + } +} + +void +Parameters::write(const std::string& file_name) +{ + std::ofstream file(file_name.c_str(), std::ios_base::binary); + if(!file) + { + std::cerr << "Cannot open parameter file " << file_name << " for writing!" << std::endl; + return; + } + this->write(file); + file.close(); +} + + +} // namespace CSA diff --git a/incbwt/misc/parameters.h b/incbwt/misc/parameters.h new file mode 100644 index 0000000..c5cc7c2 --- /dev/null +++ b/incbwt/misc/parameters.h @@ -0,0 +1,43 @@ +#ifndef PARAMETERS_H +#define PARAMETERS_H + +#include +#include + +#include "definitions.h" + + +namespace CSA +{ + + +typedef std::pair parameter_type; + + +class Parameters +{ + public: + Parameters(); + ~Parameters(); + + bool contains(const std::string& key); + usint get(const std::string& key); + usint get(const parameter_type& param); + void set(const std::string& key, usint value); + void set(const parameter_type& param); + + void read(std::ifstream& file); + void read(const std::string& file_name); + void print(); + void write(std::ostream& stream); + void write(const std::string& file_name); + + private: + std::map parameters; +}; + + +} // namespace CSA + + +#endif diff --git a/incbwt/misc/utils.cpp b/incbwt/misc/utils.cpp new file mode 100644 index 0000000..ae49883 --- /dev/null +++ b/incbwt/misc/utils.cpp @@ -0,0 +1,55 @@ +#include "utils.h" + + +namespace CSA +{ + + +std::streamoff +fileSize(std::ifstream& file) +{ + std::streamoff curr = file.tellg(); + + file.seekg(0, std::ios::end); + std::streamoff size = file.tellg(); + file.seekg(0, std::ios::beg); + size -= file.tellg(); + + file.seekg(curr, std::ios::beg); + return size; +} + +std::streamoff +fileSize(std::ofstream& file) +{ + std::streamoff curr = file.tellp(); + + file.seekp(0, std::ios::end); + std::streamoff size = file.tellp(); + file.seekp(0, std::ios::beg); + size -= file.tellp(); + + file.seekp(curr, std::ios::beg); + return size; +} + +std::ostream& +operator<<(std::ostream& stream, pair_type data) +{ + return stream << "(" << data.first << ", " << data.second << ")"; +} + +void +readRows(std::ifstream& file, std::list& rows, bool skipEmptyRows) +{ + while(file) + { + std::string buf; + std::getline(file, buf); + if(skipEmptyRows && buf.length() == 0) { continue; } + rows.push_back(buf); + } +} + + +} // namespace CSA diff --git a/incbwt/misc/utils.h b/incbwt/misc/utils.h new file mode 100644 index 0000000..48cd07f --- /dev/null +++ b/incbwt/misc/utils.h @@ -0,0 +1,27 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +#include "definitions.h" + + +namespace CSA +{ + + +std::streamoff fileSize(std::ifstream& file); +std::streamoff fileSize(std::ofstream& file); + + +std::ostream& operator<<(std::ostream& stream, pair_type data); + + +void readRows(std::ifstream& file, std::list& rows, bool skipEmptyRows); + + +} // namespace CSA + + +#endif // UTILS_H diff --git a/incbwt/qsufsort/qsufsort.c b/incbwt/qsufsort/qsufsort.c new file mode 100644 index 0000000..3cc1cbe --- /dev/null +++ b/incbwt/qsufsort/qsufsort.c @@ -0,0 +1,315 @@ +/* qsufsort.c + Copyright 1999, N. Jesper Larsson, all rights reserved. + + This file contains an implementation of the algorithm presented in "Faster + Suffix Sorting" by N. Jesper Larsson (jesper@@cs.lth.se) and Kunihiko + Sadakane (sada@@is.s.u-tokyo.ac.jp). + + This software may be used freely for any purpose. However, when distributed, + the original source must be clearly stated, and, when the source code is + distributed, the copyright notice must be retained and any alterations in + the code must be clearly marked. No warranty is given regarding the quality + of this software.*/ + +/* + Replaced int -> sint for massive data support and moved into namespace CSA. + + - Jouni Siren 2009-03-16 +*/ + +#include +#include "qsufsort.h" + + +namespace CSA +{ + + +static sint *I, /* group array, ultimately suffix array.*/ + *V, /* inverse array, ultimately inverse of I.*/ + r, /* number of symbols aggregated by transform.*/ + h; /* length of already-sorted prefixes.*/ + +#define KEY(p) (V[*(p)+(h)]) +#define SWAP(p, q) (tmp=*(p), *(p)=*(q), *(q)=tmp) +#define MED3(a, b, c) (KEY(a)KEY(c) ? b : KEY(a)>KEY(c) ? c : (a))) + +/* Subroutine for select_sort_split and sort_split. Sets group numbers for a + group whose lowest position in I is pl and highest position is pm.*/ + +static void update_group(sint *pl, sint *pm) +{ + sint g; + + g=pm-I; /* group number.*/ + V[*pl]=g; /* update group number of first position.*/ + if (pl==pm) + *pl=-1; /* one element, sorted group.*/ + else + do /* more than one element, unsorted group.*/ + V[*++pl]=g; /* update group numbers.*/ + while (pl>1); /* small arrays, middle element.*/ + if (n>7) { + pl=p; + pn=p+n-1; + if (n>40) { /* big arrays, pseudomedian of 9.*/ + s=n>>3; + pl=MED3(pl, pl+s, pl+s+s); + pm=MED3(pm-s, pm, pm+s); + pn=MED3(pn-s-s, pn-s, pn); + } + pm=MED3(pl, pm, pn); /* midsize arrays, median of 3.*/ + } + return KEY(pm); +} + +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This + function is based on Program 7.*/ + +static void sort_split(sint *p, sint n) +{ + sint *pa, *pb, *pc, *pd, *pl, *pm, *pn; + sint f, v, s, t, tmp; + + if (n<7) { /* multi-selection sort smallest arrays.*/ + select_sort_split(p, n); + return; + } + + v=choose_pivot(p, n); + pa=pb=p; + pc=pd=p+n-1; + while (1) { /* split-end partition.*/ + while (pb<=pc && (f=KEY(pb))<=v) { + if (f==v) { + SWAP(pa, pb); + ++pa; + } + ++pb; + } + while (pc>=pb && (f=KEY(pc))>=v) { + if (f==v) { + SWAP(pc, pd); + --pd; + } + --pc; + } + if (pb>pc) + break; + SWAP(pb, pc); + ++pb; + --pc; + } + pn=p+n; + if ((s=pa-p)>(t=pb-pa)) + s=t; + for (pl=p, pm=pb-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + if ((s=pd-pc)>(t=pn-pd-1)) + s=t; + for (pl=pb, pm=pn-s; s; --s, ++pl, ++pm) + SWAP(pl, pm); + + s=pb-pa; + t=pd-pc; + if (s>0) + sort_split(p, s); + update_group(p+s, p+n-t-1); + if (t>0) + sort_split(p+n-t, t); +} + +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. (This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. + + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void bucketsort(sint *x, sint *p, sint n, sint k) +{ + sint *pi, i, c, d, g; + + for (pi=p; pi=p; --pi) { + d=x[c=*pi]; /* c is position, d is next in list.*/ + x[c]=g=i; /* last position equals group number.*/ + if (d>=0) { /* if more than one element in group.*/ + p[i--]=c; /* p is permutation for the sorted x.*/ + do { + d=x[c=d]; /* next in linked list.*/ + x[c]=g; /* group number in x.*/ + p[i--]=c; /* permutation in p.*/ + } while (d>=0); + } else + p[i--]=-1; /* one element, sorted group.*/ + } +} + +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum value + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. Only x[n] is 0.*/ + +static sint transform(sint *x, sint *p, sint n, sint k, sint l, sint q) +{ + sint b, c, d, e, i, j, m, s; + sint *pi, *pj; + + for (s=0, i=k-l; i; i>>=1) + ++s; /* s is number of bits in old symbol.*/ + e=INT_MAX>>s; /* e is for overflow checking.*/ + for (b=d=r=0; r=k-l) { /* if bucketing possible,*/ + j=transform(V, I, n, k, l, n); + bucketsort(V, I, n, j); /* bucketsort on first r positions.*/ + } else { + transform(V, I, n, k, l, INT_MAX); + for (i=0; i<=n; ++i) + I[i]=i; /* initialize I with suffix numbers.*/ + h=0; + sort_split(I, n+1); /* quicksort on first r positions.*/ + } + h=r; /* number of symbols aggregated by transform.*/ + + while (*I>=-n) { + pi=I; /* pi is first position of group.*/ + sl=0; /* sl is negated length of sorted groups.*/ + do { + if ((s=*pi)<0) { + pi-=s; /* skip over sorted group.*/ + sl+=s; /* add negated length to sl.*/ + } else { + if (sl) { + *(pi+sl)=sl; /* combine sorted groups before pi.*/ + sl=0; + } + pk=I+V[s]+1; /* pk-1 is last position of unsorted group.*/ + sort_split(pi, pk-pi); + pi=pk; /* next group.*/ + } + } while (pi<=I+n); + if (sl) /* if the array ends with a sorted group.*/ + *(pi+sl)=sl; /* combine sorted groups at end of I.*/ + h=2*h; /* double sorted-depth.*/ + } + + for (i=0; i<=n; ++i) /* reconstruct suffix array from inverse.*/ + I[V[i]]=i; +} + + +} // namespace CSA diff --git a/incbwt/qsufsort/qsufsort.h b/incbwt/qsufsort/qsufsort.h new file mode 100644 index 0000000..f5a83f5 --- /dev/null +++ b/incbwt/qsufsort/qsufsort.h @@ -0,0 +1,17 @@ +#ifndef _QSUFSORT_H_ +#define _QSUFSORT_H_ + +#include "../misc/definitions.h" + + +namespace CSA +{ + + +void suffixsort(sint *, sint *, sint, sint, sint); + + +} // namespace CSA + + +#endif diff --git a/incbwt/rlcsa.cpp b/incbwt/rlcsa.cpp new file mode 100644 index 0000000..8e3d2d9 --- /dev/null +++ b/incbwt/rlcsa.cpp @@ -0,0 +1,839 @@ +#include +#include +#include +#include +#include + +#include "rlcsa.h" +#include "misc/utils.h" +#include "qsufsort/qsufsort.h" + + +namespace CSA +{ + + +RLCSA::RLCSA(const std::string& base_name, bool print) : + ok(false), + sa_samples(0), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + std::string array_name = base_name + ARRAY_EXTENSION; + std::ifstream array_file(array_name.c_str(), std::ios_base::binary); + if(!array_file) + { + std::cerr << "RLCSA: Error opening Psi array file!" << std::endl; + return; + } + + usint distribution[CHARS]; + array_file.read((char*)distribution, CHARS * sizeof(usint)); + this->buildCharIndexes(distribution); + + Parameters parameters; + parameters.read(base_name + PARAMETERS_EXTENSION); + for(usint c = 0; c < CHARS; c++) + { + if(!isEmpty(this->index_ranges[c])) { this->array[c] = new RLEVector(array_file); } + } + + this->end_points = new DeltaVector(array_file); + this->number_of_sequences = this->end_points->getNumberOfItems(); + + array_file.read((char*)&(this->sample_rate), sizeof(this->sample_rate)); + array_file.close(); + + this->support_locate = parameters.get(SUPPORT_LOCATE); + this->support_display = parameters.get(SUPPORT_DISPLAY); + + if(this->support_locate || this->support_display) + { + std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION; + std::ifstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary); + if(!sa_sample_file) + { + std::cerr << "RLCSA: Error opening suffix array sample file!" << std::endl; + return; + } + this->sa_samples = new SASamples(sa_sample_file, this->sample_rate); + sa_sample_file.close(); + } + + if(print) { parameters.print(); } + + this->ok = true; +} + +RLCSA::RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data) : + ok(false), + sa_samples(0), + sample_rate(sa_sample_rate), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + if(!data || bytes == 0) + { + std::cerr << "RLCSA: No input data given!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + if(block_size < 2 * sizeof(usint) || block_size % sizeof(usint) != 0) + { + std::cerr << "RLCSA: Block size must be a multiple of " << sizeof(usint) << " bytes!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + + + // Do we store SA samples? + if(this->sample_rate > 0) + { + this->support_locate = this->support_display = true; + } + else + { + this->support_locate = this->support_display = false; + this->sample_rate = 1; + } + + + // Determine the number of sequences and mark their end points. + if(multiple_sequences) + { + DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE); + this->number_of_sequences = 0; + usint marker = 0; + usint padding = 0, chars_encountered = 0; + + for(usint i = 0; i < bytes; i++) + { + if(data[i] == 0) + { + if(i == marker) { break; } // Empty sequence. + this->number_of_sequences++; + marker = i + 1; + + usint pos = chars_encountered + padding - 1; + endings.setBit(pos); + padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - chars_encountered; + } + else + { + chars_encountered++; + } + } + + if(this->number_of_sequences == 0 || marker != bytes) + { + std::cerr << "RLCSA: Collection must consist of 0-terminated nonempty sequences!" << std::endl; + if(delete_data) { delete[] data; } + return; + } + this->end_points = new DeltaVector(endings, chars_encountered + padding); + } + else + { + this->number_of_sequences = 1; + DeltaEncoder endings(RLCSA::ENDPOINT_BLOCK_SIZE, RLCSA::ENDPOINT_BLOCK_SIZE); + usint pos = bytes - 1; + endings.setBit(pos); + usint padding = ((pos + this->sample_rate) / this->sample_rate) * this->sample_rate - bytes; + this->end_points = new DeltaVector(endings, bytes + padding); + } + + + // Build character tables etc. + usint distribution[CHARS]; + sint low = CHARS, high = 0; + for(usint c = 0; c < CHARS; c++) { distribution[c] = 0; } + for(usint i = 0; i < bytes; i++) + { + if(data[i] < low) { low = data[i]; } + if(data[i] > high) { high = data[i]; } + distribution[(usint)data[i]]++; + } + if(multiple_sequences) + { + distribution[0] = 0; + low = 0; + high = high + this->number_of_sequences; + } + this->buildCharIndexes(distribution); + + + // Build suffix array. + sint* inverse = new sint[bytes + 1]; + if(multiple_sequences) + { + sint zeros = 0; + for(usint i = 0; i < bytes; i++) + { + if(data[i] == 0) + { + inverse[i] = zeros; + zeros++; + } + else + { + inverse[i] = (sint)data[i] + this->number_of_sequences; + } + } + } + else + { + for(usint i = 0; i < bytes; i++) { inverse[i] = (sint)data[i]; } + } + if(delete_data) { delete[] data; } + sint* sa = new sint[bytes + 1]; + suffixsort(inverse, sa, bytes, high + 1, low); + + + // Sample SA. + usint incr = (multiple_sequences ? this->number_of_sequences + 1 : 1); // No e_of_s markers in SA. + if(this->support_locate || this->support_display) + { + if(multiple_sequences) + { + std::cout << "We shouldn't be here!" << std::endl; + // Real SA starts at sa + incr. + // for each sequence + // find starting position + // sample relative to starting position + // sort samples to SA order + } + else + { + this->sa_samples = new SASamples((usint*)(sa + incr), this->data_size, this->sample_rate); + } + } + + + // Build Psi. + for(usint i = 0; i <= bytes; i++) + { + sa[i] = inverse[(sa[i] + 1) % (bytes + 1)]; + } + delete[] inverse; + + + // Build RLCSA. + usint decr = (multiple_sequences ? 1 : 0); // Ignore the last e_of_s marker if multiple sequences. + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) { this->array[c] = 0; continue; } + + usint* curr = (usint*)(sa + index_ranges[c].first + incr); + usint* limit = (usint*)(sa + index_ranges[c].second + incr + 1); + RLEEncoder encoder(block_size); + pair_type run(*curr, 1); + curr++; + + for(; curr < limit; curr++) + { + if(*curr == run.first + run.second) { run.second++; } + else + { + encoder.setRun(run.first - decr, run.second); + run = pair_type(*curr, 1); + } + } + encoder.setRun(run.first - decr, run.second); + + this->array[c] = new RLEVector(encoder, this->data_size + incr - decr); + } + delete[] sa; + + + this->ok = true; +} + +RLCSA::RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size) : + ok(false), + sa_samples(0), + end_points(0) +{ + for(usint c = 0; c < CHARS; c++) { this->array[c] = 0; } + + if(!index.isOk() || !increment.isOk()) + { + return; // Fail silently. Actual error has already been reported. + } + if(positions == 0) + { + std::cerr << "RLCSA: Positions for insertions not available!" << std::endl; + return; + } + if(index.sample_rate != increment.sample_rate) + { + std::cerr << "RLCSA: Cannot combine indexes with different sample rates!" << std::endl; + std::cout << "Index: " << index.sample_rate << ", increment: " << increment.sample_rate << std::endl; + return; + } + + if(index.sa_samples == 0 || increment.sa_samples == 0) + { + this->support_locate = this->support_display = false; + } + else + { + this->support_locate = this->support_display = true; + } + + + // Build character tables etc. + usint distribution[CHARS]; + for(usint c = 0; c < CHARS; c++) + { + distribution[c] = length(index.index_ranges[c]) + length(increment.index_ranges[c]); + } + this->buildCharIndexes(distribution); + this->sample_rate = index.sample_rate; + + + // Combine end points of sequences. + this->number_of_sequences = index.number_of_sequences + increment.number_of_sequences; + DeltaEncoder* endings = new DeltaEncoder(RLCSA::ENDPOINT_BLOCK_SIZE); + + endings->setBit(index.end_points->select(0)); + for(usint i = 1; i < index.number_of_sequences; i++) + { + endings->setBit(index.end_points->selectNext()); + } + usint sum = index.end_points->getSize(); + delete index.end_points; index.end_points = 0; + + endings->setBit(sum + increment.end_points->select(0)); + for(usint i = 1; i < increment.number_of_sequences; i++) + { + endings->setBit(sum + increment.end_points->selectNext()); + } + sum += increment.end_points->getSize(); + delete increment.end_points; increment.end_points = 0; + + this->end_points = new DeltaVector(*endings, sum); + delete endings; + + + // Combine Psi. + usint psi_size = this->data_size + this->number_of_sequences; + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) { this->array[c] = 0; continue; } + this->array[c] = mergeVectors(index.array[c], increment.array[c], positions, increment.data_size + increment.number_of_sequences, psi_size, block_size); + index.array[c] = 0; + increment.array[c] = 0; + + if(this->array[c] == 0) + { + std::cerr << "RLCSA: Merge failed for vectors " << c << "!" << std::endl; + return; + } + } + + + // Combine suffix array samples. + if(this->support_locate || this->support_display) + { + positions += increment.number_of_sequences; + for(usint i = 0; i < increment.data_size; i++) + { + positions[i] -= this->number_of_sequences; + } + this->sa_samples = new SASamples(*(index.sa_samples), *(increment.sa_samples), positions, increment.data_size); + } + + this->ok = true; +} + +RLCSA::~RLCSA() +{ + for(usint c = 0; c < CHARS; c++) { delete this->array[c]; } + delete this->sa_samples; + delete this->end_points; +} + +void +RLCSA::writeTo(const std::string& base_name) +{ + std::string array_name = base_name + ARRAY_EXTENSION; + std::ofstream array_file(array_name.c_str(), std::ios_base::binary); + if(!array_file) + { + std::cerr << "RLCSA: Error creating Psi array file!" << std::endl; + return; + } + + usint distribution[CHARS]; + for(usint c = 0; c < CHARS; c++) + { + distribution[c] = length(this->index_ranges[c]); + } + array_file.write((char*)distribution, CHARS * sizeof(usint)); + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c] != 0) + { + this->array[c]->writeTo(array_file); + } + } + + this->end_points->writeTo(array_file); + array_file.write((char*)&(this->sample_rate), sizeof(this->sample_rate)); + array_file.close(); + + if(this->support_locate || this->support_display) + { + std::string sa_sample_name = base_name + SA_SAMPLES_EXTENSION; + std::ofstream sa_sample_file(sa_sample_name.c_str(), std::ios_base::binary); + if(!sa_sample_file) + { + std::cerr << "RLCSA: Error creating suffix array sample file!" << std::endl; + return; + } + + this->sa_samples->writeTo(sa_sample_file); + sa_sample_file.close(); + } +} + +bool +RLCSA::isOk() +{ + return this->ok; +} + +//-------------------------------------------------------------------------- + +pair_type +RLCSA::count(const std::string& pattern) +{ + if(pattern.length() == 0) { return pair_type(0, this->data_size - 1); } + + pair_type index_range = this->index_ranges[(usint)*(pattern.rbegin())]; + if(isEmpty(index_range)) { return index_range; } + index_range.first += this->number_of_sequences; + index_range.second += this->number_of_sequences; + + for(std::string::const_reverse_iterator iter = ++pattern.rbegin(); iter != pattern.rend(); iter++) + { + RLEVector* vector = this->array[(usint)*iter]; + usint start = this->index_ranges[(usint)*iter].first; + + index_range.first = start + vector->rank(index_range.first, true) - 1 + this->number_of_sequences; + index_range.second = start + vector->rank(index_range.second) - 1 + this->number_of_sequences; + + if(isEmpty(index_range)) { return index_range; } + } + + // Suffix array indexes are 0-based. + index_range.first -= this->number_of_sequences; + index_range.second -= this->number_of_sequences; + + return index_range; +} + +void +RLCSA::reportPositions(uchar* data, usint length, usint* positions) +{ + if(data == 0 || length == 0 || positions == 0) { return; } + + usint current = this->number_of_sequences - 1; + positions[length] = current; // "immediately after current" + for(sint i = (sint)(length - 1); i >= 0; i--) + { +// positions[i] = current; // "immediately after current" + usint c = (usint)data[i]; + if(array[c]) + { + current = this->index_ranges[c].first + this->array[c]->rank(current) - 1 + this->number_of_sequences; + } + else + { + if(c < this->text_chars[0]) // No previous characters either. + { + current = this->number_of_sequences - 1; + } + else + { + current = this->index_ranges[c].first - 1 + this->number_of_sequences; + } + } + positions[i] = current; // "immediately after current" + } +} + +//-------------------------------------------------------------------------- + +LocateItem* +RLCSA::locate(pair_type range) +{ + if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size) { return 0; } + + LocateItem* data = new LocateItem[range.second + 1 - range.first]; + if(!data) { return 0; } + this->locateUnsafe(range, data); + + return data; +} + +LocateItem* +RLCSA::locate(pair_type range, LocateItem* data) +{ + if(!(this->support_locate) || isEmpty(range) || range.second >= this->data_size || data == 0) { return 0; } + this->locateUnsafe(range, data); + return data; +} + +void +RLCSA::locateUnsafe(pair_type range, LocateItem* data) +{ + usint items = range.second + 1 - range.first; + for(usint i = 0, j = range.first; i < items; i++, j++) + { + data[i].value = j + this->number_of_sequences; + data[i].offset = 0; + data[i].found = false; + } + + bool found = false; + while(!found) + { + found = true; + pair_type run = EMPTY_PAIR; + for(usint i = 0; i < items; i++) + { + if(data[i].found) + { + continue; // The run might continue after this. + } + else if(isEmpty(run)) + { + run = pair_type(i, i); + } + else if(data[i].value - data[run.first].value == i - run.first) + { + run.second = i; + } + else + { + found &= this->processRun(run, data); + run = pair_type(i, i); + } + } + if(!isEmpty(run)) { found &= this->processRun(run, data); } + } +} + +bool +RLCSA::processRun(pair_type run, LocateItem* data) +{ + bool found = true; + usint run_start = 0, run_left = 0; + pair_type next_sample = pair_type(0, 0); + + for(usint i = run.first; i <= run.second; i++) + { + if(data[i].found) + { + if(run_left > 0) { run_left--; } + continue; + } + if(data[i].value < this->number_of_sequences) // Implicit sample here. + { + data[i].value = this->end_points->select(data[i].value) + 1 - data[i].offset; + data[i].found = true; + if(run_left > 0) { run_left--; } + continue; + } + if(next_sample.first < data[i].value) // Need another sample. + { + next_sample = this->sa_samples->getFirstSampleAfter(data[i].value - this->number_of_sequences); + next_sample.first += this->number_of_sequences; + } + if(data[i].value < next_sample.first) // No sample found for current position. + { + if(run_left > 0) + { + data[i].value = data[run_start].value + i - run_start; + run_left--; + } + else + { + pair_type value = this->psi(data[i].value - this->number_of_sequences, run.second - i); + data[i].value = value.first; + run_left = value.second; + run_start = i; + } + data[i].offset++; + found = false; + } + else // Sampled position found. + { + data[i].value = this->sa_samples->getSample(next_sample.second) - data[i].offset; + data[i].found = true; + if(run_left > 0) { run_left--; } + } + } + return found; +} + +usint +RLCSA::locate(usint index) +{ + if(!(this->support_locate) || index >= this->data_size) { return this->data_size; } + + usint offset = 0; + index += this->number_of_sequences; + while(true) + { + if(index < this->number_of_sequences) // Implicit sample here + { + return this->end_points->select(index) + 1 - offset; + } + pair_type next_sample = this->sa_samples->getFirstSampleAfter(index - this->number_of_sequences); + next_sample.first += this->number_of_sequences; + if(next_sample.first == index) + { + return this->sa_samples->getSample(next_sample.second) - offset; + } + index = this->psi(index - this->number_of_sequences); + offset++; + } +} + +//-------------------------------------------------------------------------- + +uchar* +RLCSA::display(usint sequence, pair_type range) +{ + if(!(this->support_display) || isEmpty(range)) { return 0; } + + pair_type seq_range = this->getSequence(sequence); + if(isEmpty(seq_range)) { return 0; } + + range.first += seq_range.first; range.second += seq_range.first; + if(range.second > seq_range.second) { return 0; } + + uchar* data = new uchar[range.second + 1 - range.first]; + if(!data) { return 0; } + this->displayUnsafe(range, data); + + return data; +} + +uchar* +RLCSA::display(usint sequence, pair_type range, uchar* data) +{ + if(!(this->support_display) || isEmpty(range) || data == 0) { return 0; } + + pair_type seq_range = this->getSequence(sequence); + if(isEmpty(seq_range)) { return 0; } + + range.first += seq_range.first; range.second += seq_range.first; + if(range.second > seq_range.second) { return 0; } + + this->displayUnsafe(range, data); + return data; +} + +void +RLCSA::displayUnsafe(pair_type range, uchar* data) +{ + usint i = range.first - range.first % this->sa_samples->getSampleRate(); + + usint pos = this->sa_samples->inverseSA(i); + + for(; i < range.first; i++) + { + pos = this->psi(pos) - this->number_of_sequences; + } + for(; i <= range.second; i++) + { + data[i - range.first] = this->getCharacter(pos); + pos = this->psi(pos) - this->number_of_sequences; + } +} + +//-------------------------------------------------------------------------- + +void +RLCSA::decompressInto(std::ofstream& psi_file) +{ + for(usint c = 0; c < CHARS; c++) + { + if(!(this->array[c])) { continue; } + usint value = this->array[c]->select(0); + psi_file.write((char*)&(value), sizeof(value)); + while(this->array[c]->hasNext()) + { + value = this->array[c]->selectNext(); + psi_file.write((char*)&value, sizeof(value)); + } + } +} + +uchar* +RLCSA::readBWT() +{ + usint n = this->data_size + this->number_of_sequences; + + uchar* bwt = new uchar[n]; + memset(bwt, 0, n); + + for(usint c = 0; c < CHARS; c++) + { + RLEVector* vector = this->array[c]; + if(vector != 0) + { + bwt[vector->select(0)] = c; + while(vector->hasNext()) { bwt[vector->selectNext()] = c; } + } + } + + return bwt; +} + +//-------------------------------------------------------------------------- + +usint +RLCSA::psi(usint index) +{ + if(index >= this->data_size) + { + return this->data_size + this->number_of_sequences; + } + + usint c = this->getCharacter(index); + return this->array[c]->select(index - this->index_ranges[c].first); +} + +pair_type +RLCSA::psi(usint index, usint max_length) +{ + if(index >= this->data_size) + { + return pair_type(this->data_size + this->number_of_sequences, 0); + } + + usint c = this->getCharacter(index); + return this->array[c]->selectRun(index - this->index_ranges[c].first, max_length); +} + +//-------------------------------------------------------------------------- + +pair_type +RLCSA::getSequence(usint number) +{ + if(number >= this->number_of_sequences) { return EMPTY_PAIR; } + + pair_type result; + if(number == 0) + { + result.first = 0; + result.second = this->end_points->select(number); + } + else + { + if(this->sa_samples == 0) { return EMPTY_PAIR; } + usint d = this->sa_samples->getSampleRate(); + result.first = d * ((this->end_points->select(number - 1) / d) + 1); + result.second = this->end_points->selectNext(); + } + + return result; +} + +usint +RLCSA::reportSize(bool print) +{ + usint bytes = 0, temp = 0; + + for(usint c = 0; c < CHARS; c++) + { + if(this->array[c]) { temp += this->array[c]->reportSize(); } + } + if(print) { std::cout << "Psi array: " << temp << std::endl; } + bytes += temp; + + if(this->support_locate || this->support_display) + { + temp = this->sa_samples->reportSize(); + if(print) { std::cout << "SA samples: " << temp << std::endl; } + bytes += temp; + } + + temp = sizeof(*this) + this->end_points->reportSize(); + if(print) { std::cout << "RLCSA overhead: " << temp << std::endl; } + bytes += temp; + + if(print) + { + std::cout << "Total size: " << bytes << std::endl; + std::cout << std::endl; + } + + return bytes; +} + +//-------------------------------------------------------------------------- + +void +RLCSA::buildCharIndexes(usint* distribution) +{ + this->data_size = buildRanges(distribution, this->index_ranges); + + usint i = 0, c = 0; + for(; c < CHARS; c++) + { + if(!isEmpty(this->index_ranges[c])) + { + this->text_chars[i] = c; + i++; + } + } + this->chars = i; + + this->index_rate = std::max((this->data_size + CHARS - 1) / CHARS, (usint)1); + usint current = 0; + + for(c = 0, i = 0; c < this->chars; c++) + { + pair_type range = this->index_ranges[this->text_chars[c]]; + while(current <= range.second) + { + this->index_pointers[i] = c; + current += this->index_rate; + i++; + } + } +} + +usint +buildRanges(usint* distribution, pair_type* index_ranges) +{ + if(distribution == 0 || index_ranges == 0) { return 0; } + + usint sum = 0; + for(usint c = 0; c < CHARS; c++) + { + if(distribution[c] == 0) + { + if(sum == 0) { index_ranges[c] = EMPTY_PAIR; } + else { index_ranges[c] = pair_type(sum, sum - 1); } + } + else + { + index_ranges[c].first = sum; + sum += distribution[c]; + index_ranges[c].second = sum - 1; + } + } + + return sum; +} + + +} // namespace CSA diff --git a/incbwt/rlcsa.h b/incbwt/rlcsa.h new file mode 100644 index 0000000..c97f73e --- /dev/null +++ b/incbwt/rlcsa.h @@ -0,0 +1,138 @@ +#ifndef RLCSA_H +#define RLCSA_H + +#include + +#include "bits/vectors.h" +#include "sasamples.h" +#include "misc/parameters.h" + + +namespace CSA +{ + + +const std::string SA_EXTENSION = ".sa"; +const std::string PSI_EXTENSION = ".psi"; +const std::string ARRAY_EXTENSION = ".rlcsa.array"; +const std::string SA_SAMPLES_EXTENSION = ".rlcsa.sa_samples"; +const std::string PARAMETERS_EXTENSION = ".rlcsa.parameters"; + + +const parameter_type RLCSA_BLOCK_SIZE = parameter_type("RLCSA_BLOCK_SIZE", 32); +const parameter_type SAMPLE_RATE = parameter_type("SAMPLE_RATE", 512); +const parameter_type SUPPORT_LOCATE = parameter_type("SUPPORT_LOCATE", 0); +const parameter_type SUPPORT_DISPLAY = parameter_type("SUPPORT_DISPLAY", 0); + + +struct LocateItem +{ + usint value; + usint offset; + bool found; +}; + + + +class RLCSA +{ + public: + const static usint ENDPOINT_BLOCK_SIZE = 16; + + RLCSA(const std::string& base_name, bool print = false); + + /* + If multiple_sequences is true, each 0 is treated as a end of sequence marker. + There must be nonzero characters between the 0s. data[bytes - 1] must also be 0. + */ + RLCSA(uchar* data, usint bytes, usint block_size, usint sa_sample_rate, bool multiple_sequences, bool delete_data); + + // Destroys contents of index and increment. + RLCSA(RLCSA& index, RLCSA& increment, usint* positions, usint block_size); + ~RLCSA(); + + void writeTo(const std::string& base_name); + + bool isOk(); + + // Returns the closed interval of suffix array containing the matches. + pair_type count(const std::string& pattern); + + void reportPositions(uchar* data, usint length, usint* positions); + + // Returns SA[range]. User must free the buffer. Latter version uses buffer provided by the user. + LocateItem* locate(pair_type range); + LocateItem* locate(pair_type range, LocateItem* data); + + // Returns SA[index]. + usint locate(usint index); + + // Returns Ti[range]. User must free the buffer. Latter version uses buffer provided by the user. + uchar* display(usint sequence, pair_type range); + uchar* display(usint sequence, pair_type range, uchar* data); + + // Writes the Psi array into given file. End of sequence markers are not written. + void decompressInto(std::ofstream& psi_file); + + // Returns the BWT of the collection including end of sequence markers. + uchar* readBWT(); + + // Return value includes the implicit end of sequence markers. To get suffix array indexes, + // subtract getNumberOfSequences() from the value. + usint psi(usint index); + pair_type psi(usint index, usint max_length); // This version returns a run. + + inline bool supportsLocate() { return this->support_locate; } + inline bool supportsDisplay() { return this->support_display; } + inline usint getSize() { return this->data_size; } + inline usint getNumberOfSequences() { return this->number_of_sequences; } + + pair_type getSequence(usint number); + + // Returns the size of the data structure. + usint reportSize(bool print = false); + + private: + bool ok; + + RLEVector* array[CHARS]; + SASamples* sa_samples; + + pair_type index_ranges[CHARS]; + usint data_size; + + usint text_chars[CHARS]; // which characters are present in the text + usint chars; + + usint index_pointers[CHARS]; // which of the above is at i * index_rate + usint index_rate; + + bool support_locate, support_display; + usint sample_rate; + + usint number_of_sequences; + DeltaVector* end_points; + + void locateUnsafe(pair_type range, LocateItem* data); + bool processRun(pair_type run, LocateItem* data); + void displayUnsafe(pair_type range, uchar* data); + + inline usint getCharacter(usint pos) + { + usint* curr = &(this->text_chars[this->index_pointers[pos / this->index_rate]]); + while(pos > this->index_ranges[*curr].second) { curr++; } + return *curr; + } + + void buildCharIndexes(usint* distribution); +}; + + +// Returns the total number of characters. +usint buildRanges(usint* distribution, pair_type* index_ranges); + + +} // namespace CSA + + +#endif // RLCSA_H diff --git a/incbwt/rlcsa_builder.cpp b/incbwt/rlcsa_builder.cpp new file mode 100644 index 0000000..43d5a7d --- /dev/null +++ b/incbwt/rlcsa_builder.cpp @@ -0,0 +1,199 @@ +#include + +#include "rlcsa_builder.h" + + +namespace CSA +{ + + +RLCSABuilder::RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size) : + block_size(_block_size), sample_rate(_sample_rate), buffer_size(_buffer_size), + buffer(0) +{ + this->reset(); +} + +RLCSABuilder::~RLCSABuilder() +{ + delete this->index; + delete[] this->buffer; +} + +//-------------------------------------------------------------------------- + +void +RLCSABuilder::insertSequence(char* sequence, usint length, bool delete_sequence) +{ + if(sequence == 0 || length == 0 || !this->ok) + { + if(delete_sequence) { delete[] sequence; } + return; + } + + if(this->buffer == 0) + { + clock_t start = clock(); + RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false); + this->build_time += clock() - start; + this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence); + return; + } + + if(this->buffer_size - this->chars > length) + { + memcpy(this->buffer + this->chars, sequence, length); + if(delete_sequence) { delete[] sequence; } + this->chars += length; + this->buffer[this->chars] = 0; + this->chars++; + } + else + { + this->flush(); + this->buffer = new uchar[this->buffer_size]; + if(length >= this->buffer_size - 1) + { + clock_t start = clock(); + RLCSA* temp = new RLCSA((uchar*)sequence, length, this->block_size, this->sample_rate, false, false); + this->build_time += clock() - start; + this->addRLCSA(temp, (uchar*)sequence, length + 1, delete_sequence); + } + else + { + memcpy(this->buffer + this->chars, sequence, length); + if(delete_sequence) { delete[] sequence; } + this->chars += length; + this->buffer[this->chars] = 0; + this->chars++; + } + } +} + +RLCSA* +RLCSABuilder::getRLCSA() +{ + if(this->chars > 0) { this->flush(); } + + RLCSA* temp = this->index; + this->reset(); + + return temp; +} + +char* +RLCSABuilder::getBWT(usint& length) +{ + if(this->chars > 0) + { + this->flush(); + if(this->buffer_size > 0) { this->buffer = new uchar[this->buffer_size]; } + } + + if(this->index == 0 || !(this->ok)) + { + length = 0; + return 0; + } + + length = this->index->getSize() + this->index->getNumberOfSequences(); + return (char*)(this->index->readBWT()); +} + +bool +RLCSABuilder::isOk() +{ + return this->ok; +} + +double +RLCSABuilder::getBuildTime() +{ + return this->build_time / (double)CLOCKS_PER_SEC; +} + +double +RLCSABuilder::getSearchTime() +{ + return this->search_time / (double)CLOCKS_PER_SEC; +} + +double +RLCSABuilder::getMergeTime() +{ + return this->merge_time / (double)CLOCKS_PER_SEC; +} + +//-------------------------------------------------------------------------- + +void +RLCSABuilder::flush() +{ + clock_t start = clock(); + RLCSA* temp = new RLCSA(this->buffer, this->chars, this->block_size, this->sample_rate, true, (this->index == 0)); + this->build_time += clock() - start; + this->addRLCSA(temp, this->buffer, this->chars, true); + this->buffer = 0; this->chars = 0; +} + +void +RLCSABuilder::addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence) +{ + if(this->index != 0) + { + clock_t start = clock(); + + usint* positions = new usint[length]; + usint begin = 0; + for(usint i = 0; i < length - 1; i++) + { + if(sequence[i] == 0) + { + this->index->reportPositions(&(sequence[begin]), i - begin, &(positions[begin])); + begin = i + 1; + } + } + this->index->reportPositions(&(sequence[begin]), length - 1 - begin, &(positions[begin])); + + std::sort(positions, positions + length); + for(usint i = 0; i < length; i++) + { + positions[i] += i + 1; // +1 because the insertion will be after positions[i] + } + if(delete_sequence) { delete[] sequence; } + + clock_t mark = clock(); + this->search_time += mark - start; + + RLCSA* merged = new RLCSA(*(this->index), *increment, positions, this->block_size); + delete[] positions; + delete this->index; + delete increment; + this->index = merged; + + this->merge_time += clock() - mark; + } + else + { + this->index = increment; + } + + this->ok &= this->index->isOk(); +} + +void +RLCSABuilder::reset() +{ + this->index = 0; + + if(this->buffer_size != 0) + { + this->buffer = new uchar[this->buffer_size]; + } + this->chars = 0; + + this->ok = true; +} + + +} // namespace CSA diff --git a/incbwt/rlcsa_builder.h b/incbwt/rlcsa_builder.h new file mode 100644 index 0000000..0b0b6b2 --- /dev/null +++ b/incbwt/rlcsa_builder.h @@ -0,0 +1,58 @@ +#ifndef RLCSA_BUILDER_H +#define RLCSA_BUILDER_H + +#include +#include "rlcsa.h" + + +namespace CSA +{ + + +class RLCSABuilder +{ + public: + RLCSABuilder(usint _block_size, usint _sample_rate, usint _buffer_size); + ~RLCSABuilder(); + + void insertSequence(char* sequence, usint length, bool delete_sequence); + + // User must free the index. Builder no longer contains it. + RLCSA* getRLCSA(); + + // User must free the BWT. length becomes the length of BWT. + char* getBWT(usint& length); + + bool isOk(); + + // These times are not reset with the rest of the builder. + double getBuildTime(); + double getSearchTime(); + double getMergeTime(); + + private: + RLCSA* index; + + usint block_size; + usint sample_rate; + usint buffer_size; + + uchar* buffer; + usint chars; + + bool ok; + + clock_t build_time; + clock_t search_time; + clock_t merge_time; + + void flush(); + void addRLCSA(RLCSA* increment, uchar* sequence, usint length, bool delete_sequence); + void reset(); +}; + + +} // namespace CSA + + +#endif // RLCSA_BUILDER_H diff --git a/incbwt/sasamples.cpp b/incbwt/sasamples.cpp new file mode 100644 index 0000000..8b1aef3 --- /dev/null +++ b/incbwt/sasamples.cpp @@ -0,0 +1,188 @@ +#include +#include +#include + +#include "sasamples.h" +#include "misc/utils.h" + + +namespace CSA +{ + + +SASamples::SASamples(std::ifstream& sample_file, usint sample_rate) : + rate(sample_rate) +{ + this->indexes = new DeltaVector(sample_file); + + this->size = indexes->getSize(); + this->items = indexes->getNumberOfItems(); + this->integer_bits = length(this->items - 1); + + this->samples = new FastBitBuffer(sample_file, this->items, this->integer_bits); + this->buildInverseSamples(); +} + +SASamples::SASamples(usint* array, usint data_size, usint sample_rate) : + rate(sample_rate), + size(data_size) +{ + this->items = (this->size - 1) / this->rate + 1; + this->integer_bits = length(this->items - 1); + + this->samples = new FastBitBuffer(this->items, this->integer_bits); + DeltaEncoder encoder(SASamples::BLOCK_SIZE); + for(usint i = 0; i < this->size; i++) + { + if(array[i] % rate == 0) + { + encoder.setBit(i); + this->samples->writeItem(array[i] / sample_rate); + } + } + this->indexes = new DeltaVector(encoder, this->size); + + this->buildInverseSamples(); +} + +SASamples::SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions) : + rate(index.rate), + size(index.size + increment.size), + items(index.items + increment.items) +{ + this->mergeSamples(index, increment, positions, number_of_positions); + + index.indexes = 0; + index.samples = 0; + index.inverse_samples = 0; + increment.indexes = 0; + increment.samples = 0; + increment.inverse_samples = 0; + + this->buildInverseSamples(); +} + +SASamples::~SASamples() +{ + delete this->indexes; + delete this->samples; + delete this->inverse_samples; +} + +void +SASamples::writeTo(std::ofstream& sample_file) +{ + this->indexes->writeTo(sample_file); + this->samples->writeBuffer(sample_file, false); +} + +//-------------------------------------------------------------------------- + +usint +SASamples::inverseSA(usint value) +{ + if(value >= this->size) { return this->size; } + usint i = this->inverse_samples->readItem(value / this->rate); + return this->indexes->select(i); +} + +pair_type +SASamples::getFirstSampleAfter(usint index) +{ + if(index >= this->size) { return pair_type(this->size, this->size); } + + return this->indexes->valueAfter(index); +} + +//-------------------------------------------------------------------------- + +usint +SASamples::reportSize() +{ + usint bytes = sizeof(*this) - sizeof(this->indexes); + bytes += this->indexes->reportSize(); + bytes += this->samples->reportSize(); + bytes += this->inverse_samples->reportSize(); + return bytes; +} + +//-------------------------------------------------------------------------- + +void +SASamples::buildInverseSamples() +{ + this->inverse_samples = new FastBitBuffer(this->items, this->integer_bits); + this->samples->goToItem(0); + for(usint i = 0; i < this->items; i++) + { + this->inverse_samples->goToItem(this->samples->readItem()); + this->inverse_samples->writeItem(i); + } +} + + +//-------------------------------------------------------------------------- + + +void +SASamples::mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n) +{ + DeltaVector* first = index.indexes; + DeltaVector* second = increment.indexes; + FastBitBuffer* first_samples = index.samples; + FastBitBuffer* second_samples = increment.samples; + + usint first_bit = first->select(0); + bool first_finished = false; + usint second_bit = second->select(0); + usint sum = index.items; + first_samples->goToItem(0); + second_samples->goToItem(0); + + DeltaEncoder encoder(SASamples::BLOCK_SIZE); + this->integer_bits = length(this->items - 1); + this->samples = new FastBitBuffer(this->items, this->integer_bits); + for(usint i = 0; i < n; i++, first_bit++) + { + while(!first_finished && first_bit < positions[i]) + { + encoder.setBit(first_bit); + this->samples->writeItem(first_samples->readItem()); + if(first->hasNext()) + { + first_bit = first->selectNext() + i; + } + else + { + first_finished = true; + } + } + + if(i == second_bit) // positions[i] is one + { + encoder.setBit(positions[i]); + this->samples->writeItem(second_samples->readItem() + sum); + second_bit = second->selectNext(); + } + } + + while(!first_finished) + { + encoder.setBit(first_bit); + this->samples->writeItem(first_samples->readItem()); + if(!first->hasNext()) { break; } + first_bit = first->selectNext() + n; + } + + delete index.indexes; + delete index.samples; + delete index.inverse_samples; + delete increment.indexes; + delete increment.samples; + delete increment.inverse_samples; + + this->indexes = new DeltaVector(encoder, size); +} + + +} // namespace CSA diff --git a/incbwt/sasamples.h b/incbwt/sasamples.h new file mode 100644 index 0000000..bb4eee8 --- /dev/null +++ b/incbwt/sasamples.h @@ -0,0 +1,69 @@ +#ifndef SASAMPLES_H +#define SASAMPLES_H + +#include + +#include "misc/definitions.h" +#include "bits/bitbuffer.h" +#include "bits/deltavector.h" + + +namespace CSA +{ + + +class SASamples +{ + public: + const static usint BLOCK_SIZE = 16; + + SASamples(std::ifstream& sample_file, usint sample_rate); + SASamples(usint* array, usint data_size, usint sample_rate); + ~SASamples(); + + // Destroys contents of index and increment. + // We assume index and increment have same sample rate. + SASamples(SASamples& index, SASamples& increment, usint* positions, usint number_of_positions); + + void writeTo(std::ofstream& sample_file); + + // Returns i such that SA[i] = value. + // If SA[i] is not sampled, returns the next sampled value. (Don't try!) + // Value is actual 0-based suffix array value. + // Returns size if value is too large. + usint inverseSA(usint value); + + // Returns the value of ith sample in suffix array order. + inline usint getSample(usint i) + { + return std::min(this->samples->readItem(i) * this->rate, this->size - 1); + } + + // Returns (ind, sample number) where ind >= index or (size, ???). + pair_type getFirstSampleAfter(usint index); + + inline usint getSampleRate() { return this->rate; } + inline usint getNumberOfSamples() { return this->items; } + + usint reportSize(); + + private: + usint integer_bits; + usint rate, size, items; + + DeltaVector* indexes; + + FastBitBuffer* samples; + FastBitBuffer* inverse_samples; + + void buildInverseSamples(); + + // Note: contents of original samples are deleted. + void mergeSamples(SASamples& index, SASamples& increment, usint* positions, usint n); +}; + + +} // namespace CSA + + +#endif // SASAMPLES_H diff --git a/makefile b/makefile index fc1c3ca..ecf0ff1 100644 --- a/makefile +++ b/makefile @@ -5,21 +5,27 @@ LIBCDSA = $(LIBCDSPATH)/lib/libcds.a dcover_obs = dcover/difference_cover.o -testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA} ${dcover_obs} +TextCollection_obs = TextCollection.o TextCollectionBuilder.o TCImplementation.o Tools.o BitRank.o \ + BSGAP.o incbwt/rlcsa.a ${LIBCDSA} ${dcover_obs} +TCDebug_obs = bittree.o rbtree.o dynFMI.o -timeTextCollection_obs = timeTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA} ${dcover_obs} +all: testTextCollection -all: $(testTextCollection_obs) testTextCollection +testTextCollection: testTextCollection.o $(TextCollection_obs) $(TCDebug_obs) HeapProfiler.o + $(CC) -o testTextCollection testTextCollection.o $(TextCollection_obs) $(TCDebug_obs) HeapProfiler.o -testTextCollection: $(testTextCollection_obs) HeapProfiler.o - $(CC) -o testTextCollection $(testTextCollection_obs) HeapProfiler.o +timeTextCollection: timeTextCollection.o $(TextCollection_obs) $(TCDebug_obs) + $(CC) -o timeTextCollection timeTextCollection.o $(TextCollection_obs) $(TCDebug_obs) + +incbwt/rlcsa.a: + @make -C incbwt -timeTextCollection: $(timeTextCollection_obs) - $(CC) -o timeTextCollection $(timeTextCollection_obs) clean: + @make clean -C incbwt rm -f core *.o *~ testTextCollection timeTextCollection dcover/*.o dcover/*~ depend: - $(CC) -MM *.cpp *.c > dependencies.mk + @make depend -C incbwt + $(CC) -MM *.cpp > dependencies.mk include dependencies.mk diff --git a/testTextCollection.cpp b/testTextCollection.cpp index 59b6b43..166c019 100644 --- a/testTextCollection.cpp +++ b/testTextCollection.cpp @@ -7,8 +7,9 @@ using std::endl; using std::cin; using std::string; -#include "TextCollection.h" +#include "TextCollectionBuilder.h" #include "HeapProfiler.h" +using SXSI::TextCollectionBuilder; using SXSI::TextCollection; void printDocumentResult(TextCollection::document_result dr) @@ -39,20 +40,23 @@ int main() int i = 0 ,j = 0; int heap_base = HeapProfiler::GetHeapConsumption(); std::cerr << "Initial heap usage : " << heap_base << "\n"; - TextCollection *csa = TextCollection::InitTextCollection(5); // Avoid small samplerates ;) + TextCollectionBuilder *tcb = new TextCollectionBuilder(32); heap_base = HeapProfiler::GetHeapConsumption (); std::cerr << "Heap usage after InitTextCollection : " << heap_base << "\n"; - + Tools::StartTimer(); while (not(cin.eof())){ getline(cin,str); // Read line by line. // cin >> str; // Read word by word. data = (uchar *) str.c_str(); - csa->InsertText(data); + if (str.size() == 0) + continue; + + tcb->InsertText(data); i++; j+= str.size(); str.clear(); - if ( i % 1000 == 0) { + if ( i % 100000 == 0) { std::cerr << "Inserted : " << i << " strings\n"; std::cerr << "Number of bytes inserted : " << j << "b \n"; std::cerr << "Heap usage used for strings: " << HeapProfiler::GetHeapConsumption() - heap_base @@ -62,26 +66,43 @@ int main() }; }; +/**/ + //the whole file as 20 strings: + /* uchar *temp = Tools::GetFileContents("data/english.100MB", 0); + ulong n = strlen((char *)temp); + std::cout << "n = " << n << std::endl; + ulong offset = n/40; + uchar *it = temp; + for (i = 0; i < 5; ++i) + { + it[offset] = '\0'; + tcb->InsertText(it); + std::cout << "inserted " << strlen((char *)it) << " bytes." << std::endl; + it += offset +1; + } + it -= offset+1; -/* the whole file as one string: - uchar *temp = Tools::GetFileContents("data.txt", 0); - csa->InsertText(temp); + if (it > temp + n) + std::cout << "over bounds" << std::endl; delete [] temp;*/ + + HeapProfiler::ResetMaxHeapConsumption(); std::cerr << "Creating new text collection with " << i << " strings (total " << j/1024 << " kb)\n"; std::cerr << "Before MakeStatic() [press enter]\n"; - std::cin >> kbd; + //std::cin >> kbd; // This will print the maximum mem usage during construction time: std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - csa->MakeStatic(); + TextCollection* tc = tcb->InitTextCollection(); + delete tcb; tcb = 0; std::cerr << "After MakeStatic() [press enter]\n"; // This will print the maximum mem usage during MakeStatic(): std::cerr << "max heap usage: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cin >> kbd; + //std::cin >> kbd; std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - delete csa; + delete tc; std::cerr << "After Delete [press enter]\n"; - std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl; - std::cin >> kbd; + std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption() << " bytes" << std::endl; + //std::cin >> kbd; return 0; } diff --git a/timeTextCollection.cpp b/timeTextCollection.cpp index 91c9404..47d1976 100644 --- a/timeTextCollection.cpp +++ b/timeTextCollection.cpp @@ -18,8 +18,9 @@ static struct timeval t2; -#include "TextCollection.h" +#include "TextCollectionBuilder.h" using SXSI::TextCollection; +using SXSI::TextCollectionBuilder; int main(int argc, char**argv) { @@ -35,7 +36,7 @@ int main(int argc, char**argv) - TextCollection *csa = TextCollection::InitTextCollection(64); + TextCollectionBuilder *tcb = new TextCollectionBuilder(64); STARTTIMER(); @@ -44,7 +45,7 @@ int main(int argc, char**argv) while (not(cin.eof()) && num_str < 100000 ){ getline(cin,str); // Read line by line. if (str.compare("----------") == 0){ - csa->InsertText((unsigned char*) buffer.c_str()); + tcb->InsertText((unsigned char*) buffer.c_str()); if (num_str % 10000 == 0){ STOPTIMER(); @@ -65,7 +66,7 @@ int main(int argc, char**argv) }; std::cerr << "Calling MakeStatic()\n"; - csa->MakeStatic(); + TextCollection *tc = tcb->InitTextCollection(); std::cerr << "Statistics: " << num_str << " strings, " << max_str << " = max length\n"; int count; @@ -74,7 +75,7 @@ int main(int argc, char**argv) for (unsigned int i = 0; i < (sizeof(words)/sizeof(char*)) ; i++){ STARTTIMER(); - is = csa->IsContains((unsigned char*) words[i].c_str()); + is = tc->IsContains((unsigned char*) words[i].c_str()); STOPTIMER(); time = GETTIME(); @@ -82,7 +83,7 @@ int main(int argc, char**argv) STARTTIMER(); - count = csa->Count((unsigned char*) words[i].c_str()); + count = tc->Count((unsigned char*) words[i].c_str()); STOPTIMER(); time = GETTIME(); @@ -90,7 +91,7 @@ int main(int argc, char**argv) STARTTIMER(); - count = csa->CountContains((unsigned char*) words[i].c_str()); + count = tc->CountContains((unsigned char*) words[i].c_str()); STOPTIMER(); time = GETTIME(); @@ -98,7 +99,7 @@ int main(int argc, char**argv) STARTTIMER(); - res = csa->Contains((unsigned char*) words[i].c_str()); + res = tc->Contains((unsigned char*) words[i].c_str()); STOPTIMER(); time = GETTIME(); -- 2.17.1