throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
#endif
+ if (texts.size() == 0) // Empty collection
+ {
+ ++n;
+ texts.append(1, '\0');
+ }
+
// Bitvector of empty texts, FIXME Remove?
{
//std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
}
+ texts.reserve(0); // Release extra capacity
+
+/* FILE *fp = fopen("texts.txt", "wb");
+ std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
+ fclose(fp);*/
+
uchar *bwt = new uchar[n];
{
// More succinct solution via StringIterator, see below.
texts.reserve(0); // Release capacity
delete br; // bitvector of endmarkers
} // End of bw transform
+// std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
+
+/* fp = fopen("bwt.txt", "wb");
+ std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
+ fclose(fp);*/
+
#ifdef CSA_TEST_BWT
{
return false;
}
-bool CSA::IsLessThan(uchar const*) const
+bool CSA::IsLessThan(uchar const * pattern) const
{
- // TODO
- assert(0);
+ if (CountLessThan(pattern) > 0)
+ return true;
return false;
}
return result.size();
}
+// Less than or equal
unsigned CSA::CountLessThan(uchar const * pattern) const
{
- // TODO
- assert(0);
- return 0;
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ return CountEndmarkers(sp, ep);
}
/**
TextCollection::document_result CSA::LessThan(uchar const * pattern) const
{
- // TODO
- assert(0);
- return document_result();
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result(); // empty result set
+
+ TextPosition sp = 0, ep = 0;
+ SearchLessThan(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ unsigned resultSize = CountEndmarkers(sp, ep);
+ if (resultSize == 0)
+ return result;
+
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep]:
+ unsigned i = 0;
+ if (sp > 0)
+ i = alphabetrank->rank(0, sp - 1);
+ while (resultSize)
+ {
+ // End-marker we found belongs to the "preceeding" doc in the collection
+ DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
+
+ -- resultSize;
+ ++ i;
+ }
+
+ return result;
}
/**
return 0;
}
+
+ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
+{
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ uint c = (int)pattern[m-1];
+ TextPosition i=m-1;
+ TextPosition sp = 1;
+ TextPosition ep = C[c+1]-1;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ uint result = alphabetrank->rank(c,ep);
+ if (result == ~0u)
+ ep = 0;
+ else
+ ep = C[c]+result-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+}
+
+
CSA::~CSA() {
#ifdef CSA_TEST_BWT
delete dynFMI;
// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
// delete [] bwt;
//alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+ std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+ std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+
+ HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
alphabet_mapper * am = new alphabet_mapper_none();
- static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
- wt_coder * wtc = new wt_coder_huff(bwt,n,am);
+ static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
+// static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
+ wt_coder * wtc = new wt_coder_binary(bwt,n,am);
alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
delete bmb;
- delete [] bwt;
+ bwt = 0; // already deleted
+
+ std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
+ std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
}
void CSA::maketables()
// Note: bwtEndPos already set!
// and the length of the first text (counter l):
#ifdef CSA_TEST_BWT
- ulong i = 0;
- ulong l = 1;
-
- uchar c = alphabetrank->access(i);
- while (c != '\0')
{
- i = C[c]+alphabetrank->rank(c, i)-1;
- ++l;
- c = alphabetrank->access(i);
+ ulong i = 0;
+ ulong l = 1;
+ uint alphabetrank_i_tmp = 0;
+ uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
+ while (c != '\0')
+ {
+ i = C[c]+alphabetrank_i_tmp-1;
+ ++l;
+ c = alphabetrank->access(i, alphabetrank_i_tmp);
+ }
+
+ if (i != bwtEndPos) // compare result
+ {
+ std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
+ exit(0);
+ }
}
- assert(i == bwtEndPos); // compare result
#endif
// Build up arrays for text length and starting positions
DocId textId = numberOfTexts;
ulong ulongmax = 0;
ulongmax--;
+ uint alphabetrank_i_tmp =0;
//positions:
for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
sampleCount ++;
}
- uchar c = alphabetrank->access(p);
+ uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
if (c == '\0')
{
--textId;
// Record the order of end-markers in BWT:
- ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
+ ulong endmarkerRank = alphabetrank_i_tmp - 1;
(*endmarkerDocId)[endmarkerRank] = textId;
// Store text length and text start position:
p = textId; // Correct LF-mapping to the last char of the previous text.
}
else
- p = C[c]+alphabetrank->rank(c, p)-1;
+ p = C[c]+alphabetrank_i_tmp-1;
}
assert(textId == 0);