{
// Save file version info
-const uchar TCImplementation::versionFlag = 6;
+const uchar TCImplementation::versionFlag = 8;
/**
* Constructor inits an empty dynamic FM-index.
* Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
*/
TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_,
- unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_)
+ unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_, char tsType)
: n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
{
makewavelet(bwt); // Deletes bwt!
bwt = 0;
-
+
// Make sampling tables
- maketables(numberOfSamples_);
+ maketables(numberOfSamples_, tsType);
}
bool TCImplementation::EmptyText(DocId k) const
return result;
}
+
+/**
+ ***
+* * FIXME Lessthan or equal
+ */
TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
{
TextPosition m = strlen((char *)pattern);
}
-TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const
+TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
return result;
}
-TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const
+TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
return result;
}
-TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const
+TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
return result;
}
-TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const
+TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
: n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
{
+// Tools::StartTimer();
+// std::cout << std::endl << "Loading..."<< std::endl;
+
uchar verFlag = 0;
if (std::fread(&verFlag, 1, 1, file) != 1)
throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
+// std::cout << "Loading alphabet rank (" << Tools::GetTime() << " s)." << std::endl;
alphabetrank = static_sequence::load(file);
+// std::cout << "Loading samples (" << Tools::GetTime() << " s)." << std::endl;
sampled = static_bitsequence::load(file);
suffixes = new BlockArray(file);
suffixDocId = new BlockArray(file);
if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
- Doc = static_sequence::load(file);
- textStorage = new TextStorage(file);
+// std::cout << "Loading Doc (" << Tools::GetTime() << " s)." << std::endl;
+ Doc = new ArrayDoc(file); //static_sequence::load(file);
+// std::cout << "Loading text storage (" << Tools::GetTime() << " s)." << std::endl;
+ textStorage = TextStorage::Load(file);
+
+// std::cout << "Loading done(" << Tools::GetTime() << " s)." << std::endl;
// FIXME Construct data structures with new samplerate
//maketables();
//first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
{
+ ulong sum=0;
if (d[m]<=k) // range of suffixes with at most k-errors found
{
if (sp<=ep)
result.push_back(std::make_pair(sp, ep));
- return (sp<=ep)?ep-sp+1:0ul;
+ sum += (sp<=ep)?ep-sp+1:0ul;
}
if (sp>ep || j==0)
- return 0;
+ return sum;
ulong *dnew = new ulong[m+1];
int c;
ulong spnew;
ulong p,lowerbound;
ulong epnew;
- ulong sum=0;
vector<int> chars = alphabetrank->accessAll(sp, ep);
for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
{
#endif
alphabet_mapper * am = new alphabet_mapper_none();
- static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
- wt_coder * wtc = new wt_coder_binary(bwt,n,am);
+ static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate?
+ wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape
alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
delete bmb;
bwt = 0; // already deleted
#endif
}
-void TCImplementation::maketables(ulong sampleLength)
+void TCImplementation::maketables(ulong sampleLength, char tsType)
{
// Calculate BWT end-marker position (of last inserted text)
{
// Mapping from end-markers to doc ID's:
unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
- uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
+// uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
+ BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts);
BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
// Record the order of end-markers in BWT:
ulong endmarkerRank = alphabetrank_i_tmp - 1;
- set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
+ //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
+ (*endmarkerDocId)[endmarkerRank] = (textId + 1) % numberOfTexts;
// Store text length and text start position:
if (textId < (DocId)numberOfTexts - 1)
HeapProfiler::ResetMaxHeapConsumption();
#endif
- textStorage = tsbuilder.InitTextStorage();
+ textStorage = tsbuilder.InitTextStorage(tsType);
#ifdef DEBUG_MEMUSAGE
std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
-#ifdef DEBUG_MEMUSAGE
- std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
- HeapProfiler::ResetMaxHeapConsumption();
-#endif
-
x = n - 2;
posOfSuccEndmarker = n-1;
for(ulong i=0; i<sampleLength; i++) {
while ((posOfSuccEndmarker - x) % samplerate != 0)
{
--x;
- if (tsbuilder[x] == '\0')
+ assert(x != ~0lu);
+ if (textStorage->IsEndmarker(x))
posOfSuccEndmarker = x--;
}
assert((*positions)[i] < n);
// calculate offset from text start:
(*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
--x;
- if (tsbuilder[x] == '\0')
+ if (x != ~0lu && textStorage->IsEndmarker(x))
posOfSuccEndmarker = x--;
}
delete positions;
+#ifdef DEBUG_MEMUSAGE
+ std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
+ HeapProfiler::ResetMaxHeapConsumption();
+#endif
+
/**
* Second pass: check tables
*/
HeapProfiler::ResetMaxHeapConsumption();
#endif
- alphabet_mapper * am = new alphabet_mapper_none();
+ /*alphabet_mapper * am = new alphabet_mapper_none();
static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
- delete bmb;
+ delete bmb;*/
// delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
+ Doc = new ArrayDoc(endmarkerDocId);
+
#ifdef DEBUG_MEMUSAGE
std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
#endif